diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 7bf5db603bccd..e0329a13b1edb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -3,93 +3,13 @@ #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" -namespace cms { - namespace cuda { - - template - __global__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).count(v[i], ih); - } - } - - template - __global__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template - inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - typename Histo::View view = {h, nullptr, mem, -1, totSize}; - launchZero(view, stream); #ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - countFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(view, stream); - fillFromVector<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); -#else - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif - } - - // iteratate over N bins left and right of the one containing "v" - template - __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - - // iteratate over bins containing all values in window wmin, wmax - template - __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } + +namespace cms { + namespace cuda { template typename Func, typename Histo, typename... Args> + __global__ void kernel_populate(typename Histo::View view, typename Histo::View::Counter *ws, Args... 
args) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto histo = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + Func::countOrFill(histo, std::forward(args)...); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + Func::countOrFill(histo, std::forward(args)...); + } + + template + __device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + if constexpr (CountOrFill::count == cof) + (*h).count(v[i], ih); + else + (*h).fill(v[i], i, ih); + } + } + + template + __global__ void countOrFillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countOrFillFromVector(h, nh, v, offsets); + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + countOrFillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(view, stream); + countOrFillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countOrFillFromVectorKernel(h, nh, v, offsets); + h->finalize(); + countOrFillFromVectorKernel(h, nh, v, offsets); +#endif + } + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // 
launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); +#else + launchZero(view, stream); + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func &&func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 01f48bca94f4b..65baf7c046922 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -19,6 +19,8 @@ namespace cms { namespace cuda { + enum class CountOrFill { count, fill }; + template struct OneToManyAssocView { using Counter = typename Assoc::Counter; @@ -29,6 +31,24 @@ namespace cms { index_type *contentStorage = nullptr; int32_t offSize = -1; int32_t contentSize = -1; + + constexpr Counter *offsets() const { + Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); + if constexpr (Assoc::ctNOnes() < 0) { + assert(offStorage); + poff = offStorage; + } + return poff; + } + + constexpr int32_t size() const { + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + nOnes = offSize; + } + assert(nOnes > 0); + return nOnes; + } }; // this MUST BE DONE in a single block (or in two kernels!) 
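Note for reviewers: fillManyFromVectorCoop above (and the cooperative test code later in this patch) follows one recipe: compute the natural grid size, clamp it with an occupancy query so the whole grid can be resident at once (a requirement for grid.sync()), pack the kernel parameters into an array of pointers, and call cudaLaunchCooperativeKernel. A minimal, self-contained sketch of that recipe follows; the kernel, buffers and block size are hypothetical placeholders, not part of the patch.

#include <algorithm>
#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

// two phases separated by a grid-wide barrier: phase 2 may read data
// written by any block in phase 1, which a normal launch cannot guarantee
__global__ void myCoopKernel(int const *in, int *tmp, int *out, int n) {
  auto grid = cg::this_grid();
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first; i < n; i += gridDim.x * blockDim.x)
    tmp[i] = 2 * in[i];                // phase 1
  grid.sync();                         // grid-wide barrier: needs a cooperative launch
  for (int i = first; i < n; i += gridDim.x * blockDim.x)
    out[i] = tmp[i] + tmp[n - 1 - i];  // phase 2 reads values produced by other blocks
}

void launchCoop(int const *in, int *tmp, int *out, int n, cudaStream_t stream) {
  int const nthreads = 256;
  int nblocks = (n + nthreads - 1) / nthreads;

  // clamp the grid to what fits on the device simultaneously
  int device = 0;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  int blocksPerSm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, myCoopKernel, nthreads, 0);
  nblocks = std::min(nblocks, prop.multiProcessorCount * blocksPerSm);

  // parameters are passed as an array of pointers to the actual arguments
  void *args[] = {&in, &tmp, &out, &n};
  cudaLaunchCooperativeKernel((void *)myCoopKernel, dim3(nblocks), dim3(nthreads), args, 0, stream);
}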
@@ -50,6 +70,26 @@ namespace cms { } } + template + __device__ void zeroAndInitCoop(OneToManyAssocView view) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + + auto h = view.assoc; + + auto first = blockDim.x * blockIdx.x + threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + + grid.sync(); + for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { + h->off[i] = 0; + } + } + template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -111,16 +151,8 @@ namespace cms { auto h = view.assoc; assert(h); #ifdef __CUDACC__ - using Counter = typename Assoc::Counter; - Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); - auto nOnes = Assoc::ctNOnes(); - if constexpr (Assoc::ctNOnes() < 0) { - assert(view.offStorage); - assert(view.offSize > 0); - nOnes = view.offSize; - poff = view.offStorage; - } - assert(nOnes > 0); + auto poff = view.offsets(); + auto nOnes = view.size(); int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); auto nthreads = 1024; auto nblocks = (nOnes + nthreads - 1) / nthreads; @@ -131,6 +163,19 @@ namespace cms { #endif } + template + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter *ws) { +#ifdef __CUDACC__ + auto poff = view.offsets(); + auto nOnes = view.size(); + coopBlockPrefixScan(poff, poff, nOnes, ws); +#else + auto h = view.assoc; + assert(h); + h->finalize(); +#endif + } + template __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { assoc->bulkFinalizeFill(*apc); diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index a7e4e963a87c6..bce1c9493606c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -21,6 +21,9 @@ #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) +#undef __launch_bounds__ +#define __launch_bounds__(...) + namespace cms { namespace cudacompat { @@ -106,14 +109,140 @@ namespace cms { } inline void __syncthreads() {} - inline void __threadfence() {} inline bool __syncthreads_or(bool x) { return x; } inline bool __syncthreads_and(bool x) { return x; } + + inline void __trap() { abort(); } + inline void __threadfence() {} + template inline T __ldg(T const* x) { return *x; } + namespace cooperative_groups { + + // This class represents the grid group + class grid_group { + private: + grid_group() = default; + + friend grid_group this_grid(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + }; + + inline grid_group this_grid() { return grid_group{}; } + + // This class represents the thread block + class thread_block { + private: + thread_block() = default; + + friend thread_block this_thread_block(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + static unsigned long long size() { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + static unsigned long long thread_rank() { return 0; } + + // 3-Dimensional index of the block within the launched grid. + // On the serial CPU implementation, always {0, 0, 0}. 
+ static dim3 group_index() { return blockIdx; } + + // 3-Dimensional index of the thread within the launched block + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 thread_index() { return threadIdx; } + + // Dimensions of the launched block. + // On the serial CPU implementation, always {1, 1, 1}. + static dim3 group_dim() { return blockDim; } + }; + + // Return the current thread block + inline thread_block this_thread_block() { return thread_block{}; } + + // Represent a tiled group of threads, with compile-time fixed size. + // On the serial CPU implementation, the only valid Size is 1 + template + class thread_block_tile { + private: + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + thread_block_tile() = default; + + friend thread_block_tile tiled_partition(const ParentT& g); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + void sync() const {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + unsigned long long size() const { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + unsigned long long thread_rank() const { return 0; } + + // Returns the number of groups created when the parent group was partitioned. + // On the serial CPU implementation, always 1. + unsigned long long meta_group_size() const { return 1; } + + // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size). + // On the serial CPU implementation, always 0. + unsigned long long meta_group_rank() const { return 0; } + + // Not implemented - Refer to Warp Shuffle Functions + template + T shfl(T var, unsigned int src_rank) const; + template + T shfl_up(T var, int delta) const; + template + T shfl_down(T var, int delta) const; + template + T shfl_xor(T var, int delta) const; + + // Not implemented - Refer to Warp Vote Functions + template + T any(int predicate) const; + template + T all(int predicate) const; + template + T ballot(int predicate) const; + + // Not implemented - Refer to Warp Match Functions + template + T match_any(T val) const; + template + T match_all(T val, int& pred) const; + }; + + template + inline thread_block_tile tiled_partition(const ParentT& g) { + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + return thread_block_tile{}; + } + + } // namespace cooperative_groups + } // namespace cudacompat } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h new file mode 100644 index 0000000000000..ecb0ba5c05121 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -0,0 +1,30 @@ + +#ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h +#define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h + +#include +#include +#include + +template +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { +// #define GET_COOP_RED_FACT_FROM_ENV + +// to drive performance assessment by envvar +#ifdef GET_COOP_RED_FACT_FROM_ENV + auto env = std::getenv("COOP_RED_FACT"); + int redFactFromEnv = env ? 
atoi(env) : 0; + if (redFactFromEnv != 0) + redFact = redFactFromEnv; +#endif + + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + int mxblocks = deviceProp.multiProcessorCount * numBlocksPerSm; + // reduce number of blocks to account for multiple CPU threads + return std::max(1, mxblocks / redFact); +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 1a779fc677ff7..d305e2dc13e4e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -6,6 +6,7 @@ #include "FWCore/Utilities/interface/CMSUnrollLoop.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include #ifdef __CUDA_ARCH__ @@ -183,6 +184,52 @@ namespace cms { co[i] += psum[k]; } } + + template + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T* ipsum) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + volatile T const* ci = ici; + volatile T* co = ico; + volatile T* psum = ipsum; + + __shared__ T ws[32]; + + // assert(blockDim.x * gridDim.x >= size); + + int nChunks = size / blockDim.x + 1; + + // first each block does a scan + for (int ib = blockIdx.x; ib < nChunks; ib += gridDim.x) { + int off = blockDim.x * ib; + if (size - off > 0) { + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off + ls - 1]; + __syncthreads(); + } + } + grid.sync(); + + // good each block has done his work + // let's get the partial sums from each block + if (0 == blockIdx.x) { + blockPrefixScan(psum, psum, nChunks, ws); + } + + grid.sync(); + + for (int ib = blockIdx.x; ib < nChunks; ib += gridDim.x) { + // now each block updates its piece (but for chunk 0) + int k = ib; + if (0 == k) + continue; + int i = threadIdx.x + k * blockDim.x; + if (i < size) + co[i] += psum[k - 1]; + } + } + } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..782c46dc66739 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -52,6 +52,14 @@ + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu index b49498990fcbe..3831e8e384f85 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 2109e5a0d5b38..d8ffe204bf56e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -4,7 +4,7 @@ #include #include -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 75f9cc0e626f5..c9fe4ee66a2ed 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; @@ -72,7 +72,12 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +#ifdef USE_COOPERATIVE_GROUPS + fillManyFromVectorCoop(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#else fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#endif + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f2d74c8e7ce98..44d487b907699 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 8ba9158cb69b7..da4e1be0ba782 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -11,6 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" @@ -58,7 +59,7 @@ __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restr assert(m1->off[i] == m2->off[i]); } -__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -70,8 +71,9 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 assoc->count(tk[k][j]); } } +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { count(tk, assoc, n); } -__global__ void fill(TK const* __restrict__ 
tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -84,6 +86,21 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 } } +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { fill(tk, assoc, n); } + +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter* ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = view.assoc; + zeroAndInitCoop(view); + grid.sync(); + count(tk, h, n); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fill(tk, h, n); +} + __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template @@ -118,7 +135,7 @@ __global__ void verifyFill(Assoc const* __restrict__ la, int n) { imax = std::max(imax, int(x)); } assert(0 == la->size(n)); - printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); + printf("found with %d elements %f %d %d\n\n", n, double(ave) / n, imax, z); } template @@ -179,7 +196,7 @@ int main() { std::geometric_distribution rdm(0.8); - constexpr uint32_t N = 4000; + uint32_t N = 4000; std::vector> tr(N); @@ -249,21 +266,40 @@ int main() { launchZero(saView, 0); #ifdef __CUDACC__ - auto nThreads = 256; - auto nBlocks = (4 * N + nThreads - 1) / nThreads; + int nThreads = 256; + int nBlocks = (4 * N + nThreads - 1) / nThreads; - count<<>>(v_d.get(), a_d.get(), N); + countKernel<<>>(v_d.get(), a_d.get(), N); launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); - fill<<>>(v_d.get(), a_d.get(), N); + fillKernel<<>>(v_d.get(), a_d.get(), N); verifyFill<<<1, 1>>>(a_d.get(), n); + // now with cooperative gropus + + auto nOnes = aView.size(); + auto nchunks = nOnes / nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, 0); + + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0, 0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nBlocks, maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void* kernelArgs[] = {&a1, &aView, &N, &a4}; + dim3 dimBlock(nThreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)populate, dimGrid, dimBlock, kernelArgs)); + verifyFill<<<1, 1>>>(a_d.get(), n); + cudaCheck(cudaGetLastError()); + #else - count(v_d, a_d.get(), N); + countKernel(v_d, a_d.get(), N); launchFinalize(aView); verify(a_d.get()); - fill(v_d, a_d.get(), N); + fillKernel(v_d, a_d.get(), N); verifyFill(a_d.get(), n); #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 5e4a943f5c069..c4ca11a8eddf1 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -3,6 +3,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" using namespace cms::cuda; @@ -83,6 +84,11 @@ __global__ void verify(uint32_t const *v, uint32_t n) { printf("verify\n"); } +template +__global__ void doCoop(T const *ici, T *ico, int32_t size, T *ipsum) { + 
coopBlockPrefixScan(ici, ico, size, ipsum); +} + int main() { cms::cudatest::requireDevices(); @@ -113,9 +119,9 @@ int main() { int num_items = 200; for (int ksize = 1; ksize < 4; ++ksize) { // test multiblock - std::cout << "multiblok" << std::endl; // Declare, allocate, and initialize device-accessible pointers for input and output num_items *= 10; + std::cout << "multiblok " << num_items << std::endl; uint32_t *d_in; uint32_t *d_out1; uint32_t *d_out2; @@ -141,7 +147,31 @@ int main() { cudaCheck(cudaGetLastError()); verify<<>>(d_out1, num_items); cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + uint32_t *d_psum; + cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); + std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0, 0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nblocks, maxBlocks); + void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaLaunchCooperativeKernel((void *)doCoop, dimGrid, dimBlock, kernelArgs); + cudaCheck(cudaGetLastError()); + verify<<>>(d_out2, num_items); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + std::cout << "Free" << std::endl; + cudaCheck(cudaFree(d_psum)); + cudaCheck(cudaFree(d_out2)); + cudaCheck(cudaFree(d_out1)); + cudaCheck(cudaFree(d_in)); } // ksize return 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index b4fb89c3a709e..5a4385854e258 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,7 +6,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuClustering { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..b623f787375df 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -9,6 +9,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 51aa7e896cf5d..6d32ef96359c4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -10,6 +10,7 @@ #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include 
"DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index bd8151a2a816a..3d2567714790e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -134,9 +134,9 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countOrFillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); + kernel_countOrFillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true); @@ -163,9 +163,9 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countOrFillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); + kernel_countOrFillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 689cc0afd052b..27f4e0141456c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,5 +1,132 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" +/// Compute the number of quadruplet blocks for block size +inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; +} + +// #define CMS_CA_USE_COOPERATIVE_GROUPS + +#ifndef CMS_CA_USE_COOPERATIVE_GROUPS +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); + auto blockSize = 128; + auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + kernel_countOrFillMultiplicity + <<>>(tuples_d, quality_d, tupleMultiplicity_d); + cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); + kernel_countOrFillMultiplicity + <<>>(tuples_d, quality_d, tupleMultiplicity_d); +} + 
+__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple::View hitToTupleView, + cudaStream_t cudaStream) { + auto hitToTuple_d = static_cast(hitToTupleView.assoc); + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countOrFillHitInTracks + <<>>(tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_countOrFillHitInTracks + <<>>(tuples_d, quality_d, hitToTuple_d); +} + +#else +__global__ void kernel_populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + HitToTuple::View view, + HitToTuple::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tuple_d = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillHitInTracks(tuples_d, quality_d, tuple_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillHitInTracks(tuples_d, quality_d, tuple_d); +} + +__inline__ void populateHitInTracks(HitContainer const *tuples_d, + Quality const *quality_d, + HitToTuple::View view, + cudaStream_t cudaStream) { + using View = HitToTuple::View; + int blockSize = 64; + int nblocks = nQuadrupletBlocks(blockSize); + + auto kernel = kernel_populateHitInTracks; + + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int const maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +__global__ void kernel_populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity::View view, + caConstants::TupleMultiplicity::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tupleMultiplicity_d = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countOrFillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + countOrFillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); +} + +__inline__ void populateMultiplicity(HitContainer const *tuples_d, + Quality const *quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + auto kernel = kernel_populateMultiplicity; + using View = caConstants::TupleMultiplicity::View; + View view = {tupleMultiplicity_d, nullptr, nullptr, -1, -1}; + + int blockSize = 128; + int nblocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int const maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + auto 
ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +#endif + template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! @@ -99,13 +226,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(), cudaStream); cudaCheck(cudaGetLastError()); // do not run the fishbone if there are hits only in BPIX1 @@ -253,17 +374,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif if (params_.doSharedHitCut_ || params_.doStats_) { - // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<>>( - tuples_d, quality_d, device_hitToTuple_.get()); - cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<>>(tuples_d, quality_d, device_hitToTuple_.get()); + // populate hit->track "map" + populateHitInTracks(tuples_d, quality_d, hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 10a02309185a3..641f68e5fad85 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -227,12 +227,6 @@ class CAHitNtupletGeneratorKernels { // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. 
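Note for reviewers: the populate helpers above, like countOrFillFromVector in HistoContainer.h, all rest on the same refactoring: each former count/fill kernel pair collapses into one device function templated on cms::cuda::CountOrFill and dispatched with if constexpr, plus a thin __global__ wrapper per pass. A reduced, self-contained sketch of that idiom, with a hypothetical ToyHisto standing in for the real OneToManyAssoc (not part of the patch):

#include <cuda_runtime.h>

enum class CountOrFill { count, fill };  // mirrors cms::cuda::CountOrFill

struct ToyHisto {
  static constexpr int nbins = 16;
  static constexpr int capacity = 1024;
  int off[nbins + 1];     // counts land in off[b+1]; after the scan, off[b]..off[b+1] delimit bin b
  int content[capacity];  // element indices, ordered by bin after the fill pass

  __device__ void count(int b) { atomicAdd(off + b + 1, 1); }
  // fill advances off[b] as a write cursor inside bin b
  __device__ void fill(int b, int j) { content[atomicAdd(off + b, 1)] = j; }
};

template <CountOrFill cof>
__device__ inline void countOrFill(ToyHisto *h, int const *bins, int n) {
  // bins[i] is assumed to be in [0, ToyHisto::nbins)
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first; i < n; i += gridDim.x * blockDim.x) {
    if constexpr (cof == CountOrFill::count)
      h->count(bins[i]);    // pass 1: per-bin occupancy
    else
      h->fill(bins[i], i);  // pass 2: scatter indices into the prepared offsets
  }
}

template <CountOrFill cof>
__global__ void countOrFillKernel(ToyHisto *h, int const *bins, int n) {
  countOrFill<cof>(h, bins, n);
}

// single-thread prefix scan standing in for launchFinalize / finalizeCoop
__global__ void toyFinalize(ToyHisto *h) {
  for (int i = 1; i <= ToyHisto::nbins; ++i)
    h->off[i] += h->off[i - 1];
}

void toyPopulate(ToyHisto *h, int const *bins, int n, cudaStream_t stream) {
  int threads = 256, blocks = (n + threads - 1) / threads;
  // off is the first member, so zeroing the first (nbins+1) ints zeroes the counters
  cudaMemsetAsync(h, 0, sizeof(int) * (ToyHisto::nbins + 1), stream);
  countOrFillKernel<CountOrFill::count><<<blocks, threads, 0, stream>>>(h, bins, n);
  toyFinalize<<<1, 1, 0, stream>>>(h);
  countOrFillKernel<CountOrFill::fill><<<blocks, threads, 0, stream>>>(h, bins, n);
}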
return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will pre compute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..5bd8793b32411 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -40,9 +40,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; + // for gpu moved to populate functions + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 86d7ab8b5a1a6..fb7378890d826 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CAConstants.h" @@ -31,6 +32,8 @@ using Quality = pixelTrack::Quality; using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; +using CountOrFill = cms::cuda::CountOrFill; + namespace { constexpr uint16_t tkNotFound = std::numeric_limits::max(); @@ -376,9 +379,10 @@ __global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const * } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +template +__device__ __inline__ void countOrFillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -390,26 +394,18 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->count(nhits); + if constexpr (CountOrFill::count == cof) + tupleMultiplicity->count(nhits); + else + tupleMultiplicity->fill(nhits, it); } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ 
foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { - auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); - if (nhits < 3) - continue; - if (quality[it] == pixelTrack::Quality::edup) - continue; - assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 7) - printf("wrong mult %d %d\n", it, nhits); - assert(nhits <= caConstants::maxHitsOnTrack); - tupleMultiplicity->fill(nhits, it); - } +template +__global__ void kernel_countOrFillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + countOrFillMultiplicity(foundNtuplets, quality, tupleMultiplicity); } __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, @@ -516,28 +512,27 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +template +__device__ __inline__ void countOrFillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) break; // guard for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->count(*h); + if constexpr (CountOrFill::count == cof) + hitToTuple->count(*h); + else + hitToTuple->fill(*h, idx); } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) - break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fill(*h, idx); - } +template +__global__ void kernel_countOrFillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + countOrFillHitInTracks(tuples, quality, hitToTuple); } __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..8bd30493e661b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..4132ab0c227da 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ 
b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..8837b04e45654 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..e59063a84c0b3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..28ef595bb4868 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..00c7448b51a63 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h"
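Note for reviewers: as background on the new coopBlockPrefixScan in prefixScan.h, it is the standard three-phase multi-block scan, with the two grid.sync() calls separating the phases so a single grid-resident kernel can do the whole job. A host-side model of the same data flow, illustrative only (a "chunk" plays the role of a thread block; sizes and names are not part of the patch):

#include <algorithm>
#include <vector>

std::vector<int> chunkedInclusiveScan(std::vector<int> const &in, int chunkSize) {
  int size = int(in.size());
  int nChunks = size / chunkSize + 1;  // same chunk count formula as coopBlockPrefixScan
  std::vector<int> out(size);
  std::vector<int> psum(nChunks, 0);

  // phase 1: each "block" scans its own chunk and records the chunk total
  for (int ib = 0; ib < nChunks; ++ib) {
    int off = ib * chunkSize;
    if (size - off <= 0)
      continue;
    int ls = std::min(chunkSize, size - off);
    int acc = 0;
    for (int i = 0; i < ls; ++i)
      out[off + i] = (acc += in[off + i]);
    psum[ib] = out[off + ls - 1];
  }

  // phase 2: one "block" scans the per-chunk totals (grid.sync() before and after)
  for (int ib = 1; ib < nChunks; ++ib)
    psum[ib] += psum[ib - 1];

  // phase 3: every chunk but the first adds the total of all preceding chunks
  for (int ib = 1; ib < nChunks; ++ib)
    for (int i = ib * chunkSize; i < std::min((ib + 1) * chunkSize, size); ++i)
      out[i] += psum[ib - 1];

  return out;
}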