From 650c9710a475839205e920229459d0f0db0e9911 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 12 Oct 2021 12:28:47 +0200 Subject: [PATCH 01/29] add cooperative groups --- .../CUDAUtilities/interface/cudaCompat.h | 116 +++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index a7e4e963a87c6..c5f6d2e0c2af6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -21,6 +21,9 @@ #undef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) +#undef __launch_bounds__ +#define __launch_bounds__(...) + namespace cms { namespace cudacompat { @@ -106,14 +109,125 @@ namespace cms { } inline void __syncthreads() {} - inline void __threadfence() {} inline bool __syncthreads_or(bool x) { return x; } inline bool __syncthreads_and(bool x) { return x; } + + inline void __trap() { abort(); } + inline void __threadfence() {} + template inline T __ldg(T const* x) { return *x; } + namespace cooperative_groups { + + // This class represents the thread block + class thread_block { + private: + thread_block() = default; + + friend thread_block this_thread_block(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + static void sync() {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + static unsigned long long size() { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + static unsigned long long thread_rank() { return 0; } + + // 3-Dimensional index of the block within the launched grid. + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 group_index() { return blockIdx; } + + // 3-Dimensional index of the thread within the launched block + // On the serial CPU implementation, always {0, 0, 0}. + static dim3 thread_index() { return threadIdx; } + + // Dimensions of the launched block. + // On the serial CPU implementation, always {1, 1, 1}. + static dim3 group_dim() { return blockDim; } + }; + + // Return the current thread block + inline thread_block this_thread_block() { return thread_block{}; } + + // Represent a tiled group of threads, with compile-time fixed size. + // On the serial CPU implementation, the only valid Size is 1 + template + class thread_block_tile { + private: + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + thread_block_tile() = default; + + friend thread_block_tile tiled_partition(const ParentT& g); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. + void sync() const {} + + // Total number of threads in the group. + // On the serial CPU implementation, always 1. + unsigned long long size() const { return 1; } + + // Rank of the calling thread within [0, size-1]. + // On the serial CPU implementation, always 0. + unsigned long long thread_rank() const { return 0; } + + // Returns the number of groups created when the parent group was partitioned. + // On the serial CPU implementation, always 1. + unsigned long long meta_group_size() const { return 1; } + + // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size). 
+ // On the serial CPU implementation, always 0. + unsigned long long meta_group_rank() const { return 0; } + + // Not implemented - Refer to Warp Shuffle Functions + template + T shfl(T var, unsigned int src_rank) const; + template + T shfl_up(T var, int delta) const; + template + T shfl_down(T var, int delta) const; + template + T shfl_xor(T var, int delta) const; + + // Not implemented - Refer to Warp Vote Functions + template + T any(int predicate) const; + template + T all(int predicate) const; + template + T ballot(int predicate) const; + + // Not implemented - Refer to Warp Match Functions + template + T match_any(T val) const; + template + T match_all(T val, int& pred) const; + }; + + template + inline thread_block_tile tiled_partition(const ParentT& g) { + static_assert( + Size == 1, + "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread."); + + return thread_block_tile{}; + } + + } // namespace cooperative_groups + } // namespace cudacompat } // namespace cms From e654481c42b9c5065d5588a6d02613782b3babe9 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 12 Oct 2021 17:56:10 +0200 Subject: [PATCH 02/29] works with coop --- .../CUDAUtilities/interface/cudaCompat.h | 15 ++++++ .../CUDAUtilities/interface/prefixScan.h | 46 +++++++++++++++++++ .../CUDAUtilities/test/prefixScan_t.cu | 37 ++++++++++++++- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index c5f6d2e0c2af6..c2bc3f04450c0 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -122,6 +122,21 @@ namespace cms { namespace cooperative_groups { + // This class represents the grid group + class grid_group { + private: + grid_group() = default; + + friend grid_group this_grid(); + + public: + // Synchronize the threads named in the group. + // On the serial CPU implementation, do nothing. 
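+      // (Note, stated here as an assumption about CUDA semantics rather than taken
+      // from this patch: on a real GPU a grid-wide sync is only valid for kernels
+      // started with cudaLaunchCooperativeKernel; the serial CPU backend runs a
+      // single thread, so a no-op preserves those semantics.)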
+ static void sync() {} + }; + + inline grid_group this_grid() { return grid_group{};} + // This class represents the thread block class thread_block { private: diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 1a779fc677ff7..e4ff57e0eb415 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -6,6 +6,7 @@ #include "FWCore/Utilities/interface/CMSUnrollLoop.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include #ifdef __CUDA_ARCH__ @@ -183,6 +184,51 @@ namespace cms { co[i] += psum[k]; } } + + + template + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T * ipsum) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + volatile T const* ci = ici; + volatile T* co = ico; + volatile T* psum = ipsum; + + __shared__ T ws[32]; + + // assert(blockDim.x * gridDim.x >= size); + + int nChunks = size/blockDim.x + 1; + + // first each block does a scan + for (int ib=blockIdx.x; ib 0) { + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off+ls-1]; + __syncthreads(); + } + } + grid.sync(); + + // good each block has done his work + // let's get the partial sums from each block + if (0==blockIdx.x) { + blockPrefixScan(psum, psum, nChunks, ws); + } + + grid.sync(); + + for (int ib=blockIdx.x; ib +__global__ void doCoop(T const* ici, T* ico, int32_t size, T * ipsum) { + coopBlockPrefixScan(ici,ico,size,ipsum); +} + int main() { cms::cudatest::requireDevices(); @@ -113,9 +118,9 @@ int main() { int num_items = 200; for (int ksize = 1; ksize < 4; ++ksize) { // test multiblock - std::cout << "multiblok" << std::endl; // Declare, allocate, and initialize device-accessible pointers for input and output num_items *= 10; + std::cout << "multiblok " << num_items << std::endl; uint32_t *d_in; uint32_t *d_out1; uint32_t *d_out2; @@ -141,7 +146,35 @@ int main() { cudaCheck(cudaGetLastError()); verify<<>>(d_out1, num_items); cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + uint32_t *d_psum; + cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); + std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, (void*)doCoop, nthreads, 0); + std::cout << "max number of blocks is " << deviceProp.multiProcessorCount*numBlocksPerSm << std::endl; + auto ncoopblocks = std::min(nblocks,deviceProp.multiProcessorCount*numBlocksPerSm); + void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); + cudaCheck(cudaGetLastError()); + verify<<>>(d_out2, num_items); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaGetLastError()); + + std::cout << "Free" << std::endl; + cudaCheck(cudaFree(d_psum)); + cudaCheck(cudaFree(d_out2)); + cudaCheck(cudaFree(d_out1)); + cudaCheck(cudaFree(d_in)); + } // ksize return 0; From 061469415ea4a4914a162689dac0bf07986ca395 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 
12:08:45 +0200 Subject: [PATCH 03/29] coop works in assoc --- .../CUDAUtilities/interface/OneToManyAssoc.h | 67 ++++++++++++++++--- .../CUDAUtilities/interface/maxCoopBlocks.h | 16 +++++ .../CUDAUtilities/test/OneToManyAssoc_t.h | 63 ++++++++++++++--- .../CUDAUtilities/test/prefixScan_t.cu | 10 ++- 4 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 01f48bca94f4b..7a3876881ab7d 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -29,6 +29,25 @@ namespace cms { index_type *contentStorage = nullptr; int32_t offSize = -1; int32_t contentSize = -1; + + constexpr Counter * offsets() const { + Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); + if constexpr (Assoc::ctNOnes() < 0) { + assert(offStorage); + poff = offStorage; + } + return poff; + } + + constexpr int32_t size() const { + auto nOnes = Assoc::ctNOnes(); + if constexpr (Assoc::ctNOnes() < 0) { + nOnes = offSize; + } + assert(nOnes > 0); + return nOnes; + } + }; // this MUST BE DONE in a single block (or in two kernels!) @@ -50,6 +69,29 @@ namespace cms { } } + + template + __device__ void zeroAndInitCoop(OneToManyAssocView view) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + + auto h = view.assoc; + + auto first = blockDim.x * blockIdx.x + threadIdx.x; + + if (0 == first) { + h->psws = 0; + h->initStorage(view); + } + + grid.sync(); + for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { + h->off[i] = 0; + } + + } + + template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -111,16 +153,8 @@ namespace cms { auto h = view.assoc; assert(h); #ifdef __CUDACC__ - using Counter = typename Assoc::Counter; - Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off)); - auto nOnes = Assoc::ctNOnes(); - if constexpr (Assoc::ctNOnes() < 0) { - assert(view.offStorage); - assert(view.offSize > 0); - nOnes = view.offSize; - poff = view.offStorage; - } - assert(nOnes > 0); + auto poff = view.offsets(); + auto nOnes = view.size(); int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws)); auto nthreads = 1024; auto nblocks = (nOnes + nthreads - 1) / nthreads; @@ -131,6 +165,19 @@ namespace cms { #endif } + template + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter * ws) { +#ifdef __CUDACC__ + auto poff = view.offsets(); + auto nOnes = view.size(); + coopBlockPrefixScan(poff, poff, nOnes, ws); +#else + auto h = view.assoc; + assert(h); + h->finalize(); +#endif + } + template __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) { assoc->bulkFinalizeFill(*apc); diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h new file mode 100644 index 0000000000000..1522af45e247b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -0,0 +1,16 @@ + +#ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h +#define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h + +#include + +template +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 
device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + return deviceProp.multiProcessorCount*numBlocksPerSm; +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 8ba9158cb69b7..c369e8da2c7bd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -11,6 +11,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" @@ -58,7 +59,7 @@ __global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restr assert(m1->off[i] == m2->off[i]); } -__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -70,8 +71,12 @@ __global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int3 assoc->count(tk[k][j]); } } +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + count(tk,assoc,n); +} + -__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { +__device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { auto k = i / 4; @@ -84,6 +89,24 @@ __global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32 } } +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + fill(tk,assoc,n); +} + + +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter * ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = view.assoc; + zeroAndInitCoop(view); + grid.sync(); + count(tk,h,n); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fill(tk,h,n); +} + __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } template @@ -118,7 +141,7 @@ __global__ void verifyFill(Assoc const* __restrict__ la, int n) { imax = std::max(imax, int(x)); } assert(0 == la->size(n)); - printf("found with %d elements %f %d %d\n", n, double(ave) / n, imax, z); + printf("found with %d elements %f %d %d\n\n", n, double(ave) / n, imax, z); } template @@ -179,7 +202,7 @@ int main() { std::geometric_distribution rdm(0.8); - constexpr uint32_t N = 4000; + uint32_t N = 4000; std::vector> tr(N); @@ -249,21 +272,41 @@ int main() { launchZero(saView, 0); #ifdef __CUDACC__ - auto nThreads = 256; - auto nBlocks = (4 * N + nThreads - 1) / nThreads; + int nThreads = 256; + int nBlocks = (4 * N + nThreads - 1) / nThreads; - count<<>>(v_d.get(), a_d.get(), N); + countKernel<<>>(v_d.get(), a_d.get(), N); launchFinalize(aView, 0); verify<<<1, 1>>>(a_d.get()); - fill<<>>(v_d.get(), a_d.get(), N); + fillKernel<<>>(v_d.get(), a_d.get(), N); + verifyFill<<<1, 1>>>(a_d.get(), n); + + // now with cooperative gropus + + auto nOnes = aView.size(); + auto nchunks = 
nOnes/nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks,0); + + int maxBlocks = maxCoopBlocks(populate, nThreads, 0,0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nBlocks,maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void *kernelArgs[] = { &a1, &aView, &N, &a4 }; + dim3 dimBlock(nThreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)populate, dimGrid, dimBlock, kernelArgs)); verifyFill<<<1, 1>>>(a_d.get(), n); + cudaCheck(cudaGetLastError()); + #else - count(v_d, a_d.get(), N); + countKernel(v_d, a_d.get(), N); launchFinalize(aView); verify(a_d.get()); - fill(v_d, a_d.get(), N); + fillKernel(v_d, a_d.get(), N); verifyFill(a_d.get(), n); #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 23575fd418e40..909b2886e32ef 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -3,6 +3,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" using namespace cms::cuda; @@ -152,12 +153,9 @@ int main() { uint32_t *d_psum; cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int numBlocksPerSm = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, (void*)doCoop, nthreads, 0); - std::cout << "max number of blocks is " << deviceProp.multiProcessorCount*numBlocksPerSm << std::endl; - auto ncoopblocks = std::min(nblocks,deviceProp.multiProcessorCount*numBlocksPerSm); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0,0); + std::cout << "max number of blocks is " << maxBlocks << std::endl; + auto ncoopblocks = std::min(nblocks,maxBlocks); void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); cudaCheck(cudaGetLastError()); From df689169dfef93892a1b61095c7ec032bec2f976 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 13:52:00 +0200 Subject: [PATCH 04/29] coops implemented in histo filling --- .../CUDAUtilities/interface/HistoContainer.h | 93 ++++++++++++++++++- .../CUDAUtilities/test/BuildFile.xml | 8 ++ .../CUDAUtilities/test/HistoContainer_t.cu | 5 + 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 7bf5db603bccd..02da35a8bd39e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -3,11 +3,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" +#endif + + namespace cms { namespace cuda { template - __global__ void countFromVector(Histo *__restrict__ h, + __device__ __inline__ void countFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -23,7 +29,15 @@ namespace cms { } template - __global__ void
fillFromVector(Histo *__restrict__ h, + __global__ void countFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h,nh,v,offsets); + } + + template + __device__ __inline__ void fillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -38,6 +52,15 @@ namespace cms { } } + template + __global__ void fillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h,nh,v,offsets); + } + + template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, uint32_t nh, @@ -56,18 +79,80 @@ namespace cms { #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); - countFromVector<<>>(h, nh, v, offsets); + countFromVectorKernel<<>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); launchFinalize(view, stream); - fillFromVector<<>>(h, nh, v, offsets); + fillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVectorKernel(h, nh, v, offsets); + h->finalize(); + fillFromVectorKernel(h, nh, v, offsets); +#endif + } + + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, typename Histo::View::Counter * ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo * h, + uint32_t nh, + T const * v, + uint32_t const * offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes/nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks,stream); + auto wsp = ws.get(); + int maxBlocks = maxCoopBlocks(kernel, nthreads, 0,0); + auto ncoopblocks = std::min(nblocks,maxBlocks); + assert(ncoopblocks>0); + void *kernelArgs[] = { &view, &nh, &v, &offsets, &totSize, &wsp }; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void*)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); cudaCheck(cudaGetLastError()); #else + launchZero(view, stream); countFromVector(h, nh, v, offsets); h->finalize(); fillFromVector(h, nh, v, offsets); #endif } + // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..72f34863ab8dd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -52,6 +52,14 @@ + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu 
b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 75f9cc0e626f5..67ec589bb9bc6 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -72,7 +72,12 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); +#ifdef COOP + fillManyFromVectorCoop(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#else fillManyFromVector(h_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, nullptr, 0); +#endif + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); assert(0 == h.off[0]); assert(offsets[10] == h.size()); From f71031aecef3b53d7f9b3f9eaf1383959ec3e34b Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 14:04:24 +0200 Subject: [PATCH 05/29] format --- .../CUDAUtilities/interface/HistoContainer.h | 79 +++++++++---------- .../CUDAUtilities/interface/OneToManyAssoc.h | 14 ++-- .../CUDAUtilities/interface/cudaCompat.h | 12 +-- .../CUDAUtilities/interface/maxCoopBlocks.h | 14 ++-- .../CUDAUtilities/interface/prefixScan.h | 27 ++++--- .../CUDAUtilities/test/OneToManyAssoc_t.h | 31 +++----- .../CUDAUtilities/test/prefixScan_t.cu | 17 ++-- 7 files changed, 90 insertions(+), 104 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 02da35a8bd39e..49e198bf43332 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -8,15 +8,14 @@ #include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h" #endif - namespace cms { namespace cuda { template __device__ __inline__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -30,17 +29,17 @@ namespace cms { template __global__ void countFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - countFromVector(h,nh,v,offsets); + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h, nh, v, offsets); } template __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -54,12 +53,11 @@ namespace cms { template __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h,nh,v,offsets); - } - + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h, nh, v, offsets); + } template inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, @@ -91,14 +89,14 @@ namespace cms { #endif } - #ifdef __CUDACC__ template - __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, - uint32_t 
nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, typename Histo::View::Counter * ws) { + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); auto h = static_cast(view.assoc); @@ -113,36 +111,36 @@ namespace cms { #endif template - inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo * h, - uint32_t nh, - T const * v, - uint32_t const * offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream #ifndef __CUDACC__ - = cudaStreamDefault + = cudaStreamDefault #endif ) { using View = typename Histo::View; View view = {h, nullptr, mem, -1, totSize}; #ifdef __CUDACC__ - auto kernel = fillManyFromVectorCoopKernel; + auto kernel = fillManyFromVectorCoopKernel; auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); auto nOnes = view.size(); - auto nchunks = nOnes/nthreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks,stream); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); - int maxBlocks = maxCoopBlocks(kernel, nthreads, 0,0); - auto ncoopblocks = std::min(nblocks,maxBlocks); - assert(ncoopblocks>0); - void *kernelArgs[] = { &view, &nh, &v, &offsets, &totSize, &wsp }; + int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch - cudaCheck(cudaLaunchCooperativeKernel((void*)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); cudaCheck(cudaGetLastError()); #else launchZero(view, stream); @@ -152,7 +150,6 @@ namespace cms { #endif } - // iteratate over N bins left and right of the one containing "v" template __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index 7a3876881ab7d..b7bf7ca4f354f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -30,11 +30,11 @@ namespace cms { int32_t offSize = -1; int32_t contentSize = -1; - constexpr Counter * offsets() const { + constexpr Counter *offsets() const { Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off)); if constexpr (Assoc::ctNOnes() < 0) { assert(offStorage); - poff = offStorage; + poff = offStorage; } return poff; } @@ -45,9 +45,8 @@ namespace cms { nOnes = offSize; } assert(nOnes > 0); - return nOnes; - } - + return nOnes; + } }; // this MUST BE DONE in a single block (or in two kernels!) 
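The single-block constraint in the comment above is a synchronization-scope limit: in a classic launch the widest in-kernel barrier is __syncthreads(), which only orders the threads of one block, so initStorage() and the zeroing of the counters can only be ordered within a single block (or split across two kernels). zeroAndInitCoop lifts that limit by ordering the same two steps across the whole grid with grid.sync(). A minimal sketch of the contrast, assuming a view type with the assoc, initStorage(), totOnes() and off members used in this series (an illustration, not the actual kernels):

  // Illustration only -- simplified from zeroAndInit/zeroAndInitCoop in this series.
  // Needs #include <cooperative_groups.h> for the second variant.
  template <typename V>
  __global__ void initOneBlock(V view) {  // classic launch: must be <<<1, nthreads>>>
    assert(1 == gridDim.x);
    if (0 == threadIdx.x)
      view.assoc->initStorage(view);
    __syncthreads();  // block-wide barrier: sufficient only because there is one block
    for (int i = threadIdx.x, nt = view.assoc->totOnes(); i < nt; i += blockDim.x)
      view.assoc->off[i] = 0;
  }

  template <typename V>
  __global__ void initCoop(V view) {  // requires cudaLaunchCooperativeKernel
    auto grid = cooperative_groups::this_grid();
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    if (0 == first)
      view.assoc->initStorage(view);
    grid.sync();  // grid-wide barrier: every block then sees the initialized storage
    for (int i = first, nt = view.assoc->totOnes(); i < nt; i += gridDim.x * blockDim.x)
      view.assoc->off[i] = 0;
  }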
@@ -69,7 +68,6 @@ namespace cms { } } - template __device__ void zeroAndInitCoop(OneToManyAssocView view) { namespace cg = cooperative_groups; @@ -88,10 +86,8 @@ namespace cms { for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) { h->off[i] = 0; } - } - template inline __attribute__((always_inline)) void launchZero(Assoc *h, cudaStream_t stream @@ -166,7 +162,7 @@ namespace cms { } template - __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter * ws) { + __device__ __inline__ void finalizeCoop(OneToManyAssocView view, typename Assoc::Counter *ws) { #ifdef __CUDACC__ auto poff = view.offsets(); auto nOnes = view.size(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index c2bc3f04450c0..bce1c9493606c 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -123,19 +123,19 @@ namespace cms { namespace cooperative_groups { // This class represents the grid group - class grid_group { - private: - grid_group() = default; + class grid_group { + private: + grid_group() = default; - friend grid_group this_grid(); + friend grid_group this_grid(); public: // Synchronize the threads named in the group. // On the serial CPU implementation, do nothing. static void sync() {} - }; + }; - inline grid_group this_grid() { return grid_group{};} + inline grid_group this_grid() { return grid_group{}; } // This class represents the thread block class thread_block { diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 1522af45e247b..44490110eeb28 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -2,15 +2,15 @@ #ifndef HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h #define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h -#include +#include -template +template inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { - int numBlocksPerSm = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, device); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); - return deviceProp.multiProcessorCount*numBlocksPerSm; + int numBlocksPerSm = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); + return deviceProp.multiProcessorCount * numBlocksPerSm; } #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index e4ff57e0eb415..d305e2dc13e4e 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -185,9 +185,8 @@ namespace cms { } } - template - __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T * ipsum) { + __device__ void coopBlockPrefixScan(T const* ici, T* ico, int32_t size, T* ipsum) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); volatile T const* ci = ici; @@ -198,34 +197,36 @@ namespace cms { // assert(blockDim.x * gridDim.x >= size); - int nChunks = size/blockDim.x + 1; + int nChunks = size / blockDim.x + 1; // first each block does a scan - for (int ib=blockIdx.x; ib 0) { - int ls = std::min(int(blockDim.x), size - off); - blockPrefixScan(ci 
+ off, co + off, ls, ws); - psum[ib] = co[off+ls-1]; - __syncthreads(); + int ls = std::min(int(blockDim.x), size - off); + blockPrefixScan(ci + off, co + off, ls, ws); + psum[ib] = co[off + ls - 1]; + __syncthreads(); } } grid.sync(); // good each block has done his work // let's get the partial sums from each block - if (0==blockIdx.x) { + if (0 == blockIdx.x) { blockPrefixScan(psum, psum, nChunks, ws); } grid.sync(); - for (int ib=blockIdx.x; ibcount(tk[k][j]); } } -__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { - count(tk,assoc,n); -} - +__global__ void countKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { count(tk, assoc, n); } __device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -89,22 +86,19 @@ __device__ __inline__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ a } } -__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { - fill(tk,assoc,n); -} +__global__ void fillKernel(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { fill(tk, assoc, n); } - -__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter * ws) { +__global__ void populate(TK const* __restrict__ tk, Assoc::View view, int32_t n, Assoc::View::Counter* ws) { namespace cg = cooperative_groups; auto grid = cg::this_grid(); auto h = view.assoc; zeroAndInitCoop(view); grid.sync(); - count(tk,h,n); + count(tk, h, n); grid.sync(); finalizeCoop(view, ws); grid.sync(); - fill(tk,h,n); + fill(tk, h, n); } __global__ void verify(Assoc* __restrict__ assoc) { assert(int(assoc->size()) < assoc->capacity()); } @@ -285,15 +279,15 @@ int main() { // now with cooperative gropus auto nOnes = aView.size(); - auto nchunks = nOnes/nThreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks,0); + auto nchunks = nOnes / nThreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, 0); - int maxBlocks = maxCoopBlocks(populate, nThreads, 0,0); + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; - auto ncoopblocks = std::min(nBlocks,maxBlocks); - auto a1 = v_d.get(); - auto a4 = ws.get(); - void *kernelArgs[] = { &a1, &aView, &N, &a4 }; + auto ncoopblocks = std::min(nBlocks, maxBlocks); + auto a1 = v_d.get(); + auto a4 = ws.get(); + void* kernelArgs[] = {&a1, &aView, &N, &a4}; dim3 dimBlock(nThreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch @@ -301,7 +295,6 @@ int main() { verifyFill<<<1, 1>>>(a_d.get(), n); cudaCheck(cudaGetLastError()); - #else countKernel(v_d, a_d.get(), N); launchFinalize(aView); diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 909b2886e32ef..ef0a785e00bd5 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -84,9 +84,9 @@ __global__ void verify(uint32_t const *v, uint32_t n) { printf("verify\n"); } -template -__global__ void doCoop(T const* ici, T* ico, int32_t size, T * ipsum) { - coopBlockPrefixScan(ici,ico,size,ipsum); +template +__global__ void doCoop(T const *ici, T *ico, int32_t size, T *ipsum) { + coopBlockPrefixScan(ici, ico, size, ipsum); } int main() { @@ -151,16 +151,16 @@ int main() { cudaCheck(cudaGetLastError()); uint32_t *d_psum; - cudaCheck(cudaMalloc(&d_psum, nblocks*sizeof(uint32_t))); + 
cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0,0); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; - auto ncoopblocks = std::min(nblocks,maxBlocks); - void *kernelArgs[] = { &d_in, &d_out2, &num_items, &d_psum }; + auto ncoopblocks = std::min(nblocks, maxBlocks); + void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; dim3 dimBlock(nthreads, 1, 1); dim3 dimGrid(ncoopblocks, 1, 1); // launch - cudaLaunchCooperativeKernel((void*)doCoop, dimGrid, dimBlock, kernelArgs); + cudaLaunchCooperativeKernel((void *)doCoop, dimGrid, dimBlock, kernelArgs); cudaCheck(cudaGetLastError()); verify<<>>(d_out2, num_items); cudaCheck(cudaGetLastError()); @@ -173,7 +173,6 @@ int main() { cudaCheck(cudaFree(d_out1)); cudaCheck(cudaFree(d_in)); - } // ksize return 0; } From ac394e7d357b0d7fd511349b7f7b5bcf7123d624 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 13 Oct 2021 14:28:01 +0200 Subject: [PATCH 06/29] use in rechits --- RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 30d1ee2353a65..e70acffa5ecd2 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -61,7 +61,7 @@ namespace pixelgpudetails { setHitsLayerStart<<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); cudaCheck(cudaGetLastError()); - cms::cuda::fillManyFromVector( + cms::cuda::fillManyFromVectorCoop( hits_d.phiBinner(), 10, hits_d.iphi(), hits_d.hitsLayerStart(), nHits, 256, hits_d.phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); From a3ab3fff8aaa84a3e48972c67077ed9cdfa47525 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 12:03:37 +0200 Subject: [PATCH 07/29] factorize away algos --- .../CUDAUtilities/interface/HistoContainer.h | 162 ---------------- .../interface/HistoContainerAlgo.h | 179 ++++++++++++++++++ .../CUDAUtilities/test/HistoContainerRT_t.cu | 2 +- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- .../plugins/gpuClustering.h | 2 +- .../plugins/PixelRecHitGPUKernel.cu | 1 + .../plugins/SiPixelRecHitSoAFromLegacy.cc | 1 + .../CAHitNtupletGeneratorKernelsImpl.h | 1 + .../plugins/gpuClusterTracksByDensity.h | 2 +- .../plugins/gpuClusterTracksDBSCAN.h | 2 +- .../plugins/gpuClusterTracksIterative.h | 2 +- .../plugins/gpuFitVertices.h | 2 +- .../PixelVertexFinding/plugins/gpuSortByPt2.h | 2 +- .../plugins/gpuSplitVertices.h | 2 +- 16 files changed, 193 insertions(+), 173 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 49e198bf43332..e0329a13b1edb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -11,168 +11,6 @@ namespace cms { namespace cuda { - template - __device__ __inline__ void countFromVector(Histo *__restrict__ h, - uint32_t nh, - T const 
*__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).count(v[i], ih); - } - } - - template - __global__ void countFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - countFromVector(h, nh, v, offsets); - } - - template - __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template - __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h, nh, v, offsets); - } - - template - inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - typename Histo::View view = {h, nullptr, mem, -1, totSize}; - launchZero(view, stream); -#ifdef __CUDACC__ - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - countFromVectorKernel<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); - launchFinalize(view, stream); - fillFromVectorKernel<<>>(h, nh, v, offsets); - cudaCheck(cudaGetLastError()); -#else - countFromVectorKernel(h, nh, v, offsets); - h->finalize(); - fillFromVectorKernel(h, nh, v, offsets); -#endif - } - -#ifdef __CUDACC__ - template - __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - int32_t totSize, - typename Histo::View::Counter *ws) { - namespace cg = cooperative_groups; - auto grid = cg::this_grid(); - auto h = static_cast(view.assoc); - zeroAndInitCoop(view); - grid.sync(); - countFromVector(h, nh, v, offsets); - grid.sync(); - finalizeCoop(view, ws); - grid.sync(); - fillFromVector(h, nh, v, offsets); - } -#endif - - template - inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, - uint32_t nh, - T const *v, - uint32_t const *offsets, - int32_t totSize, - int nthreads, - typename Histo::index_type *mem, - cudaStream_t stream -#ifndef __CUDACC__ - = cudaStreamDefault -#endif - ) { - using View = typename Histo::View; - View view = {h, nullptr, mem, -1, totSize}; -#ifdef __CUDACC__ - auto kernel = fillManyFromVectorCoopKernel; - auto nblocks = (totSize + nthreads - 1) / nthreads; - assert(nblocks > 0); - auto nOnes = view.size(); - auto nchunks = nOnes / nthreads + 1; - auto ws = cms::cuda::make_device_unique(nchunks, stream); - auto wsp = ws.get(); - int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); - auto ncoopblocks = std::min(nblocks, maxBlocks); - assert(ncoopblocks > 0); - void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; - dim3 dimBlock(nthreads, 1, 1); 
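For orientation while this block moves to HistoContainerAlgo.h: fillManyFromVector is the classic two-pass fill (count per-bin multiplicities, prefix-scan the counts into offsets, then place every index), and the coop variant only fuses those passes into a single cooperative kernel with grid.sync() between them. A serial sketch of the two-pass idea itself, with hypothetical names that are not part of the CMSSW interface:

  // Serial sketch of the count / scan / fill pattern (illustration only).
  #include <algorithm>
  #include <cstdint>
  #include <vector>

  void twoPassFill(std::vector<uint32_t> const& bin,   // bin index of each element
                   std::vector<uint32_t>& off,         // nbins + 1 offsets (output)
                   std::vector<uint32_t>& content) {   // element indices grouped by bin
    std::fill(off.begin(), off.end(), 0);
    for (auto b : bin)
      ++off[b + 1];                                    // pass 1: count
    for (std::size_t i = 1; i < off.size(); ++i)
      off[i] += off[i - 1];                            // prefix scan: counts -> offsets
    std::vector<uint32_t> cursor(off.begin(), off.end() - 1);  // insert position per bin
    content.resize(bin.size());
    for (uint32_t i = 0; i < bin.size(); ++i)
      content[cursor[bin[i]]++] = i;                   // pass 2: fill
  }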
- dim3 dimGrid(ncoopblocks, 1, 1); - // launch - cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); - cudaCheck(cudaGetLastError()); -#else - launchZero(view, stream); - countFromVector(h, nh, v, offsets); - h->finalize(); - fillFromVector(h, nh, v, offsets); -#endif - } - - // iteratate over N bins left and right of the one containing "v" - template - __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - - // iteratate over bins containing all values in window wmin, wmax - template - __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } - } - template + __device__ __inline__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void countFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + countFromVector(h, nh, v, offsets); + } + + template + __device__ __inline__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + __global__ void fillFromVectorKernel(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + fillFromVector(h, nh, v, offsets); + } + + template + inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + typename Histo::View view = {h, nullptr, mem, -1, totSize}; + launchZero(view, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + countFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + launchFinalize(view, stream); + fillFromVectorKernel<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); +#else + countFromVectorKernel(h, nh, v, offsets); + h->finalize(); + fillFromVectorKernel(h, nh, v, offsets); +#endif + } + +#ifdef __CUDACC__ + template + __global__ void fillManyFromVectorCoopKernel(typename Histo::View view, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + int32_t totSize, + typename Histo::View::Counter *ws) { + namespace cg = 
cooperative_groups; + auto grid = cg::this_grid(); + auto h = static_cast(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countFromVector(h, nh, v, offsets); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillFromVector(h, nh, v, offsets); + } +#endif + + template + inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h, + uint32_t nh, + T const *v, + uint32_t const *offsets, + int32_t totSize, + int nthreads, + typename Histo::index_type *mem, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + using View = typename Histo::View; + View view = {h, nullptr, mem, -1, totSize}; +#ifdef __CUDACC__ + auto kernel = fillManyFromVectorCoopKernel; + auto nblocks = (totSize + nthreads - 1) / nthreads; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / nthreads + 1; + auto ws = cms::cuda::make_device_unique(nchunks, stream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; + dim3 dimBlock(nthreads, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream)); + cudaCheck(cudaGetLastError()); +#else + launchZero(view, stream); + countFromVector(h, nh, v, offsets); + h->finalize(); + fillFromVector(h, nh, v, offsets); +#endif + } + + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu index b49498990fcbe..3831e8e384f85 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainerRT_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 2109e5a0d5b38..d8ffe204bf56e 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -4,7 +4,7 @@ #include #include -#include 
"HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 67ec589bb9bc6..b879bc8f17e96 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" using namespace cms::cuda; diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index f2d74c8e7ce98..44d487b907699 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -6,7 +6,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index b266f2f2a997e..2201e8bddc8b2 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,7 +6,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" namespace gpuClustering { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index e70acffa5ecd2..795b1a35641ef 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -9,6 +9,7 @@ #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index ab0a1ef97e0f2..f184b47ca3b96 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -10,6 +10,7 @@ #include "DataFormats/Common/interface/Handle.h" #include 
"DataFormats/SiPixelCluster/interface/SiPixelCluster.h" #include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 92c720bb86012..51276ea403264 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" #include "CAConstants.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..8bd30493e661b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..4132ab0c227da 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..8837b04e45654 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..e59063a84c0b3 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include 
"gpuVertexFinder.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..28ef595bb4868 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..00c7448b51a63 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -5,7 +5,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "gpuVertexFinder.h" From 8a5d69b0086f3e9f9a2c8eb6220fbd9db273b02d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 15:45:51 +0200 Subject: [PATCH 08/29] decapsulate and format --- .../interface/HistoContainerAlgo.h | 3 +- .../CAHitNtupletGeneratorKernelsImpl.h | 48 ++++++++++++++----- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index bd80b505fdcda..e2b37a3c138b5 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -134,7 +134,8 @@ namespace cms { auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + // static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, nthreads, 0, 0) / 10); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 51276ea403264..3d7b307f06e88 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -380,9 +380,9 @@ __global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, } } -__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +__device__ __inline__ void countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -398,9 +398,15 @@ __global__ void 
kernel_countMultiplicity(HitContainer const *__restrict__ foundN } } -__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, - caConstants::TupleMultiplicity *tupleMultiplicity) { +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + countMultiplicity(foundNtuplets, quality, tupleMultiplicity); +} + +__device__ __inline__ void fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { auto nhits = foundNtuplets->size(it); @@ -416,6 +422,12 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt } } +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const *__restrict__ quality, + caConstants::TupleMultiplicity *tupleMultiplicity) { + fillMultiplicity(foundNtuplets, quality, tupleMultiplicity); +} + __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts, @@ -520,9 +532,9 @@ __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +__device__ __inline__ void countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) @@ -532,9 +544,15 @@ __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, } } -__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + countHitInTracks(tuples, quality, hitToTuple); +} + +__device__ __inline__ void fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (tuples->size(idx) == 0) @@ -544,6 +562,12 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, } } +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) { + fillHitInTracks(tuples, quality, hitToTuple); +} + __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) { From 697402983d7da419a912f6732b132c4a3de0a114 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 
16:30:52 +0200 Subject: [PATCH 09/29] encapsulate --- .../plugins/CAHitNtupletGeneratorKernels.cu | 61 +++++++++++++------ .../plugins/CAHitNtupletGeneratorKernels.h | 5 -- .../CAHitNtupletGeneratorKernelsAlloc.cc | 5 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index a47c89b27ed30..85a82f3171270 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,5 +1,44 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" + /// Compute the number of quadruplet blocks for block size + inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; + } + +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, cudaStream_t cudaStream) { + + cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); + auto blockSize = 128; + auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, tupleMultiplicity_d); + cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); + kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, tupleMultiplicity_d); + +} + + +__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { + + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( + tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); +} + + + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -106,13 +145,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - blockSize = 128; - numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); - cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_tupleMultiplicity_.get()); + + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(),cudaStream); cudaCheck(cudaGetLastError()); if (nhits > 1 && params_.lateFishbone_) { @@ -259,17 +293,8 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA #endif if (params_.doSharedHitCut_ || params_.doStats_) { - // fill hit->track "map" - assert(hitToTupleView_.offSize > nhits); - numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, device_hitToTuple_.get()); - 
cudaCheck(cudaGetLastError()); - assert((hitToTupleView_.assoc == device_hitToTuple_.get()) && - (hitToTupleView_.offStorage == device_hitToTupleStorage_.get()) && (hitToTupleView_.offSize > 0)); - cms::cuda::launchFinalize(hitToTupleView_, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_hitToTuple_.get()); + // populate hit->track "map" + populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(),hitToTupleView_,cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 684a54f5d2ed4..c77132dfe1c2d 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -230,11 +230,6 @@ class CAHitNtupletGeneratorKernels { return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; - } }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..5bd8793b32411 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -40,9 +40,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; + // for gpu moved to populate functions + cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); + cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm } - cms::cuda::launchZero(device_tupleMultiplicity_.get(), stream); - cms::cuda::launchZero(hitToTupleView_, stream); // we may wish to keep it in the edm #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); From 0c1e5f45bf966018850f7163f4ee5440000090a2 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 14 Oct 2021 16:31:20 +0200 Subject: [PATCH 10/29] format --- .../plugins/CAHitNtupletGeneratorKernels.cu | 61 ++++++++----------- .../plugins/CAHitNtupletGeneratorKernels.h | 1 - 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 85a82f3171270..9947033633eb1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,44 +1,38 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" - /// Compute the number of quadruplet blocks for block size - inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { - // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 - return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
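// A quick arithmetic check of the ceil-division used in nQuadrupletBlocks (the
// constant's value here is assumed purely for illustration): with blockSize = 64
// and maxNumberOfQuadruplets = 32 * 1024, the work size is 3 * 32768 / 4 = 24576
// and (24576 + 63) / 64 = 384 blocks. When the work size is not an exact multiple,
// adding blockSize - 1 rounds the integer division up: (24500 + 63) / 64 = 383,
// and 383 * 64 = 24512 still covers all 24500 items.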
- } - -__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, - Quality const *__restrict__ quality_d, - caConstants::TupleMultiplicity *tupleMultiplicity_d, cudaStream_t cudaStream) { +/// Compute the number of quadruplet blocks for block size +inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { + // caConstants::maxNumberOfQuadruplets is a constexpr, so the compiler will precompute the 3*max/4 + return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; +} +__inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { cms::cuda::launchZero(tupleMultiplicity_d, cudaStream); - auto blockSize = 128; + auto blockSize = 128; auto numberOfBlocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; - kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, tupleMultiplicity_d); + kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, tupleMultiplicity_d); cms::cuda::launchFinalize(tupleMultiplicity_d, cudaStream); - kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, tupleMultiplicity_d); - + kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, tupleMultiplicity_d); } - -__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, - Quality const *__restrict__ quality_d, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { - - cms::cuda::launchZero(hitToTupleView, cudaStream); - auto blockSize = 64; - auto numberOfBlocks = nQuadrupletBlocks(blockSize); - kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>( - tuples_d, quality_d, hitToTuple_d); - cudaCheck(cudaGetLastError()); - cms::cuda::launchFinalize(hitToTupleView, cudaStream); - cudaCheck(cudaGetLastError()); - kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); +__inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, + HitToTuple::View hitToTupleView, + cudaStream_t cudaStream) { + cms::cuda::launchZero(hitToTupleView, cudaStream); + auto blockSize = 64; + auto numberOfBlocks = nQuadrupletBlocks(blockSize); + kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); + cudaCheck(cudaGetLastError()); + cms::cuda::launchFinalize(hitToTupleView, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); } template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -145,8 +139,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); - - populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(),cudaStream); + populateMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get(), cudaStream); cudaCheck(cudaGetLastError()); if (nhits > 1 && params_.lateFishbone_) { @@ -294,7 +287,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doSharedHitCut_ || params_.doStats_) { // populate hit->track "map" - populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(),hitToTupleView_,cudaStream); + populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(), hitToTupleView_, cudaStream);
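// The populate* helpers above follow the two-pass association-filling idiom used
// throughout this series (sketched here with the names visible in this patch):
//
//   cms::cuda::launchZero(view, cudaStream);       // reset the per-bin counters
//   kernel_count...<<<numberOfBlocks, blockSize, 0, cudaStream>>>(...);  // pass 1: count entries per bin
//   cms::cuda::launchFinalize(view, cudaStream);   // prefix-scan the counts into offsets
//   kernel_fill...<<<numberOfBlocks, blockSize, 0, cudaStream>>>(...);   // pass 2: scatter entries at the offsets
//
// That is four dependent launches on one stream; the cooperative-groups variant
// introduced in the next patch fuses them into a single kernel, replacing the
// inter-kernel ordering with grid.sync() between the phases.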
cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index c77132dfe1c2d..880664007cb7c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -229,7 +229,6 @@ class CAHitNtupletGeneratorKernels { // We want (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize, but first part is pre-computed. return (paramsMaxDoubletes3Quarters_ + blockSize - 1) / blockSize; } - }; using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; From 151aea71704bad057662fa2ab153359c23cf0c85 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 15 Oct 2021 12:12:18 +0200 Subject: [PATCH 11/29] use coop for other 2 assoc --- .../plugins/CAHitNtupletGeneratorKernels.cu | 98 ++++++++++++++++++- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 9947033633eb1..cca4b0cde5983 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -6,6 +6,9 @@ inline uint32_t nQuadrupletBlocks(uint32_t blockSize) { return (3 * caConstants::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; } +#define CMS_USE_COOP_GROUPS + +#ifndef CMS_USE_COOP_GROUPS __inline__ void populateMultiplicity(HitContainer const *__restrict__ tuples_d, Quality const *__restrict__ quality_d, caConstants::TupleMultiplicity *tupleMultiplicity_d, @@ -20,9 +23,9 @@ __inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, Quality const *__restrict__ quality_d, - CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple_d, - HitToTuple::View hitToTupleView, + CAHitNtupletGeneratorKernelsGPU::HitToTuple::View hitToTupleView, cudaStream_t cudaStream) { + auto hitToTuple_d = static_cast<CAHitNtupletGeneratorKernelsGPU::HitToTuple *>(hitToTupleView.assoc); cms::cuda::launchZero(hitToTupleView, cudaStream); auto blockSize = 64; auto numberOfBlocks = nQuadrupletBlocks(blockSize); @@ -33,6 +36,95 @@ __inline__ void populateHitInTracks(HitContainer const *__restrict__ tuples_d, kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, hitToTuple_d); } +#else +__global__ void kernel_populateHitInTracks(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + HitToTuple::View view, + HitToTuple::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tuple_d = static_cast<HitToTuple *>(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countHitInTracks(tuples_d, quality_d, tuple_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillHitInTracks(tuples_d, quality_d, tuple_d); +} + +__inline__ void populateHitInTracks(HitContainer const *tuples_d, + Quality const *quality_d, + HitToTuple::View view, + cudaStream_t cudaStream) { + using View = HitToTuple::View; + int blockSize = 64; + int nblocks = nQuadrupletBlocks(blockSize); + + auto kernel = kernel_populateHitInTracks; + + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: 
discuss with FW team: cuda calls are expensive and not needed for each event + // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +__global__ void kernel_populateMultiplicity(HitContainer const *__restrict__ tuples_d, + Quality const *__restrict__ quality_d, + caConstants::TupleMultiplicity::View view, + caConstants::TupleMultiplicity::View::Counter *ws) { + namespace cg = cooperative_groups; + auto grid = cg::this_grid(); + auto tupleMultiplicity_d = static_cast<caConstants::TupleMultiplicity *>(view.assoc); + zeroAndInitCoop(view); + grid.sync(); + countMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); + grid.sync(); + finalizeCoop(view, ws); + grid.sync(); + fillMultiplicity(tuples_d, quality_d, tupleMultiplicity_d); +} + +__inline__ void populateMultiplicity(HitContainer const *tuples_d, + Quality const *quality_d, + caConstants::TupleMultiplicity *tupleMultiplicity_d, + cudaStream_t cudaStream) { + auto kernel = kernel_populateMultiplicity; + using View = caConstants::TupleMultiplicity::View; + View view = {tupleMultiplicity_d, nullptr, nullptr, -1, -1}; + + int blockSize = 128; + int nblocks = (3 * caConstants::maxTuples / 4 + blockSize - 1) / blockSize; + assert(nblocks > 0); + auto nOnes = view.size(); + auto nchunks = nOnes / blockSize + 1; + auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); + auto wsp = ws.get(); + // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event + // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); + static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + auto ncoopblocks = std::min(nblocks, maxBlocks); + assert(ncoopblocks > 0); + void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; + dim3 dimBlock(blockSize, 1, 1); + dim3 dimGrid(ncoopblocks, 1, 1); + // launch + cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, cudaStream)); +} + +#endif + template <> void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { auto blockSize = 128; @@ -287,7 +379,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA if (params_.doSharedHitCut_ || params_.doStats_) { // populate hit->track "map" - populateHitInTracks(tuples_d, quality_d, device_hitToTuple_.get(), hitToTupleView_, cudaStream); + populateHitInTracks(tuples_d, quality_d, hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); From b8e2760f7929128da1fa43619c36306c59c878d1 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 16 Oct 2021 16:03:29 +0200 Subject: [PATCH 12/29] drive performance tests by envvar --- .../CUDAUtilities/interface/HistoContainerAlgo.h | 3 +-- .../CUDAUtilities/interface/maxCoopBlocks.h | 16 ++++++++++++++-- .../CUDAUtilities/test/OneToManyAssoc_t.h | 2 +- .../CUDAUtilities/test/prefixScan_t.cu | 2 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 6 ++---- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h 
b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index e2b37a3c138b5..bd80b505fdcda 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -134,8 +134,7 @@ namespace cms { auto ws = cms::cuda::make_device_unique(nchunks, stream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, nthreads, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp}; diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 44490110eeb28..9d4db05e9276f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -3,14 +3,26 @@ #define HeterogeneousCore_CUDAUtilities_interface_maxCoopBlocks_h #include <cuda_runtime.h> +#include <algorithm> +#include <cstdlib> template <typename F> -inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device) { +inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { +#define GET_COOP_RED_FACT_FROM_ENV +#ifdef GET_COOP_RED_FACT_FROM_ENV + auto env = getenv("COOP_RED_FACT"); + int redFactFromEnv = env ? atoi(env) : 0; + if (redFactFromEnv != 0) + redFact = redFactFromEnv; +#endif + int numBlocksPerSm = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device); cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, kernel, nthreads, shmem); - return deviceProp.multiProcessorCount * numBlocksPerSm; + int mxblocks = deviceProp.multiProcessorCount * numBlocksPerSm; + // reduce number of blocks to account for multiple CPU threads (redFact <= 0 means no reduction) + return redFact > 0 ? std::max(1, mxblocks / redFact) : mxblocks; } #endif diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h index 5685dce628e59..da4e1be0ba782 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -282,7 +282,7 @@ int main() { auto nchunks = nOnes / nThreads + 1; auto ws = cms::cuda::make_device_unique(nchunks, 0); - int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0); + int maxBlocks = maxCoopBlocks(populate, nThreads, 0, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; auto ncoopblocks = std::min(nBlocks, maxBlocks); auto a1 = v_d.get(); diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index ef0a785e00bd5..61dc6dfc0a3dd 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -153,7 +153,7 @@ int main() { uint32_t *d_psum; cudaCheck(cudaMalloc(&d_psum, nblocks * sizeof(uint32_t))); std::cout << "launch coopBlockPrefixScan " << num_items << ' ' << nblocks << std::endl; - int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0); + int maxBlocks = maxCoopBlocks(doCoop, nthreads, 0, 0, 0); std::cout << "max number of blocks is " << maxBlocks << std::endl; auto ncoopblocks = std::min(nblocks, maxBlocks); void *kernelArgs[] = {&d_in, &d_out2, &num_items, &d_psum}; diff --git 
a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index cca4b0cde5983..52c343a470ba5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -69,8 +69,7 @@ __inline__ void populateHitInTracks(HitContainer const *tuples_d, auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; @@ -112,8 +111,7 @@ __inline__ void populateMultiplicity(HitContainer const *tuples_d, auto ws = cms::cuda::make_device_unique<View::Counter[]>(nchunks, cudaStream); auto wsp = ws.get(); // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event - // static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); - static int maxBlocks = std::max(1, maxCoopBlocks(kernel, blockSize, 0, 0) / 10); + static int maxBlocks = maxCoopBlocks(kernel, blockSize, 0, 0); auto ncoopblocks = std::min(nblocks, maxBlocks); assert(ncoopblocks > 0); void *kernelArgs[] = {&tuples_d, &quality_d, &view, &wsp}; From 446d652df9076188755ca1364fe2d716ad4865e3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 16 Oct 2021 16:29:36 +0200 Subject: [PATCH 13/29] add comment --- HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h index 9d4db05e9276f..f8e493540aacb 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h +++ b/HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h @@ -9,6 +9,8 @@ template <typename F> inline int maxCoopBlocks(F kernel, int nthreads, int shmem, int device, int redFact = 10) { #define GET_COOP_RED_FACT_FROM_ENV + +// to drive performance assessment by envvar #ifdef GET_COOP_RED_FACT_FROM_ENV auto env = getenv("COOP_RED_FACT"); int redFactFromEnv = env ? 
atoi(env) : 0; From 5f6f59676038677f704b8e820046fb052da0231e Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 17 Oct 2021 13:39:54 +0200 Subject: [PATCH 14/29] factorize, encapsulate --- .../interface/HistoContainerAlgo.h | 51 ++++++------------- .../CUDAUtilities/interface/OneToManyAssoc.h | 3 ++ 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index bd80b505fdcda..4177dfda97f83 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -11,8 +11,8 @@ namespace cms { namespace cuda { - template <typename Histo, typename T> - __device__ __inline__ void countFromVector(Histo *__restrict__ h, + template <CountOrFill cof, typename Histo, typename T> + __device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { @@ -23,40 +23,19 @@ namespace cms { int32_t ih = off - offsets - 1; assert(ih >= 0); assert(ih < int(nh)); - (*h).count(v[i], ih); + if constexpr(CountOrFill::count==cof) + (*h).count(v[i], ih); + else + (*h).fill(v[i], i, ih); } } - template <typename Histo, typename T> - __global__ void countFromVectorKernel(Histo *__restrict__ h, + template <CountOrFill cof, typename Histo, typename T> + __global__ void countOrFillFromVectorKernel(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) { - countFromVector(h, nh, v, offsets); - } - - template <typename Histo, typename T> - __device__ __inline__ void fillFromVector(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < int(nh)); - (*h).fill(v[i], i, ih); - } - } - - template <typename Histo, typename T> - __global__ void fillFromVectorKernel(Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) { - fillFromVector(h, nh, v, offsets); + countOrFillFromVector<cof>(h, nh, v, offsets); } template <typename Histo, typename T> @@ -77,15 +56,15 @@ namespace cms { #ifdef __CUDACC__ auto nblocks = (totSize + nthreads - 1) / nthreads; assert(nblocks > 0); - countFromVectorKernel<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::count><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); launchFinalize(view, stream); - fillFromVectorKernel<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::fill><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets); cudaCheck(cudaGetLastError()); #else - countFromVectorKernel(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::count>(h, nh, v, offsets); h->finalize(); - fillFromVectorKernel(h, nh, v, offsets); + countOrFillFromVectorKernel<CountOrFill::fill>(h, nh, v, offsets); #endif } @@ -102,11 +81,11 @@ namespace cms { auto h = static_cast<Histo *>(view.assoc); zeroAndInitCoop(view); grid.sync(); - countFromVector(h, nh, v, offsets); + countOrFillFromVector<CountOrFill::count>(h, nh, v, offsets); grid.sync(); finalizeCoop(view, ws); grid.sync(); - fillFromVector(h, nh, v, offsets); + countOrFillFromVector<CountOrFill::fill>(h, nh, v, offsets); } #endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h index b7bf7ca4f354f..41a88d6f624ec 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h +++ b/HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h @@ -19,6 +19,9 @@ namespace cms { namespace cuda { + 
enum class CountOrFill {count, fill}; + + template <typename Assoc> struct OneToManyAssocView { using Counter = typename Assoc::Counter; From 71631b28890bbc839c84ef22a5f28ca2a405583a Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 17 Oct 2021 14:36:32 +0200 Subject: [PATCH 15/29] propagate factorization --- .../interface/HistoContainerAlgo.h | 44 ++++++---- .../CUDAUtilities/interface/OneToManyAssoc.h | 3 +- .../plugins/CAHitNtupletGeneratorKernels.cc | 8 +- .../plugins/CAHitNtupletGeneratorKernels.cu | 20 +++-- .../CAHitNtupletGeneratorKernelsImpl.h | 86 ++++++------------- 5 files changed, 74 insertions(+), 87 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h index 4177dfda97f83..9de360bc32142 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h @@ -11,11 +11,25 @@ namespace cms { namespace cuda { + template
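// A minimal stand-alone sketch of the CountOrFill dispatch introduced in
// [PATCH 14/29] above. This is an illustration only, not part of the patch
// series: ToyHisto, countOrFill and main are invented stand-ins for the real
// cms::cuda::OneToManyAssoc container and countOrFillFromVector.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class CountOrFill { count, fill };

struct ToyHisto {
  uint32_t sizes[4] = {0, 0, 0, 0};
  void count(uint32_t bin) { ++sizes[bin]; }  // pass 1: accumulate multiplicities
  void fill(uint32_t bin, uint32_t index) {   // pass 2: store the payload
    std::printf("bin %u <- item %u\n", bin, index);
  }
};

// One templated routine; `if constexpr` compiles only the selected branch,
// so the count and fill passes share their common boilerplate at zero cost.
template <CountOrFill cof, typename Histo>
void countOrFill(Histo &h, uint32_t bin, uint32_t index) {
  if constexpr (cof == CountOrFill::count)
    h.count(bin);
  else
    h.fill(bin, index);
}

int main() {
  ToyHisto h;
  countOrFill<CountOrFill::count>(h, 2, 7);  // instantiates the counting branch
  countOrFill<CountOrFill::fill>(h, 2, 7);   // instantiates the filling branch
  assert(h.sizes[2] == 1);
  return 0;
}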