@@ -34,6 +34,7 @@ class TrackingRecHit2DHeterogeneous {
auto hitsModuleStart() const { return m_hitsModuleStart; }
auto hitsLayerStart() { return m_hitsLayerStart; }
auto phiBinner() { return m_hist; }
auto binnerStorage() { return m_histStorage; }
auto iphi() { return m_iphi; }

// only the local coord and detector index
@@ -42,8 +43,9 @@ class TrackingRecHit2DHeterogeneous {
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

private:
// number of elements of size 16 and 32 respectively
static constexpr uint32_t n16 = 4;
static constexpr uint32_t n32 = 9;
static constexpr uint32_t n32 = 10;
Comment on lines 47 to +48

As suggested by @slava77 here, could you add a comment about what 4 and 9 (now 10) stand for?

@VinInn (Author), Dec 15, 2020:

should be pretty obvious from the code where it is used.
They are of course the number of elements with size 16 and size 32 respectively.
I will add a comment.
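
A minimal sketch of the bookkeeping under discussion, not part of the diff: n16 and n32 count the 16-bit and 32-bit SoA columns packed into the two pooled buffers, and column i of a buffer starts at offset i * nHits, which is exactly what the get16/get32 helpers in the constructor below compute. The function and variable names here are hypothetical.

// illustrative only, not part of the PR
#include <cstdint>
#include <vector>

void sliceExample(uint32_t nHits) {
  constexpr uint32_t n16 = 4;   // number of 16-bit SoA columns
  constexpr uint32_t n32 = 10;  // number of 32-bit SoA columns (was 9; +1 for the phi-binner index storage)
  std::vector<uint16_t> store16(nHits * n16);  // stand-ins for the pooled device allocations
  std::vector<float> store32(nHits * n32);
  auto get16 = [&](int i) { return store16.data() + i * nHits; };  // start of 16-bit column i
  auto get32 = [&](int i) { return store32.data() + i * nHits; };  // start of 32-bit column i
  float* xLocal = get32(0);                                        // column 0: local x
  auto* binnerStorage = reinterpret_cast<uint32_t*>(get32(9));     // column 9: phi-binner index storage
  (void)get16;
  (void)xLocal;
  (void)binnerStorage;
}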

static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious

unique_ptr<uint16_t[]> m_store16; //!
@@ -60,6 +62,7 @@ class TrackingRecHit2DHeterogeneous {

// needed as kernel params...
Hist* m_hist;
Hist::index_type* m_histStorage;
uint32_t* m_hitsLayerStart;
int16_t* m_iphi;
};
@@ -98,14 +101,19 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
m_store32 =
Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::Hist::index_type));

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };

// copy all the pointers
m_hist = view->m_hist = m_HistStore.get();
m_histStorage = view->m_histStorage = reinterpret_cast<TrackingRecHit2DSOAView::Hist::index_type*>(get32(9));

view->m_xl = get32(0);
view->m_yl = get32(1);
@@ -17,8 +17,7 @@ class TrackingRecHit2DSOAView {
static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; }
using hindex_type = uint32_t; // if above is <=2^32

using PhiBinner =
cms::cuda::HistoContainer<int16_t, 128, gpuClustering::MaxNumClusters, 8 * sizeof(int16_t), hindex_type, 10>;
using PhiBinner = cms::cuda::HistoContainer<int16_t, 128, -1, 8 * sizeof(int16_t), hindex_type, 10>;

using Hist = PhiBinner; // FIXME

@@ -96,6 +95,7 @@ class TrackingRecHit2DSOAView {
uint32_t* m_hitsLayerStart;

PhiBinner* m_hist; // FIXME use a more descriptive name consistently
PhiBinner::index_type* m_histStorage;

uint32_t m_nHits;
};
49 changes: 49 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h
@@ -0,0 +1,49 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h
#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h

#include <cstdint>

namespace cms {
namespace cuda {

template <typename I, int S>
class FlexiStorage {
public:
constexpr int capacity() const { return S; }

constexpr I& operator[](int i) { return m_v[i]; }
constexpr const I& operator[](int i) const { return m_v[i]; }

constexpr I* data() { return m_v; }
constexpr I const* data() const { return m_v; }

private:
I m_v[S];
};

template <typename I>
class FlexiStorage<I, -1> {
public:
constexpr void init(I* v, int s) {
m_v = v;
m_capacity = s;
}

constexpr int capacity() const { return m_capacity; }

constexpr I& operator[](int i) { return m_v[i]; }
constexpr const I& operator[](int i) const { return m_v[i]; }

constexpr I* data() { return m_v; }
constexpr I const* data() const { return m_v; }

private:
I* m_v;

Review comment:

would it make sense to use

Suggested change:
-    I* m_v;
+    std::unique_ptr<I[]> m_v;

to handle the ownership of the memory?

@VinInn (Author):

it is external storage. This class is allocated on the GPU!

int m_capacity;
};

} // namespace cuda

} // namespace cms

#endif
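
Regarding the ownership question in the review thread above: FlexiStorage<I, -1> only views memory that is owned elsewhere (here, a slice of the m_store32 pool), and the object itself is placed in device memory, so a plain non-owning pointer is the natural choice. A minimal usage sketch, not part of the PR; flexiStorageExample and externalBuffer are hypothetical names, and the std::vector merely stands in for a device allocation.

// illustrative only, not part of the PR
#include <cstdint>
#include <vector>
#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"

void flexiStorageExample() {
  // compile-time capacity: the element array is a data member of the object
  cms::cuda::FlexiStorage<uint32_t, 8> fixed;
  fixed[0] = 1;

  // runtime capacity: the object only points at a buffer owned by someone else;
  // init() adopts the external storage without taking ownership
  std::vector<uint32_t> externalBuffer(1024);
  cms::cuda::FlexiStorage<uint32_t, -1> dyn;
  dyn.init(externalBuffer.data(), static_cast<int>(externalBuffer.size()));
  dyn[0] = fixed[0];
}
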
195 changes: 23 additions & 172 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -1,19 +1,7 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h

#include <algorithm>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif // __CUDA_ARCH__
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

namespace cms {
namespace cuda {
@@ -50,61 +38,27 @@ namespace cms {
}
}

template <typename Histo>
inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
#ifdef __CUDACC__
cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
#else
::memset(poff, 0, size);
#endif
}

template <typename Histo>
inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
#ifdef __CUDACC__
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
auto nthreads = 1024;
auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
poff, poff, Histo::totbins(), ppsws);
cudaCheck(cudaGetLastError());
#else
h->finalize();
#endif
}

template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
launchZero(h, stream);
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(h, stream);
launchFinalize(view, stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
@@ -114,11 +68,6 @@ namespace cms {
#endif
}
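
With SIZE == -1 the index content no longer lives inside the histogram object, so fillManyFromVector now takes the total size as a signed int32_t plus an external buffer for the indices, and launchZero/launchFinalize receive a Histo::View describing both instead of the bare histogram pointer. A minimal call sketch, not part of the PR: fillPhiBins and the d_* buffer names are hypothetical, and the device allocations are assumed to be made by the caller.

// illustrative only, not part of the PR
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

using Binner = cms::cuda::HistoContainer<int16_t, 128, -1, 16, uint32_t, 10>;

void fillPhiBins(Binner* d_hist,                      // device-resident histogram object
                 int16_t const* d_values,             // device array of discretized values
                 uint32_t const* d_offsets,           // device offsets delimiting the sub-histograms (assumed nh + 1 entries)
                 int32_t nValues,
                 Binner::index_type* d_indexStorage,  // external device storage, at least nValues entries
                 cudaStream_t stream) {
  cms::cuda::fillManyFromVector(d_hist, 10, d_values, d_offsets, nValues, 256, d_indexStorage, stream);
}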

template <typename Assoc>
__global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
}

// iterate over N bins left and right of the one containing "v"
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
@@ -142,20 +91,19 @@
}
}

template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
uint32_t SIZE, // max number of element
template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
int32_t SIZE, // max number of elements. If -1, initialized at runtime using external storage
uint32_t S = sizeof(T) * 8, // number of significant bits in T
typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS = 1 // number of histos stored
>
class HistoContainer {
class HistoContainer : public OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE> {
public:
using Counter = uint32_t;

using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;

using index_type = I;
using Base = OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE>;
using View = typename Base::View;
using Counter = typename Base::Counter;
using index_type = typename Base::index_type;
using UT = typename std::make_unsigned<T>::type;

static constexpr uint32_t ilog2(uint32_t v) {
@@ -176,7 +124,8 @@
static constexpr uint32_t nhists() { return NHISTS; }
static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
static constexpr uint32_t capacity() { return SIZE; }

// static_assert(int32_t(totbins())==Base::ctNOnes());

static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

@@ -186,137 +135,39 @@
return (t >> shift) & mask;
}

__host__ __device__ void zero() {
for (auto &i : off)
i = 0;
}

__host__ __device__ __forceinline__ void add(CountersOnly const &co) {
for (uint32_t i = 0; i < totbins(); ++i) {
#ifdef __CUDA_ARCH__
atomicAdd(off + i, co.off[i]);
#else
auto &a = (std::atomic<Counter> &)(off[i]);
a += co.off[i];
#endif
}
}

static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
#ifdef __CUDA_ARCH__
return atomicAdd(&x, 1);
#else
auto &a = (std::atomic<Counter> &)(x);
return a++;
#endif
}

static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
#ifdef __CUDA_ARCH__
return atomicSub(&x, 1);
#else
auto &a = (std::atomic<Counter> &)(x);
return a--;
#endif
}

__host__ __device__ __forceinline__ void countDirect(T b) {
assert(b < nbins());
atomicIncrement(off[b]);
}

__host__ __device__ __forceinline__ void fillDirect(T b, index_type j) {
assert(b < nbins());
auto w = atomicDecrement(off[b]);
assert(w > 0);
bins[w - 1] = j;
}

__host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
auto c = apc.add(n);
if (c.m >= nbins())
return -int32_t(c.m);
off[c.m] = c.n;
for (uint32_t j = 0; j < n; ++j)
bins[c.n + j] = v[j];
return c.m;
}

__host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
off[apc.get().m] = apc.get().n;
}

__host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
auto m = apc.get().m;
auto n = apc.get().n;
if (m >= nbins()) { // overflow!
off[nbins()] = uint32_t(off[nbins() - 1]);
return;
}
auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) {
off[i] = n;
}
}

__host__ __device__ __forceinline__ void count(T t) {
uint32_t b = bin(t);
assert(b < nbins());
atomicIncrement(off[b]);
Base::atomicIncrement(this->off[b]);
}

__host__ __device__ __forceinline__ void fill(T t, index_type j) {
uint32_t b = bin(t);
assert(b < nbins());
auto w = atomicDecrement(off[b]);
auto w = Base::atomicDecrement(this->off[b]);
assert(w > 0);
bins[w - 1] = j;
this->content[w - 1] = j;
}

__host__ __device__ __forceinline__ void count(T t, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
atomicIncrement(off[b]);
Base::atomicIncrement(this->off[b]);
}

__host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
auto w = atomicDecrement(off[b]);
auto w = Base::atomicDecrement(this->off[b]);
assert(w > 0);
bins[w - 1] = j;
}

__host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
assert(off[totbins() - 1] == 0);
blockPrefixScan(off, totbins(), ws);
assert(off[totbins() - 1] == off[totbins() - 2]);
this->content[w - 1] = j;
}

constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

constexpr index_type const *begin() const { return bins; }
constexpr index_type const *end() const { return begin() + size(); }

constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }

Counter off[totbins()];
int32_t psws; // prefix-scan working space
index_type bins[capacity()];
};

template <typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;

} // namespace cuda
} // namespace cms
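
Net effect of this rewrite: HistoContainer keeps only the binning logic (bin, count, fill) and takes its counters (off), content storage (content) and atomic helpers from the OneToManyAssoc base, which now lives in its own header (hence the new include at the top of the file and the removal of the old alias above); the SIZE parameter selects between an embedded content array and external storage. A small compile-time sketch, not part of the PR; the FixedHist and FlexHist names are hypothetical.

// illustrative only, not part of the PR
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// content array embedded in the object, as before
using FixedHist = cms::cuda::HistoContainer<int16_t, 128, 4000, 16, uint32_t, 1>;
// content array supplied at runtime through external storage (see FlexiStorage above)
using FlexHist = cms::cuda::HistoContainer<int16_t, 128, -1, 16, uint32_t, 1>;

static_assert(FixedHist::nbins() == 128);
static_assert(FixedHist::totbins() == 129);                  // NHISTS * NBINS + 1
static_assert(FlexHist::totbins() == FixedHist::totbins());  // -1 changes the storage flavour, not the binning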
