From ce7301ff375896a44e8094fef146883a51258528 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 5 Mar 2020 12:36:29 +0100 Subject: [PATCH 01/15] Add common tools for processing on GPUs --- .../GeometrySurface/interface/SOARotation.h | 28 ++ .../GeometrySurface/test/BuildFile.xml | 13 + .../test/gpuFrameTransformKernel.cu | 40 +++ .../test/gpuFrameTransformTest.cpp | 114 ++++++ DataFormats/Math/BuildFile.xml | 7 +- .../Math/interface/choleskyInversion.h | 333 +++++++++++++++++ DataFormats/Math/test/BuildFile.xml | 86 +++-- DataFormats/Math/test/CholeskyInvert_t.cpp | 136 +++++++ DataFormats/Math/test/CholeskyInvert_t.cu | 207 +++++++++++ DataFormats/Math/test/cudaAtan2Test.cu | 34 +- DataFormats/Math/test/cudaMathTest.cu | 53 ++- DataFormats/Math/test/testAtan2.cpp | 4 + .../interface/AtomicPairCounter.h | 52 +++ .../CUDAUtilities/interface/HistoContainer.h | 340 ++++++++++++++++++ .../CUDAUtilities/interface/cudaCompat.h | 103 ++++++ .../interface/cudastdAlgorithm.h | 70 ++++ .../CUDAUtilities/interface/eigenSoA.h | 55 +++ .../CUDAUtilities/interface/prefixScan.h | 169 +++++++++ .../CUDAUtilities/interface/radixSort.h | 266 ++++++++++++++ .../CUDAUtilities/src/cudaCompat.cc | 15 + .../CUDAUtilities/test/AtomicPairCounter_t.cu | 65 ++++ .../CUDAUtilities/test/HistoContainer_t.cpp | 146 ++++++++ .../CUDAUtilities/test/HistoContainer_t.cu | 154 ++++++++ .../CUDAUtilities/test/OneHistoContainer_t.cu | 140 ++++++++ .../CUDAUtilities/test/OneToManyAssoc_t.cpp | 1 + .../CUDAUtilities/test/OneToManyAssoc_t.cu | 1 + .../CUDAUtilities/test/OneToManyAssoc_t.h | 308 ++++++++++++++++ .../CUDAUtilities/test/cudastdAlgorithm_t.cpp | 34 ++ .../CUDAUtilities/test/cudastdAlgorithm_t.cu | 30 ++ .../CUDAUtilities/test/eigenSoA_t.cpp | 1 + .../CUDAUtilities/test/eigenSoA_t.cu | 1 + .../CUDAUtilities/test/eigenSoA_t.h | 101 ++++++ .../CUDAUtilities/test/prefixScan_t.cu | 165 +++++++++ .../CUDAUtilities/test/radixSort_t.cu | 202 +++++++++++ 34 files changed, 3388 insertions(+), 86 deletions(-) create mode 100644 DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu create mode 100644 DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp create mode 100644 DataFormats/Math/interface/choleskyInversion.h create mode 100644 DataFormats/Math/test/CholeskyInvert_t.cpp create mode 100644 DataFormats/Math/test/CholeskyInvert_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/prefixScan.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/radixSort.h create mode 100644 HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc create mode 100644 HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h create mode 100644 
HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu create mode 100644 HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu diff --git a/DataFormats/GeometrySurface/interface/SOARotation.h b/DataFormats/GeometrySurface/interface/SOARotation.h index 331a56b7ecf57..d75efef4736cb 100644 --- a/DataFormats/GeometrySurface/interface/SOARotation.h +++ b/DataFormats/GeometrySurface/interface/SOARotation.h @@ -100,6 +100,34 @@ class SOAFrame { uz += pz; } + constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const { + auto const &r = rot; + gl[0] = r.xx() * (r.xx() * cxx + r.yx() * cxy) + r.yx() * (r.xx() * cxy + r.yx() * cyy); + gl[1] = r.xx() * (r.xy() * cxx + r.yy() * cxy) + r.yx() * (r.xy() * cxy + r.yy() * cyy); + gl[2] = r.xy() * (r.xy() * cxx + r.yy() * cxy) + r.yy() * (r.xy() * cxy + r.yy() * cyy); + gl[3] = r.xx() * (r.xz() * cxx + r.yz() * cxy) + r.yx() * (r.xz() * cxy + r.yz() * cyy); + gl[4] = r.xy() * (r.xz() * cxx + r.yz() * cxy) + r.yy() * (r.xz() * cxy + r.yz() * cyy); + gl[5] = r.xz() * (r.xz() * cxx + r.yz() * cxy) + r.yz() * (r.xz() * cxy + r.yz() * cyy); + } + + constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const { + auto const &r = rot; + + T cxx = ge[0]; + T cyx = ge[1]; + T cyy = ge[2]; + T czx = ge[3]; + T czy = ge[4]; + T czz = ge[5]; + + lxx = r.xx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.xy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.xz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lxy = r.yx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.yy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.yz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lyy = r.yx() * (r.yx() * cxx + r.yy() * cyx + r.yz() * czx) + + r.yy() * (r.yx() * cyx + r.yy() * cyy + r.yz() * czy) + r.yz() * (r.yx() * czx + r.yy() * czy + r.yz() * czz); + } + constexpr inline T x() const { return px; } constexpr inline T y() const { return py; } constexpr inline T z() const { return pz; } diff --git a/DataFormats/GeometrySurface/test/BuildFile.xml b/DataFormats/GeometrySurface/test/BuildFile.xml index 050cdb4c8f19d..5f4db224a639b 100644 --- a/DataFormats/GeometrySurface/test/BuildFile.xml +++ b/DataFormats/GeometrySurface/test/BuildFile.xml @@ -13,3 +13,16 @@ + + + + + + + + + + + + + diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu new file mode 100644 index 0000000000000..c24510146fb59 --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void toGlobal(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= n) + return; + + frame[0].toGlobal(xl[i], yl[i], x[i], y[i], z[i]); + frame[0].toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i); +} + +void toGlobalWrapper(SOAFrame const* frame, + float const* xl, + float const* yl, + 
float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int threadsPerBlock = 256; + int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA toGlobal kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" + << std::endl; + + cms::cuda::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); +} diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp new file mode 100644 index 0000000000000..ad62b7a1d131c --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "DataFormats/GeometrySurface/interface/TkRotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +void toGlobalWrapper(SOAFrame const *frame, + float const *xl, + float const *yl, + float *x, + float *y, + float *z, + float const *le, + float *ge, + uint32_t n); + +int main(void) { + cms::cudatest::requireDevices(); + + typedef float T; + typedef TkRotation Rotation; + typedef SOARotation SRotation; + typedef GloballyPositioned Frame; + typedef SOAFrame SFrame; + typedef typename Frame::PositionType Position; + typedef typename Frame::GlobalVector GlobalVector; + typedef typename Frame::GlobalPoint GlobalPoint; + typedef typename Frame::LocalVector LocalVector; + typedef typename Frame::LocalPoint LocalPoint; + + constexpr uint32_t size = 10000; + constexpr uint32_t size32 = size * sizeof(float); + + float xl[size], yl[size]; + float x[size], y[size], z[size]; + + // errors + float le[3 * size]; + float ge[6 * size]; + + auto d_xl = cms::cuda::make_device_unique(size, nullptr); + auto d_yl = cms::cuda::make_device_unique(size, nullptr); + + auto d_x = cms::cuda::make_device_unique(size, nullptr); + auto d_y = cms::cuda::make_device_unique(size, nullptr); + auto d_z = cms::cuda::make_device_unique(size, nullptr); + + auto d_le = cms::cuda::make_device_unique(3 * size, nullptr); + auto d_ge = cms::cuda::make_device_unique(6 * size, nullptr); + + double a = 0.01; + double ca = std::cos(a); + double sa = std::sin(a); + + Rotation r1(ca, sa, 0, -sa, ca, 0, 0, 0, 1); + Frame f1(Position(2, 3, 4), r1); + std::cout << "f1.position() " << f1.position() << std::endl; + std::cout << "f1.rotation() " << '\n' << f1.rotation() << std::endl; + + SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation()); + + auto d_sf = cms::cuda::make_device_unique(sizeof(SFrame), nullptr); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); + + for (auto i = 0U; i < size; ++i) { + xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); + le[3 * i] = 0.01f; + le[3 * i + 2] = (i > size / 2) ? 
1.f : 0.04f;
+    le[3 * i + 1] = 0.;  // off-diagonal term of the local 2x2 covariance
+  }
+  std::random_shuffle(xl, xl + size);
+  std::random_shuffle(yl, yl + size);
+
+  cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice));
+  cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice));
+
+  toGlobalWrapper((SFrame const *)(d_sf.get()),
+                  d_xl.get(),
+                  d_yl.get(),
+                  d_x.get(),
+                  d_y.get(),
+                  d_z.get(),
+                  d_le.get(),
+                  d_ge.get(),
+                  size);
+  cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost));
+
+  float eps = 0.;
+  for (auto i = 0U; i < size; ++i) {
+    auto gp = f1.toGlobal(LocalPoint(xl[i], yl[i]));
+    eps = std::max(eps, std::abs(x[i] - gp.x()));
+    eps = std::max(eps, std::abs(y[i] - gp.y()));
+    eps = std::max(eps, std::abs(z[i] - gp.z()));
+  }
+
+  std::cout << "max eps " << eps << std::endl;
+
+  return 0;
+}
diff --git a/DataFormats/Math/BuildFile.xml b/DataFormats/Math/BuildFile.xml
index 6aa1d86287860..83d06125a017c 100644
--- a/DataFormats/Math/BuildFile.xml
+++ b/DataFormats/Math/BuildFile.xml
@@ -1,6 +1,7 @@ - - + + + - +
diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
new file mode 100644
index 0000000000000..eba2a17648008
--- /dev/null
+++ b/DataFormats/Math/interface/choleskyInversion.h
@@ -0,0 +1,333 @@
+#ifndef DataFormat_Math_choleskyInversion_h
+#define DataFormat_Math_choleskyInversion_h
+
+#include <cmath>
+
+#include <Eigen/Core>
+
+/**
+ * fully inlined specialized code to perform the inversion of a
+ * positive definite matrix of rank up to 6.
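For illustration, a minimal host-side sketch (not part of this patch; the helper name and tolerances are hypothetical) of exercising the new covariance methods added to SOAFrame: with an orthonormal rotation, projecting the packed global 3x3 error back into the local frame must reproduce the 2x2 input.

#include <cassert>
#include <cmath>

#include "DataFormats/GeometrySurface/interface/SOARotation.h"

// Round-trip a local 2x2 covariance through the packed global 3x3
// representation (6 independent elements, lower triangle); the
// projection back into the local frame recovers the input up to
// float rounding.
inline void checkCovRoundTrip(SOAFrame<float> const& f) {
  float const cxx = 0.01f, cxy = 0.f, cyy = 0.04f;
  float ge[6];
  f.toGlobal(cxx, cxy, cyy, ge);  // local 2x2 -> packed global 3x3
  float lxx = 0.f, lxy = 0.f, lyy = 0.f;
  f.toLocal(ge, lxx, lxy, lyy);  // packed global 3x3 -> local 2x2
  assert(std::abs(lxx - cxx) < 1.e-5f);
  assert(std::abs(lxy - cxy) < 1.e-5f);
  assert(std::abs(lyy - cyy) < 1.e-5f);
}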
+ * + * originally by + * @author Manuel Schiller + * @date Aug 29 2008 + * + * + */ +namespace choleskyInversion { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 
* luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - 
luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = + luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + template + inline constexpr void symmetrize33(M& dst) { + symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); + } + template + inline constexpr void symmetrize44(M& dst) { + symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + dst(2, 3) = dst(3, 2); + } + template + inline constexpr void 
symmetrize55(M& dst) { + symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); + } + template + inline constexpr void symmetrize66(M& dst) { + symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); + } + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } + +} // namespace choleskyInversion + +#endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index 6b1112e30472c..5f2dd3854f6d1 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -1,27 +1,31 @@ - - - - + + + + - - - - + + + + - + + - + + - + + - + + @@ -29,75 +33,97 @@ - + - + + - + + - + + - + + - + + + + + - + - + - + + - + + - + + + + - + + + - - + + - - + + - - + + + + + + + + + diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp new file mode 100644 index 0000000000000..f1bf92353dfc3 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -0,0 +1,136 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride >; + +// generate matrices +template +void genMatrix(M& m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? 
"SOA" : "AOS") + << " mode" << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr unsigned int SIZE = 4 * 1024; + + alignas(128) MX mm[stride()]; // just storage in case of SOA + double* __restrict__ p = (double*)__builtin_assume_aligned(mm, 128); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto& m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto& m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + delta -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep +#ifdef __clang__ +#pragma clang loop vectorize(enable) interleave(enable) +#endif + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto& m : mm) { + choleskyInversion::invert(m, m); + } + + delta += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "x86 computation took " << std::chrono::duration_cast(delta).count() / DNNK + << ' ' << " ms" << std::endl; +} + +int main() { + go<2>(false); + go<3>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<3>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu new file mode 100644 index 0000000000000..55df4ed23f20d --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -0,0 +1,207 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename 
std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elements
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix positive definite
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int N>
+void go(bool soa) {
+  constexpr unsigned int DIM = N;
+  using MX = MXN<N>;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+
+  auto start = std::chrono::high_resolution_clock::now();
+  auto delta = start - start;
+  auto delta1 = delta;
+  auto delta2 = delta;
+
+  constexpr unsigned int SIZE = 4 * 1024;
+
+  MX mm[stride()];  // just storage in case of SOA
+  double *__restrict__ p = (double *)(mm);
+
+  if (soa) {
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      genMatrix(m);
+    }
+  } else {
+    for (auto &m : mm)
+      genMatrix(m);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  if (soa)
+    for (unsigned int i = 0; i < SIZE; ++i) {
+      MapMX<N> m(p + i);
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+  else
+    for (auto &m : mm) {
+      choleskyInversion::invert(m, m);
+      choleskyInversion::invert(m, m);
+    }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  auto m_d = cms::cuda::make_device_unique<double[]>(DIM * DIM * stride(), nullptr);
+  cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice));
+
+  constexpr int NKK =
+#ifdef DOPROF
+      2;
+#else
+      1000;
+#endif
+  for (int kk = 0; kk < NKK; ++kk) {
+    int threadsPerBlock = 128;
+    int blocksPerGrid = SIZE / threadsPerBlock;
+
+    delta -= (std::chrono::high_resolution_clock::now() - start);
+
+    if (soa)
+      cms::cuda::launch(invertSOA<N>, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE);
+    else
+      cms::cuda::launch(invert<MX>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+
+    cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+
+    delta += (std::chrono::high_resolution_clock::now() - start);
+
+    if (0 == kk)
+      std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+    if (!soa) {
+      delta1 -= (std::chrono::high_resolution_clock::now() - start);
+
+#ifndef DOPROF
+      cms::cuda::launch(invertSeq<MX>, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE);
+      cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost));
+#endif
+      delta1 += (std::chrono::high_resolution_clock::now() - start);
+
+      if (0 == kk)
+        std::cout << mm[SIZE / 2](1, 1) << std::endl;
+    }
+
+    delta2 -= (std::chrono::high_resolution_clock::now() - start);
+    if (soa)
+#pragma GCC ivdep
+      for (unsigned int i = 0; i < SIZE; ++i) {
+        MapMX<N> m(p + i);
+        choleskyInversion::invert(m, m);
+      }
+    else
+#pragma GCC ivdep
+      for (auto &m : mm) {
+        choleskyInversion::invert(m, m);
+      }
+
+    delta2 += (std::chrono::high_resolution_clock::now() - start);
+  }
+
+  std::cout << mm[SIZE / 2](1, 1) << std::endl;
+
+  double DNNK = NKK;
+  std::cout << "cuda/cudaSeq/x86 computation took "
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta1).count() / DNNK << ' '
+            << std::chrono::duration_cast<std::chrono::milliseconds>(delta2).count() / DNNK << ' ' << " ms"
+            << std::endl;
+}
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  go<2>(false);
+  go<4>(false);
+  go<5>(false);
+ 
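For reference, the benchmark above times invert() in bulk; in isolation the entry point is just as usable on a single Eigen matrix, dispatching on the column count to the hand-unrolled invertNN plus symmetrizeNN. A minimal self-contained sketch (function name and tolerance are illustrative, not from the patch):

#include <cassert>

#include <Eigen/Core>

#include "DataFormats/Math/interface/choleskyInversion.h"

// Invert a small symmetric positive-definite matrix in place
// (invert33 computes all Cholesky factors before writing, so
// src == dst is safe) and check M * M^-1 against the identity.
inline void exampleInvert() {
  Eigen::Matrix3d m;
  m << 4.0, 1.0, 0.5,  //
      1.0, 3.0, 0.2,   //
      0.5, 0.2, 2.0;
  Eigen::Matrix3d inv = m;
  choleskyInversion::invert(inv, inv);  // rank 3 -> invert33 + symmetrize33
  assert((m * inv - Eigen::Matrix3d::Identity()).norm() < 1.e-10);
}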
go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index ecc0be911c777..731447fe826e4 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -25,10 +25,13 @@ end #include #include #include - -#include "cuda/api_wrappers.h" +#include #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" constexpr float xmin = -100.001; // avoid 0 constexpr float incr = 0.04; @@ -62,15 +65,13 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - // atan2 delta -= (std::chrono::high_resolution_clock::now() - start); - auto diff_d = cuda::memory::device::make_unique(current_device, 3); + auto diff_d = cms::cuda::make_device_unique(3, nullptr); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -79,9 +80,9 @@ void go() { std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y << " threads\n"; - cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); + cms::cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; @@ -95,26 +96,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go<3>(); go<5>(); go<7>(); go<9>(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) 
{ std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 6aeaa0f2ededb..dd6576de46c1c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -25,12 +25,7 @@ end #include #include #include - -#include "cuda/api_wrappers.h" - -#include -#include -#include +#include #ifdef __CUDACC__ #define inline __host__ __device__ inline @@ -40,6 +35,14 @@ end #include #endif +#include "DataFormats/Math/interface/approx_log.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + std::mt19937 eng; std::mt19937 eng2; std::uniform_real_distribution rgen(0., 1.); @@ -85,8 +88,6 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - int numElements = 200000; size_t size = numElements * sizeof(float); std::cout << "[Vector of " << numElements << " elements]\n"; @@ -100,12 +101,12 @@ void go() { std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); }); delta -= (std::chrono::high_resolution_clock::now() - start); - auto d_A = cuda::memory::device::make_unique(current_device, numElements); - auto d_B = cuda::memory::device::make_unique(current_device, numElements); - auto d_C = cuda::memory::device::make_unique(current_device, numElements); + auto d_A = cms::cuda::make_device_unique(numElements, nullptr); + auto d_B = cms::cuda::make_device_unique(numElements, nullptr); + auto d_C = cms::cuda::make_device_unique(numElements, nullptr); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -116,19 +117,21 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cms::cuda::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, 
cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -178,27 +181,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + cms::cudatest::requireDevices(); try { go(); go(); go(); - go(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/testAtan2.cpp b/DataFormats/Math/test/testAtan2.cpp index 207e69c848de9..11c951fc2e06b 100644 --- a/DataFormats/Math/test/testAtan2.cpp +++ b/DataFormats/Math/test/testAtan2.cpp @@ -195,6 +195,10 @@ void testIntPhi() { } int main() { + constexpr float p2i = ((int)(std::numeric_limits::max()) + 1) / M_PI; + constexpr float i2p = M_PI / ((int)(std::numeric_limits::max()) + 1); + std::cout << std::hexfloat << "p2i i2p " << p2i << " " << i2p << std::defaultfloat << std::endl; + std::cout << unsafe_atan2f<5>(0.f, 0.f) << " " << std::atan2(0., 0.) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, 0.5f) << " " << std::atan2(0.5, 0.5) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, -0.5f) << " " << std::atan2(0.5, -0.5) << std::endl; diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h new file mode 100644 index 0000000000000..8854b02176260 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -0,0 +1,52 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h +#define HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +class AtomicPairCounter { +public: + using c_type = unsigned long long int; + + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } + + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } + + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; + + union Atomic2 { + Counters counters; + c_type ac; + }; + + static constexpr c_type incr = 1UL << 32; + + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
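As an aside on the intended use of this counter: add() returns the pair of counters as they were before the update, so concurrent callers each obtain a disjoint slice [m, m + i) of a shared output buffer. A serial host-side sketch of that reservation pattern (names invented here):

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"

// Three "producers" reserve room for their items: n counts producers,
// m is the running offset into a common output buffer.
inline void reserveExample() {
  AtomicPairCounter apc(0);
  uint32_t const sizes[3] = {5, 2, 7};
  for (auto s : sizes) {
    auto prev = apc.add(s);  // counters *before* this add
    // this producer owns out[prev.m] ... out[prev.m + s - 1]
    (void)prev;
  }
  // here apc.get().n == 3 and apc.get().m == 14
}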
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; +#ifdef __CUDA_ARCH__ + ret.ac = atomicAdd(&counter.ac, c); +#else + ret.ac = counter.ac; + counter.ac += c; +#endif + return ret.counters; + } + +private: + Atomic2 counter; +}; + +#endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h new file mode 100644 index 0000000000000..67b2b46f45101 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -0,0 +1,340 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h + +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace cms { + namespace cuda { + + template + __global__ void countFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).count(v[i], ih); + } + } + + template + __global__ void fillFromVector(Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) { + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < int(nh)); + (*h).fill(v[i], i, ih); + } + } + + template + inline void launchZero(Histo *__restrict__ h, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); +#ifdef __CUDACC__ + cudaCheck(cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream)); +#else + ::memset(off, 0, 4 * Histo::totbins()); +#endif + } + + template + inline void launchFinalize(Histo *__restrict__ h, + uint8_t *__restrict__ ws +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + , + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { +#ifdef __CUDACC__ + assert(ws); + uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off)); + size_t wss = Histo::wsSize(); + assert(wss > 0); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); +#else + h->finalize(); +#endif + } + + template + inline void fillManyFromVector(Histo *__restrict__ h, + uint8_t *__restrict__ ws, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + int nthreads, + cudaStream_t stream +#ifndef __CUDACC__ + = cudaStreamDefault +#endif + ) { + launchZero(h, stream); +#ifdef __CUDACC__ + auto nblocks = (totSize + nthreads - 1) / nthreads; + 
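To make the calling convention of fillManyFromVector concrete before the kernel launches below, here is a sketch of filling a single histogram serially (no nvcc), assuming the HistoContainer template parameters <typename T, uint32_t NBINS, uint32_t SIZE, uint32_t S, typename I, uint32_t NHISTS> with the defaults as reconstructed further down; all names in the sketch are illustrative.

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// 16-bit values, 128 bins, at most 1024 entries, one histogram.
using Hist = HistoContainer<uint16_t, 128, 1024>;

inline void fillExample(uint16_t const* v, uint32_t n) {
  Hist h;
  uint32_t const offsets[2] = {0, n};  // one histogram covering v[0, n)
  // ws may be null in the serial build; the stream argument defaults.
  cms::cuda::fillManyFromVector(&h, nullptr, 1, v, offsets, n, 256);
  // now h.size(b) is the multiplicity of bin b and
  // [h.begin(b), h.end(b)) are the indices of its entries in v.
}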
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+      cudaCheck(cudaGetLastError());
+      launchFinalize(h, ws, stream);
+      fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+      cudaCheck(cudaGetLastError());
+#else
+      countFromVector(h, nh, v, offsets);
+      h->finalize();
+      fillFromVector(h, nh, v, offsets);
+#endif
+    }
+
+    template <typename Assoc>
+    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
+      assoc->bulkFinalizeFill(*apc);
+    }
+
+  }  // namespace cuda
+}  // namespace cms
+
+// iterate over N bins left and right of the one containing "v"
+template <typename Hist, typename V, typename Func>
+__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
+  int bs = Hist::bin(value);
+  int be = std::min(int(Hist::nbins() - 1), bs + n);
+  bs = std::max(0, bs - n);
+  assert(be >= bs);
+  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+    func(*pj);
+  }
+}
+
+// iterate over bins containing all values in window wmin, wmax
+template <typename Hist, typename V, typename Func>
+__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
+  auto bs = Hist::bin(wmin);
+  auto be = Hist::bin(wmax);
+  assert(be >= bs);
+  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+    func(*pj);
+  }
+}
+
+template <typename T,                  // type of the discretized input values
+          uint32_t NBINS,              // number of bins
+          uint32_t SIZE,               // max number of elements
+          uint32_t S = sizeof(T) * 8,  // number of significant bits in T
+          typename I = uint32_t,       // type stored in the container
+          uint32_t NHISTS = 1          // number of histograms stored
+          >
+class HistoContainer {
+public:
+  using Counter = uint32_t;
+
+  using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;
+
+  using index_type = I;
+  using UT = typename std::make_unsigned<T>::type;
+
+  static constexpr uint32_t ilog2(uint32_t v) {
+    constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+    constexpr uint32_t s[] = {1, 2, 4, 8, 16};
+
+    uint32_t r = 0;  // result of log2(v) will go here
+    for (auto i = 4; i >= 0; i--)
+      if (v & b[i]) {
+        v >>= s[i];
+        r |= s[i];
+      }
+    return r;
+  }
+
+  static constexpr uint32_t sizeT() { return S; }
+  static constexpr uint32_t nbins() { return NBINS; }
+  static constexpr uint32_t nhists() { return NHISTS; }
+  static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
+  static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
+  static constexpr uint32_t capacity() { return SIZE; }
+
+  static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
+
+  __host__ static size_t wsSize() {
+#ifdef __CUDACC__
+    uint32_t *v = nullptr;
+    void *d_temp_storage = nullptr;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
+    return temp_storage_bytes;
+#else
+    return 0;
+#endif
+  }
+
+  static constexpr UT bin(T t) {
+    constexpr uint32_t shift = sizeT() - nbits();
+    constexpr uint32_t mask = (1 << nbits()) - 1;
+    return (t >> shift) & mask;
+  }
+
+  __host__ __device__ void zero() {
+    for (auto &i : off)
+      i = 0;
+  }
+
+  __host__ __device__ void add(CountersOnly const &co) {
+    for (uint32_t i = 0; i < totbins(); ++i) {
+#ifdef __CUDA_ARCH__
+      atomicAdd(off + i, co.off[i]);
+#else
+      auto &a = (std::atomic<Counter> &)(off[i]);
+      a += co.off[i];
+#endif
+    }
+  }
+
+  static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
+#ifdef __CUDA_ARCH__
+    return atomicAdd(&x, 1);
+#else
+    auto &a = (std::atomic<Counter> &)(x);
+    return a++;
+#endif
+  }
+
+  static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
+#ifdef __CUDA_ARCH__
+    return atomicSub(&x, 1);
+#else
+    auto &a = (std::atomic<Counter> &)(x);
+    return a--;
+#endif
+  }
+
+  __host__ __device__ __forceinline__ void countDirect(T b) {
+    assert(b < nbins());
+    atomicIncrement(off[b]);
+  }
+
+  __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) {
+    assert(b < nbins());
+    auto w = 
atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __device__ __host__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __device__ __host__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! + off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; +}; + +template +using OneToManyAssoc = HistoContainer; + +#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h new file mode 100644 index 0000000000000..32c7d7b481a50 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -0,0 +1,103 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h +#define HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h + +/* + * Everything you need to run cuda code in plain sequential c++ code + */ + +#ifndef __CUDACC__ + +#include +#include +#include + +#include + +namespace cudaCompat { + +#ifndef __CUDA_RUNTIME_H__ + struct dim3 { + uint32_t x, y, z; + }; +#endif + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; 
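These stubs are what let the same __global__ code compile twice, once with nvcc and once as plain C++. As a sketch of the pattern (kernel and driver are invented for illustration), in a host-only translation unit the kernel collapses to an inline function iterated over a 1x1 grid:

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

// The canonical grid-stride loop; with cudaCompat the indices collapse
// to first == 0 and stride == 1, i.e. a plain serial loop.
__global__ void scale(float* x, float a, uint32_t n) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (auto i = first; i < n; i += gridDim.x * blockDim.x)
    x[i] *= a;
}

#ifndef __CUDACC__
inline void scaleOnHost(float* x, float a, uint32_t n) {
  cudaCompat::resetGrid();  // blockIdx = {0,0,0}, gridDim = {1,1,1}
  scale(x, a, n);           // __global__ is now just "inline"
}
#endif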
+  }
+
+  template <typename T1, typename T2>
+  T1 atomicMin(T1* a, T2 b) {
+    auto ret = *a;
+    *a = std::min(*a, T1(b));
+    return ret;
+  }
+  template <typename T1, typename T2>
+  T1 atomicMax(T1* a, T2 b) {
+    auto ret = *a;
+    *a = std::max(*a, T1(b));
+    return ret;
+  }
+
+  inline void __syncthreads() {}
+  inline void __threadfence() {}
+  inline bool __syncthreads_or(bool x) { return x; }
+  inline bool __syncthreads_and(bool x) { return x; }
+  template <typename T>
+  inline T __ldg(T const* x) {
+    return *x;
+  }
+
+  inline void resetGrid() {
+    blockIdx = {0, 0, 0};
+    gridDim = {1, 1, 1};
+  }
+
+}  // namespace cudaCompat
+
+// some not needed as done by cuda runtime...
+#ifndef __CUDA_RUNTIME_H__
+#define __host__
+#define __device__
+#define __global__
+#define __shared__
+#define __forceinline__
+#endif
+
+// make sure functions are inlined to avoid multiple definition
+#ifndef __CUDA_ARCH__
+#undef __global__
+#define __global__ inline __attribute__((always_inline))
+#undef __forceinline__
+#define __forceinline__ inline __attribute__((always_inline))
+#endif
+
+#ifndef __CUDA_ARCH__
+using namespace cudaCompat;
+#endif
+
+#endif
+
+#endif  // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
new file mode 100644
index 0000000000000..4ff01d75ad02f
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h
@@ -0,0 +1,70 @@
+#ifndef HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
+#define HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h
+
+#include <utility>
+
+#include <cuda_runtime.h>
+
+// reimplementation of std algorithms able to compile with CUDA and run on GPUs,
+// mostly by declaring them constexpr
+
+namespace cuda_std {
+
+  template <typename T = void>
+  struct less {
+    __host__ __device__ constexpr bool operator()(const T &lhs, const T &rhs) const { return lhs < rhs; }
+  };
+
+  template <>
+  struct less<void> {
+    template <typename T, typename U>
+    __host__ __device__ constexpr bool operator()(const T &lhs, const U &rhs) const {
+      return lhs < rhs;
+    }
+  };
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt lower_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    auto count = last - first;
+
+    while (count > 0) {
+      auto it = first;
+      auto step = count / 2;
+      it += step;
+      if (comp(*it, value)) {
+        first = ++it;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    return first;
+  }
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    auto count = last - first;
+
+    while (count > 0) {
+      auto it = first;
+      auto step = count / 2;
+      it += step;
+      if (!comp(value, *it)) {
+        first = ++it;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    return first;
+  }
+
+  template <typename RandomIt, typename T, typename Compare = less<T>>
+  __host__ __device__ constexpr RandomIt binary_find(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    first = cuda_std::lower_bound(first, last, value, comp);
+    return first != last && !comp(value, *first) ? 
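These binary searches are exactly what countFromVector/fillFromVector above use to map a flat element index back to the histogram that owns it. A host-side sketch of that offset lookup (data and function name invented):

#include <cassert>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"

// offsets[k] .. offsets[k+1] delimit the elements of "histogram" k;
// upper_bound finds the first offset greater than i, so off - offsets - 1
// is the histogram owning element i.
inline void offsetLookupExample() {
  uint32_t const offsets[4] = {0, 3, 7, 10};  // 3 ranges over 10 elements
  for (uint32_t i = 0; i < 10; ++i) {
    auto off = cuda_std::upper_bound(offsets, offsets + 4, i);
    int32_t ih = off - offsets - 1;
    assert(ih >= 0 && ih < 3);
    assert(i >= offsets[ih] && i < offsets[ih + 1]);
  }
}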
first : last; + } + +} // namespace cuda_std + +#endif // HeterogeneousCore_CUDAUtilities_cudastdAlgorithm_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h new file mode 100644 index 0000000000000..021402c93c203 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h @@ -0,0 +1,55 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h +#define HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h + +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace eigenSoA { + + constexpr bool isPowerOf2(int32_t v) { return v && !(v & (v - 1)); } + + template + class alignas(128) ScalarSoA { + public: + using Scalar = T; + + __host__ __device__ constexpr Scalar& operator()(int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator()(int32_t i) const { return __ldg(data_ + i); } + __host__ __device__ constexpr Scalar& operator[](int32_t i) { return data_[i]; } + __device__ constexpr const Scalar operator[](int32_t i) const { return __ldg(data_ + i); } + + __host__ __device__ constexpr Scalar* data() { return data_; } + __host__ __device__ constexpr Scalar const* data() const { return data_; } + + private: + Scalar data_[S]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + + template + class alignas(128) MatrixSoA { + public: + using Scalar = typename M::Scalar; + using Map = Eigen::Map >; + using CMap = Eigen::Map >; + + __host__ __device__ constexpr Map operator()(int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator()(int32_t i) const { return CMap(data_ + i); } + __host__ __device__ constexpr Map operator[](int32_t i) { return Map(data_ + i); } + __host__ __device__ constexpr CMap operator[](int32_t i) const { return CMap(data_ + i); } + + private: + Scalar data_[S * M::RowsAtCompileTime * M::ColsAtCompileTime]; + static_assert(isPowerOf2(S), "SoA stride not a power of 2"); + static_assert(sizeof(data_) % 128 == 0, "SoA size not a multiple of 128"); + }; + +} // namespace eigenSoA + +#endif // HeterogeneousCore_CUDAUtilities_interface_eigenSoA_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..8b784bdd61bfe --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -0,0 +1,169 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { + // ci and co may be the same + auto x = ci[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, offset); + if (laneId >= offset) + x += y; + } + co[i] = x; +} +#endif + +//same as above may remove +#ifdef __CUDA_ARCH__ +template +__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(mask, x, 
offset); + if (laneId >= offset) + x += y; + } + c[i] = x; +} +#endif + +// limited to 32*32 elements.... +template +__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; +#endif +} + +// same as above, may remove +// limited to 32*32 elements.... +template +__device__ __host__ void __forceinline__ blockPrefixScan(T* c, + uint32_t size, + T* ws +#ifndef __CUDA_ARCH__ + = nullptr +#endif +) { +#ifdef __CUDA_ARCH__ + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); +#else + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; +#endif +} + +// limited to 1024*1024 elements.... +template +__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) + assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... 
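+  // Fixup pass, spelled out: at this point psum[k] holds the inclusive scan
+  // of the per-block partial sums, so every element of chunk k (k > 0) is
+  // still missing the grand total of chunks 0..k-1, which is psum[k - 1].
+  // E.g. for an all-ones input of size 3000: psum = {1024, 2048, 3000},
+  // and co[1500] = 477 (local scan within chunk 1) + psum[0] = 1501.
+  // Only this last surviving block performs the fixup, walking the output
+  // in 1024-element strides, hence the 1024 * 1024 size limit noted above.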
+ int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } +} + +#endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h new file mode 100644 index 0000000000000..ce2acf2006090 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -0,0 +1,266 @@ +#ifndef HeterogeneousCoreCUDAUtilities_radixSort_H +#define HeterogeneousCoreCUDAUtilities_radixSort_H + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +template +__device__ inline void dummyReorder(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + +template +__device__ inline void reorderSigned(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + auto ii = first; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... + + int32_t first = threadIdx.x; + __shared__ uint32_t firstNeg; + firstNeg = a[ind[0]] < 0 ? 0 : size; + __syncthreads(); + + // find first negative + for (auto i = first; i < size - 1; i += blockDim.x) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + } + + __syncthreads(); + + int ii = size - firstNeg - threadIdx.x - 1; + for (auto i = firstNeg + threadIdx.x; i < size; i += blockDim.x) { + ind2[ii] = ind[i]; + ii -= blockDim.x; + } + __syncthreads(); + ii = size - firstNeg + threadIdx.x; + assert(ii >= 0); + for (auto i = first; i < firstNeg; i += blockDim.x) { + ind2[ii] = ind[i]; + ii += blockDim.x; + } + __syncthreads(); + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; +} + +template +__device__ void __forceinline__ +radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; + + __shared__ int32_t c[sb], ct[sb], cu[sb]; + + __shared__ int ibs; + __shared__ int p; + + assert(size > 0); + assert(blockDim.x >= sb); + + // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; + + p = ps; + + auto j = ind; + auto k = ind2; + + int32_t first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + j[i] = i; + __syncthreads(); + + while (__syncthreads_and(p < w / d)) { + if (threadIdx.x < sb) + c[threadIdx.x] = 0; + __syncthreads(); + + // fill bins + for (auto i = first; i < size; i += blockDim.x) { + auto bin = (a[j[i]] >> d * p) & (sb - 1); + atomicAdd(&c[bin], 1); + } + __syncthreads(); + + // prefix scan "optimized"???... 
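+    // Spelled out: this is an inclusive prefix scan of the sb = 256 bin
+    // counters c[], done in two steps. First each of the 8 warps performs a
+    // Hillis-Steele scan of its own 32 counters with __shfl_up_sync; then
+    // every thread adds the last (lane-31) partial sums ct[ss], ct[ss - 32],
+    // ... of all preceding warps, so that c[threadIdx.x] ends up holding
+    // c[0] + ... + c[threadIdx.x], i.e. the end offset of that bin for this
+    // pass of the radix sort.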
+ if (threadIdx.x < sb) { + auto x = c[threadIdx.x]; + auto laneId = threadIdx.x & 0x1f; +#pragma unroll + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(0xffffffff, x, offset); + if (laneId >= offset) + x += y; + } + ct[threadIdx.x] = x; + } + __syncthreads(); + if (threadIdx.x < sb) { + auto ss = (threadIdx.x / 32) * 32 - 1; + c[threadIdx.x] = ct[threadIdx.x]; + for (int i = ss; i > 0; i -= 32) + c[threadIdx.x] += ct[i]; + } + /* + //prefix scan for the nulls (for documentation) + if (threadIdx.x==0) + for (int i = 1; i < sb; ++i) c[i] += c[i-1]; + */ + + // broadcast + ibs = size - 1; + __syncthreads(); + while (__syncthreads_and(ibs > 0)) { + int i = ibs - threadIdx.x; + if (threadIdx.x < sb) { + cu[threadIdx.x] = -1; + ct[threadIdx.x] = -1; + } + __syncthreads(); + int32_t bin = -1; + if (threadIdx.x < sb) { + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[threadIdx.x] = bin; + atomicMax(&cu[bin], int(i)); + } + } + __syncthreads(); + if (threadIdx.x < sb) { + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = threadIdx.x; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - threadIdx.x; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + } + __syncthreads(); + if (bin >= 0) + assert(c[bin] >= 0); + if (threadIdx.x == 0) + ibs -= sb; + __syncthreads(); + } + + /* + // broadcast for the nulls (for documentation) + if (threadIdx.x==0) + for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { + auto bin = (a[j[i]] >> d*p)&(sb-1); + auto ik = atomicSub(&c[bin],1); + k[ik-1] = j[i]; + } + */ + + __syncthreads(); + assert(c[0] == 0); + + // swap (local, ok) + auto t = j; + j = k; + k = t; + + if (threadIdx.x == 0) + ++p; + __syncthreads(); + } + + if ((w != 8) && (0 == (NS & 1))) + assert(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + for (auto i = first; i < size; i += blockDim.x) + ind[i] = ind2[i]; + + __syncthreads(); + + // now move negative first... (if signed) + reorder(a, ind, ind2, size); +} + +template ::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, dummyReorder); +} + +template ::value && std::is_signed::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(a, ind, ind2, size, reorderSigned); +} + +template ::value, T>::type* = nullptr> +__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + using I = int; + radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); +} + +template +__device__ void __forceinline__ +radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + extern __shared__ uint16_t ws[]; + + auto a = v + offsets[blockIdx.x]; + auto ind = index + offsets[blockIdx.x]; + auto ind2 = nullptr == workspace ? 
ws : workspace + offsets[blockIdx.x]; + auto size = offsets[blockIdx.x + 1] - offsets[blockIdx.x]; + assert(offsets[blockIdx.x + 1] >= offsets[blockIdx.x]); + if (size > 0) + radixSort(a, ind, ind2, size); +} + +template +__global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); +} + +template +__global__ void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); +} + +#endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc new file mode 100644 index 0000000000000..1aad065963ff2 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc @@ -0,0 +1,15 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cudaCompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; +} // namespace cudaCompat + +namespace { + struct InitGrid { + InitGrid() { cudaCompat::resetGrid(); } + }; + + const InitGrid initGrid; + +} // namespace diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu new file mode 100644 index 0000000000000..1c28758fb970e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -0,0 +1,65 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +__global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto m = i % 11; + m = m % 6 + 1; // max 6, no 0 + auto c = dc->add(m); + assert(c.m < n); + ind[c.m] = c.n; + for (int j = c.n; j < c.n + m; ++j) + cont[j] = i; +}; + +__global__ void finalize(AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { + assert(dc->get().m == n); + ind[n] = dc->get().n; +} + +__global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + assert(0 == ind[0]); + assert(dc->get().m == n); + assert(ind[n] == dc->get().n); + auto ib = ind[i]; + auto ie = ind[i + 1]; + auto k = cont[ib++]; + assert(k < n); + for (; ib < ie; ++ib) + assert(cont[ib] == k); +} + +#include +int main() { + AtomicPairCounter *dc_d; + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + + std::cout << "size " << sizeof(AtomicPairCounter) << std::endl; + + constexpr uint32_t N = 20000; + constexpr uint32_t M = N * 6; + uint32_t *n_d, *m_d; + cudaCheck(cudaMalloc(&n_d, N * sizeof(int))); + // cudaMemset(n_d, 0, N*sizeof(int)); + cudaCheck(cudaMalloc(&m_d, M * sizeof(int))); + + update<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + finalize<<<1, 1>>>(dc_d, n_d, m_d, 10000); + verify<<<2000, 512>>>(dc_d, n_d, m_d, 10000); + + AtomicPairCounter dc; + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + std::cout << dc.get().n << ' ' << dc.get().m << std::endl; + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp 
b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp new file mode 100644 index 0000000000000..cc5541f58ad60 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' + << Hist4::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; + for (auto nh = 0; nh < 4; ++nh) + std::cout << "bins " << int(Hist4::bin(0)) + Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin)) + Hist4::histOff(nh) + << ' ' << int(Hist::bin(rmax)) + Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + h.zero(); + h4.zero(); + assert(h.size() == 0); + assert(h4.size() == 0); + for (long long j = 0; j < N; j++) { + h.count(v[j]); + if (j < 2000) + h4.count(v[j], 2); + else + h4.count(v[j], j % 4); + } + assert(h.size() == 0); + assert(h4.size() == 0); + h.finalize(); + h4.finalize(); + assert(h.size() == N); + assert(h4.size() == N); + for (long long j = 0; j < N; j++) { + h.fill(v[j], j); + if (j < 2000) + h4.fill(v[j], j, 2); + else + h4.fill(v[j], j, j % 4); + } + assert(h.off[0] == 0); + assert(h4.off[0] == 0); + assert(h.size() == N); + assert(h4.size() == N); + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert((int32_t)t1 < N); + assert((int32_t)t2 < N); + if (i != j && T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + if (0 == h.size(i)) + continue; + auto k = *h.begin(i); + assert(k < N); + auto kl = NBINS != 128 ? h.bin(std::max(rmin, v[k] - DELTA)) : h.bin(v[k] - T(DELTA)); + auto kh = NBINS != 128 ? 
h.bin(std::min(rmax, v[k] + DELTA)) : h.bin(v[k] + T(DELTA)); + if (NBINS == 128) { + assert(kl != i); + assert(kh != i); + } + if (NBINS != 128) { + assert(kl <= i); + assert(kh >= i); + } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j = h.begin(kl); j < h.end(kl); ++j) + verify(i, kl, k, k, (*j)); + for (auto j = h.begin(kh); j < h.end(kh); ++j) + verify(i, kh, k, (*j), k); + } + } + + for (long long j = 0; j < N; j++) { + auto b0 = h.bin(v[j]); + int w = 0; + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + tot++; + }; + forEachInBins(h, v[j], w, ftest); + int rtot = h.end(b0) - h.begin(b0); + assert(tot == rtot); + w = 1; + tot = 0; + forEachInBins(h, v[j], w, ftest); + int bp = b0 + 1; + int bm = b0 - 1; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + w = 2; + tot = 0; + forEachInBins(h, v[j], w, ftest); + bp++; + bm--; + if (bp < int(h.nbins())) + rtot += h.end(bp) - h.begin(bp); + if (bm >= 0) + rtot += h.end(bm) - h.begin(bm); + assert(tot == rtot); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu new file mode 100644 index 0000000000000..aa7eb245c350d --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +void go() { + std::mt19937 eng; + std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); + + constexpr int N = 12000; + T v[N]; + auto v_d = cms::cuda::make_device_unique(N, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + constexpr uint32_t nParts = 10; + constexpr uint32_t partSize = N / nParts; + uint32_t offsets[nParts + 1]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' + << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; + + Hist h; + auto h_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Hist::wsSize(), nullptr); + + auto off_d = cms::cuda::make_device_unique(nParts + 1, nullptr); + + for (int it = 0; it < 5; ++it) { + offsets[0] = 0; + for (uint32_t j = 1; j < nParts + 1; ++j) { + offsets[j] = offsets[j - 1] + partSize - 3 * j; + assert(offsets[j] <= N); + } + + if (it == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (nParts + 1), cudaMemcpyHostToDevice)); + + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + + if (it == 2) { // big bin + for (long long j = 1000; j < 2000; j++) + v[j] = sizeof(T) == 1 ? 
22 : 3456; + } + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + + cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0); + cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost)); + assert(0 == h.off[0]); + assert(offsets[10] == h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1 < N); + assert(t2 < N); + if (T(v[t1] - v[t2]) <= 0) + std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl; + }; + + auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); }; + + // make sure it spans 3 bins... + auto window = T(1300); + + for (uint32_t j = 0; j < nParts; ++j) { + auto off = Hist::histOff(j); + for (uint32_t i = 0; i < Hist::nbins(); ++i) { + auto ii = i + off; + if (0 == h.size(ii)) + continue; + auto k = *h.begin(ii); + if (j % 2) + k = *(h.begin(ii) + (h.end(ii) - h.begin(ii)) / 2); + auto bk = h.bin(v[k]); + assert(bk == i); + assert(k < offsets[j + 1]); + auto kl = h.bin(v[k] - window); + auto kh = h.bin(v[k] + window); + assert(kl != i); + assert(kh != i); + // std::cout << kl << ' ' << kh << std::endl; + + auto me = v[k]; + auto tot = 0; + auto nm = 0; + bool l = true; + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { + if (kk != kl && kk != kh) + nm += h.size(kk + off); + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) { + if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) { + } else { + ++tot; + } + } + if (kk == i) { + l = false; + continue; + } + if (l) + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, k, (*p)); + else + for (auto p = h.begin(kk + off); p < h.end(kk + off); ++p) + verify(i, k, (*p), k); + } + if (!(tot >= nm)) { + std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + } + if (l) + std::cout << "what? 
" << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/' + << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm + << std::endl; + assert(!l); + } + } + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu new file mode 100644 index 0000000000000..020d69268a420 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +template +__global__ void mykernel(T const* __restrict__ v, uint32_t N) { + assert(v); + assert(N == 12000); + + if (threadIdx.x == 0) + printf("start kernel for %d data\n", N); + + using Hist = HistoContainer; + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[32]; + + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.count(v[j]); + __syncthreads(); + + assert(0 == hist.size()); + __syncthreads(); + + hist.finalize(ws); + __syncthreads(); + + assert(N == hist.size()); + for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) + assert(hist.off[j] <= hist.off[j + 1]); + __syncthreads(); + + if (threadIdx.x < 32) + ws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + + for (auto j = threadIdx.x; j < N; j += blockDim.x) + hist.fill(v[j], j); + __syncthreads(); + assert(0 == hist.off[0]); + assert(N == hist.size()); + + for (auto j = threadIdx.x; j < hist.size() - 1; j += blockDim.x) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + } + + for (auto i = threadIdx.x; i < hist.size(); i += blockDim.x) { + auto p = hist.begin() + i; + auto j = *p; + auto b0 = Hist::bin(v[j]); + int tot = 0; + auto ftest = [&](int k) { + assert(k >= 0 && k < N); + ++tot; + }; + forEachInWindow(hist, v[j], v[j], ftest); + int rtot = hist.size(b0); + assert(tot == rtot); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + forEachInWindow(hist, vm, vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); + } +} + +template +void go() { + std::mt19937 eng; + + int rmin = std::numeric_limits::min(); + int rmax = std::numeric_limits::max(); + if (NBINS != 128) { + rmin = 0; + rmax = NBINS * 2 - 1; + } + + std::uniform_int_distribution rgen(rmin, rmax); + + constexpr int N = 12000; + T v[N]; + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' + << (rmax - rmin) / Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it = 0; it < 5; ++it) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + if (it == 2) + for (long long j = N / 2; j < N / 2 + N / 4; j++) + v[j] = 4; + + assert(v_d.get()); + assert(v); + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + assert(v_d.get()); + cms::cuda::launch(mykernel, {1, 256}, v_d.get(), N); + } +} + +int main() { + cms::cudatest::requireDevices(); + + go(); + go(); + go(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cpp @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu new file mode 100644 index 0000000000000..3d452e851ba12 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.cu @@ -0,0 +1 @@ +#include "OneToManyAssoc_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h new file mode 100644 index 0000000000000..92bb359115a02 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h @@ -0,0 +1,308 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" + +#endif + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +constexpr uint32_t MaxElem = 64000; +constexpr uint32_t MaxTk = 8000; +constexpr uint32_t MaxAssocs = 4 * MaxTk; +using Assoc = OneToManyAssoc; + +using SmallAssoc = OneToManyAssoc; + +using Multiplicity = OneToManyAssoc; + +using TK = std::array; + +__global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) { + __shared__ Multiplicity::CountersOnly local; + if (threadIdx.x == 0) + local.zero(); + __syncthreads(); + local.countDirect(2 + i % 4); + __syncthreads(); + if (threadIdx.x == 0) + assoc->add(local); + } +} + +__global__ void countMulti(TK const* __restrict__ tk, 
Multiplicity* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < n; i += gridDim.x * blockDim.x) + assoc->countDirect(2 + i % 4); +} + +__global__ void verifyMulti(Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) { + auto first = blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < Multiplicity::totbins(); i += gridDim.x * blockDim.x) + assert(m1->off[i] == m2->off[i]); +} + +__global__ void count(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->countDirect(tk[k][j]); + } +} + +__global__ void fill(TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < 4 * n; i += gridDim.x * blockDim.x) { + auto k = i / 4; + auto j = i - 4 * k; + assert(j < 4); + if (k >= n) + return; + if (tk[k][j] < MaxElem) + assoc->fillDirect(tk[k][j], k); + } +} + +__global__ void verify(Assoc* __restrict__ assoc) { assert(assoc->size() < Assoc::capacity()); } + +template +__global__ void fillBulk(AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, int32_t n) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int k = first; k < n; k += gridDim.x * blockDim.x) { + auto m = tk[k][3] < MaxElem ? 4 : 3; + assoc->bulkFill(*apc, &tk[k][0], m); + } +} + +template +__global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) { + if (apc->get().m >= Assoc::nbins()) + printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); + assert(assoc->size() < Assoc::capacity()); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + auto current_device = cms::cuda::currentDevice(); +#else + // make sure cuda emulation is working + std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl; + std::cout << "cuda y's " << threadIdx.y << ' ' << blockIdx.y << ' ' << blockDim.y << ' ' << gridDim.y << std::endl; + std::cout << "cuda z's " << threadIdx.z << ' ' << blockIdx.z << ' ' << blockDim.z << ' ' << gridDim.z << std::endl; + assert(threadIdx.x == 0); + assert(threadIdx.y == 0); + assert(threadIdx.z == 0); + assert(blockIdx.x == 0); + assert(blockIdx.y == 0); + assert(blockIdx.z == 0); + assert(blockDim.x == 1); + assert(blockDim.y == 1); + assert(blockDim.z == 1); + assert(gridDim.x == 1); + assert(gridDim.y == 1); + assert(gridDim.z == 1); +#endif + + std::cout << "OneToManyAssoc " << Assoc::nbins() << ' ' << Assoc::capacity() << ' ' << Assoc::wsSize() << std::endl; + std::cout << "OneToManyAssoc (small) " << SmallAssoc::nbins() << ' ' << SmallAssoc::capacity() << ' ' + << SmallAssoc::wsSize() << std::endl; + + std::mt19937 eng; + + std::geometric_distribution rdm(0.8); + + constexpr uint32_t N = 4000; + + std::vector> tr(N); + + // fill with "index" to element + long long ave = 0; + int imax = 0; + auto n = 0U; + auto z = 0U; + auto nz = 0U; + for (auto i = 0U; i < 4U; ++i) { + auto j = 0U; + while (j < N && n < MaxElem) { + if (z == 11) { + ++n; + z = 0; + ++nz; + continue; + } // a bit of not assoc + auto x = rdm(eng); + auto k = std::min(j + x + 1, N); + if (i == 3 && z == 3) { // some triplets time to time + for (; j < k; ++j) + tr[j][i] = MaxElem + 1; + } else { + ave += 
x + 1; + imax = std::max(imax, x); + for (; j < k; ++j) + tr[j][i] = n; + ++n; + } + ++z; + } + assert(n <= MaxElem); + assert(j <= N); + } + std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl; + +#ifdef __CUDACC__ + auto v_d = cms::cuda::make_device_unique[]>(N, nullptr); + assert(v_d.get()); + auto a_d = cms::cuda::make_device_unique(1, nullptr); + auto sa_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array), cudaMemcpyHostToDevice)); +#else + auto a_d = std::make_unique(); + auto sa_d = std::make_unique(); + auto v_d = tr.data(); +#endif + + cms::cuda::launchZero(a_d.get(), 0); + +#ifdef __CUDACC__ + auto nThreads = 256; + auto nBlocks = (4 * N + nThreads - 1) / nThreads; + + count<<>>(v_d.get(), a_d.get(), N); + + cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0); + verify<<<1, 1>>>(a_d.get()); + fill<<>>(v_d.get(), a_d.get(), N); +#else + count(v_d, a_d.get(), N); + cms::cuda::launchFinalize(a_d.get()); + verify(a_d.get()); + fill(v_d, a_d.get(), N); +#endif + + Assoc la; + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); +#else + memcpy(&la, a_d.get(), sizeof(Assoc)); // not required, easier +#endif + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + z = 0; + for (auto i = 0U; i < n; ++i) { + auto x = la.size(i); + if (x == 0) { + z++; + continue; + } + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(n)); + std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; + + // now the inverse map (actually this is the direct....) 
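+  // What fillBulk exercises below, in a nutshell: for each track k it calls
+  //   assoc->bulkFill(*apc, &tk[k][0], m);
+  // which reserves one bin plus m content slots in one operation on the
+  // AtomicPairCounter (bin index in .m, first content slot in .n), so every
+  // track's element list lands in a contiguous range without a separate
+  // counting pass; finalizeBulk afterwards fixes up the trailing offsets.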
+ AtomicPairCounter* dc_d; + AtomicPairCounter dc(0); + +#ifdef __CUDACC__ + cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + nBlocks = (N + nThreads - 1) / nThreads; + fillBulk<<>>(dc_d, v_d.get(), a_d.get(), N); + cms::cuda::finalizeBulk<<>>(dc_d, a_d.get()); + verifyBulk<<<1, 1>>>(a_d.get(), dc_d); + + cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&dc, dc_d, sizeof(AtomicPairCounter), cudaMemcpyDeviceToHost)); + + cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); + fillBulk<<>>(dc_d, v_d.get(), sa_d.get(), N); + cms::cuda::finalizeBulk<<>>(dc_d, sa_d.get()); + verifyBulk<<<1, 1>>>(sa_d.get(), dc_d); + +#else + dc_d = &dc; + fillBulk(dc_d, v_d, a_d.get(), N); + cms::cuda::finalizeBulk(dc_d, a_d.get()); + verifyBulk(a_d.get(), dc_d); + memcpy(&la, a_d.get(), sizeof(Assoc)); + + AtomicPairCounter sdc(0); + fillBulk(&sdc, v_d, sa_d.get(), N); + cms::cuda::finalizeBulk(&sdc, sa_d.get()); + verifyBulk(sa_d.get(), &sdc); + +#endif + + std::cout << "final counter value " << dc.get().n << ' ' << dc.get().m << std::endl; + + std::cout << la.size() << std::endl; + imax = 0; + ave = 0; + for (auto i = 0U; i < N; ++i) { + auto x = la.size(i); + if (!(x == 4 || x == 3)) + std::cout << i << ' ' << x << std::endl; + assert(x == 4 || x == 3); + ave += x; + imax = std::max(imax, int(x)); + } + assert(0 == la.size(N)); + std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl; + + // here verify use of block local counters +#ifdef __CUDACC__ + auto m1_d = cms::cuda::make_device_unique(1, nullptr); + auto m2_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto m1_d = std::make_unique(); + auto m2_d = std::make_unique(); +#endif + cms::cuda::launchZero(m1_d.get(), 0); + cms::cuda::launchZero(m2_d.get(), 0); + +#ifdef __CUDACC__ + nBlocks = (4 * N + nThreads - 1) / nThreads; + countMulti<<>>(v_d.get(), m1_d.get(), N); + countMultiLocal<<>>(v_d.get(), m2_d.get(), N); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0); + cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0); + verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get()); + + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); +#else + countMulti(v_d, m1_d.get(), N); + countMultiLocal(v_d, m2_d.get(), N); + verifyMulti(m1_d.get(), m2_d.get()); + + cms::cuda::launchFinalize(m1_d.get()); + cms::cuda::launchFinalize(m2_d.get()); + verifyMulti(m1_d.get(), m2_d.get()); +#endif + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp new file mode 100644 index 0000000000000..f453041b85b38 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cpp @@ -0,0 +1,34 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include +#include +#include +#include + +void testBinaryFind() { + std::vector data = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data.begin(), data.end(), 4); + auto upper = cuda_std::upper_bound(data.begin(), data.end(), 4); + + std::copy(lower, upper, std::ostream_iterator(std::cout, " ")); + + std::cout << '\n'; + + // classic binary search, returning a value only if it is present + + data = {1, 2, 4, 6, 9, 10}; + + auto test = [&](auto v) { + auto it = cuda_std::binary_find(data.cbegin(), 
data.cend(), v); + + if (it != data.cend()) + std::cout << *it << " found at index " << std::distance(data.cbegin(), it) << std::endl; + else + std::cout << v << " non found" << std::endl; + }; + + test(4); + test(5); +} + +int main() { testBinaryFind(); } diff --git a/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu new file mode 100644 index 0000000000000..7b8ed6219392e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/cudastdAlgorithm_t.cu @@ -0,0 +1,30 @@ +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void testBinaryFind() { + int data[] = {1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6}; + + auto lower = cuda_std::lower_bound(data, data + 13, 4); + auto upper = cuda_std::upper_bound(data, data + 12, 4); + + assert(3 == upper - lower); + + // classic binary search, returning a value only if it is present + + constexpr int data2[] = {1, 2, 4, 6, 9, 10}; + + assert(data2 + 2 == cuda_std::binary_find(data2, data2 + 6, 4)); + assert(data2 + 6 == cuda_std::binary_find(data2, data2 + 6, 5)); +} + +void wrapper() { cms::cuda::launch(testBinaryFind, {32, 64}); } + +int main() { + cms::cudatest::requireDevices(); + + wrapper(); +} diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cpp @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu new file mode 100644 index 0000000000000..c5200e2a87bee --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.cu @@ -0,0 +1 @@ +#include "eigenSoA_t.h" diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h new file mode 100644 index 0000000000000..e9fef92e68083 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -0,0 +1,101 @@ +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct MySoA { + // we can find a way to avoid this copy/paste??? + static constexpr int32_t stride() { return S; } + + eigenSoA::ScalarSoA a; + eigenSoA::ScalarSoA b; +}; + +using V = MySoA<128>; + +__global__ void testBasicSoA(float* p) { + using namespace eigenSoA; + + assert(!isPowerOf2(0)); + assert(isPowerOf2(1)); + assert(isPowerOf2(1024)); + assert(!isPowerOf2(1026)); + + using M3 = Eigen::Matrix; + ; + __shared__ eigenSoA::MatrixSoA m; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + if (0 == first) + printf("before %f\n", p[0]); + + // a silly game... 
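+  // (the game, spelled out: thread i writes p[i], p[i + 64], p[i + 128] on
+  // the diagonal of m[i], inverts the diagonal matrix in place, then reads
+  // the diagonal back from the mirrored entry m[63 - i]; inverting a
+  // diagonal matrix just reciprocates the diagonal, and every input lies in
+  // (0.01, 0.99), so each output is > 1 -- exactly what main() asserts
+  // after the round trip)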
+ int n = 64; + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + m[i].setZero(); + m[i](0, 0) = p[i]; + m[i](1, 1) = p[i + 64]; + m[i](2, 2) = p[i + 64 * 2]; + } + __syncthreads(); // not needed + + for (int i = first; i < n; i += blockDim.x * gridDim.x) + m[i] = m[i].inverse().eval(); + __syncthreads(); + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + p[i] = m[63 - i](0, 0); + p[i + 64] = m[63 - i](1, 1); + p[i + 64 * 2] = m[63 - i](2, 2); + } + + if (0 == first) + printf("after %f\n", p[0]); +} + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); +#endif + + float p[1024]; + + std::uniform_real_distribution rgen(0.01, 0.99); + std::mt19937 eng; + + for (auto& r : p) + r = rgen(eng); + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 0 && p[i] < 1.); + + std::cout << p[0] << std::endl; +#ifdef __CUDACC__ + float* p_d; + cudaCheck(cudaMalloc(&p_d, 1024 * 4)); + cudaCheck(cudaMemcpy(p_d, p, 1024 * 4, cudaMemcpyDefault)); + testBasicSoA<<<1, 1024>>>(p_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(p, p_d, 1024 * 4, cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testBasicSoA(p); +#endif + + std::cout << p[0] << std::endl; + + for (int i = 0, n = 64 * 3; i < n; ++i) + assert(p[i] > 1.); + + std::cout << "END" << std::endl; + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..8bbaf4d066abc --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,165 @@ +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +template +struct format_traits { +public: + static const constexpr char* failed_msg = "failed %d %d %d: %d %d\n"; +}; + +template<> +struct format_traits { +public: + static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n"; +}; + +template +__global__ void testPrefixScan(uint32_t size) { + __shared__ T ws[32]; + __shared__ T c[1024]; + __shared__ T co[1024]; + + auto first = threadIdx.x; + for (auto i = first; i < size; i += blockDim.x) + c[i] = 1; + __syncthreads(); + + blockPrefixScan(c, co, size, ws); + blockPrefixScan(c, size, ws); + + assert(1 == c[0]); + assert(1 == co[0]); + for (auto i = first + 1; i < size; i += blockDim.x) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +template +__global__ void testWarpPrefixScan(uint32_t size) { + assert(size <= 32); + __shared__ T c[1024]; + __shared__ T co[1024]; + auto i = threadIdx.x; + c[i] = 1; + __syncthreads(); + + warpPrefixScan(c, co, i, 0xffffffff); + warpPrefixScan(c, i, 0xffffffff); + __syncthreads(); + + assert(1 == c[0]); + assert(1 == co[0]); + if (i != 0) { + if (c[i] != c[i - 1] + 1) + printf(format_traits::failed_msg, size, i, blockDim.x, c[i], c[i - 1]); + assert(c[i] == c[i - 1] + 1); + assert(c[i] == i + 1); + assert(c[i] = co[i]); + } +} + +__global__ void init(uint32_t *v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < 
n) + v[i] = val; + if (i == 0) + printf("init\n"); +} + +__global__ void verify(uint32_t const *v, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + assert(v[i] == i + 1); + if (i == 0) + printf("verify\n"); +} + +int main() { + cms::cudatest::requireDevices(); + + std::cout << "warp level" << std::endl; + // std::cout << "warp 32" << std::endl; + testWarpPrefixScan<<<1, 32>>>(32); + cudaDeviceSynchronize(); + // std::cout << "warp 16" << std::endl; + testWarpPrefixScan<<<1, 32>>>(16); + cudaDeviceSynchronize(); + // std::cout << "warp 5" << std::endl; + testWarpPrefixScan<<<1, 32>>>(5); + cudaDeviceSynchronize(); + + std::cout << "block level" << std::endl; + for (int bs = 32; bs <= 1024; bs += 32) { + // std::cout << "bs " << bs << std::endl; + for (int j = 1; j <= 1024; ++j) { + // std::cout << j << std::endl; + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + testPrefixScan<<<1, bs>>>(j); + cudaDeviceSynchronize(); + } + } + cudaDeviceSynchronize(); + + int num_items = 200; + for (int ksize = 1; ksize < 4; ++ksize) { + // test multiblock + std::cout << "multiblok" << std::endl; + // Declare, allocate, and initialize device-accessible pointers for input and output + num_items *= 10; + uint32_t *d_in; + uint32_t *d_out1; + uint32_t *d_out2; + + cudaCheck(cudaMalloc(&d_in, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out1, num_items * sizeof(uint32_t))); + cudaCheck(cudaMalloc(&d_out2, num_items * sizeof(uint32_t))); + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // the block counter + int32_t *d_pc; + cudaCheck(cudaMalloc(&d_pc, sizeof(int32_t))); + cudaCheck(cudaMemset(d_pc, 0, 4)); + + nthreads = 1024; + nblocks = (num_items + nthreads - 1) / nthreads; + multiBlockPrefixScan<<>>(d_in, d_out1, num_items, d_pc); + verify<<>>(d_out1, num_items); + cudaDeviceSynchronize(); + + // test cub + std::cout << "cub" << std::endl; + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *= 8; + cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out2, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out2, num_items); + cudaDeviceSynchronize(); + } // ksize + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu new file mode 100644 index 0000000000000..febdb9c92b0a7 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + +template +struct RS 
{ + using type = std::uniform_int_distribution; + static auto ud() { return type(std::numeric_limits::min(), std::numeric_limits::max()); } + static constexpr T imax = std::numeric_limits::max(); +}; + +template <> +struct RS { + using T = float; + using type = std::uniform_real_distribution; + static auto ud() { return type(-std::numeric_limits::max() / 2, std::numeric_limits::max() / 2); } + // static auto ud() { return type(0,std::numeric_limits::max()/2);} + static constexpr int imax = std::numeric_limits::max(); +}; + +template +void go(bool useShared) { + std::mt19937 eng; + //std::mt19937 eng2; + auto rgen = RS::ud(); + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + + constexpr int blocks = 10; + constexpr int blockSize = 256 * 32; + constexpr int N = blockSize * blocks; + T v[N]; + uint16_t ind[N]; + + constexpr bool sgn = T(-1) < T(0); + std::cout << "Will sort " << N << (sgn ? " signed" : " unsigned") + << (std::numeric_limits::is_integer ? " 'ints'" : " 'float'") << " of size " << sizeof(T) << " using " + << NS << " significant bytes" << std::endl; + + for (int i = 0; i < 50; ++i) { + if (i == 49) { + for (long long j = 0; j < N; j++) + v[j] = 0; + } else if (i > 30) { + for (long long j = 0; j < N; j++) + v[j] = rgen(eng); + } else { + uint64_t imax = (i < 15) ? uint64_t(RS::imax) + 1LL : 255; + for (uint64_t j = 0; j < N; j++) { + v[j] = (j % imax); + if (j % 2 && i % 2) + v[j] = -v[j]; + } + } + + uint32_t offsets[blocks + 1]; + offsets[0] = 0; + for (int j = 1; j < blocks + 1; ++j) { + offsets[j] = offsets[j - 1] + blockSize - 3 * j; + assert(offsets[j] <= N); + } + + if (i == 1) { // special cases... + offsets[0] = 0; + offsets[1] = 0; + offsets[2] = 19; + offsets[3] = 32 + offsets[2]; + offsets[4] = 123 + offsets[3]; + offsets[5] = 256 + offsets[4]; + offsets[6] = 311 + offsets[5]; + offsets[7] = 2111 + offsets[6]; + offsets[8] = 256 * 11 + offsets[7]; + offsets[9] = 44 + offsets[8]; + offsets[10] = 3297 + offsets[9]; + } + + std::random_shuffle(v, v + N); + + auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto ind_d = cms::cuda::make_device_unique(N, nullptr); + auto ws_d = cms::cuda::make_device_unique(N, nullptr); + auto off_d = cms::cuda::make_device_unique(blocks + 1, nullptr); + + cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice)); + + if (i < 2) + std::cout << "lauch for " << offsets[blocks] << std::endl; + + auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 
256 : 256; + + delta -= (std::chrono::high_resolution_clock::now() - start); + constexpr int MaxSize = 256 * 32; + if (useShared) + cms::cuda::launch( + radixSortMultiWrapper, {blocks, ntXBl, MaxSize * 2}, v_d.get(), ind_d.get(), off_d.get(), nullptr); + else + cms::cuda::launch( + radixSortMultiWrapper2, {blocks, ntXBl}, v_d.get(), ind_d.get(), off_d.get(), ws_d.get()); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + cudaCheck(cudaMemcpy(ind, ind_d.get(), 2 * N, cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (i == 0) + std::cout << "done for " << offsets[blocks] << std::endl; + + if (32 == i) { + std::cout << LL(v[ind[0]]) << ' ' << LL(v[ind[1]]) << ' ' << LL(v[ind[2]]) << std::endl; + std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize - 1000]]) << std::endl; + std::cout << LL(v[ind[blockSize / 2 - 1]]) << ' ' << LL(v[ind[blockSize / 2]]) << ' ' + << LL(v[ind[blockSize / 2 + 1]]) << std::endl; + } + for (int ib = 0; ib < blocks; ++ib) { + std::set inds; + if (offsets[ib + 1] > offsets[ib]) + inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib] + 1; j < offsets[ib + 1]; j++) { + inds.insert(ind[j]); + auto a = v + offsets[ib]; + auto k1 = a[ind[j]]; + auto k2 = a[ind[j - 1]]; + auto sh = sizeof(uint64_t) - NS; + sh *= 8; + auto shorten = [sh](T& t) { + auto k = (uint64_t*)(&t); + *k = (*k >> sh) << sh; + }; + shorten(k1); + shorten(k2); + if (k1 < k2) + std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl; + } + if (!inds.empty()) { + assert(0 == *inds.begin()); + assert(inds.size() - 1 == *inds.rbegin()); + } + if (inds.size() != (offsets[ib + 1] - offsets[ib])) + std::cout << "error " << i << ' ' << ib << ' ' << inds.size() << "!=" << (offsets[ib + 1] - offsets[ib]) + << std::endl; + assert(inds.size() == (offsets[ib + 1] - offsets[ib])); + } + } // 50 times + std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() / 50. 
+ << " ms" << std::endl; +} + +int main() { + cms::cudatest::requireDevices(); + + bool useShared = false; + + std::cout << "using Global memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared = true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + return 0; +} From 9b10f923405831759fcca6110bebbbbc3763f144 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 5 Mar 2020 12:38:25 +0100 Subject: [PATCH 02/15] CUDA does not support C++17 yet, so we define here some of the missing library functions --- .../CUDAUtilities/interface/cuda_cxx17.h | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h new file mode 100644 index 0000000000000..89f131edd941e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h @@ -0,0 +1,63 @@ +#ifndef HeterogeneousCore_CUDAUtilities_cuda_cxx17_h +#define HeterogeneousCore_CUDAUtilities_cuda_cxx17_h + +#include + +// CUDA does not support C++17 yet, so we define here some of the missing library functions +#if __cplusplus <= 201402L + +namespace std { + + // from https://en.cppreference.com/w/cpp/iterator/size + template + constexpr auto size(const C& c) -> decltype(c.size()) { + return c.size(); + } + + template + constexpr std::size_t size(const T (&array)[N]) noexcept { + return N; + } + + // from https://en.cppreference.com/w/cpp/iterator/empty + template + constexpr auto empty(const C& c) -> decltype(c.empty()) { + return c.empty(); + } + + template + constexpr bool empty(const T (&array)[N]) noexcept { + return false; + } + + template + constexpr bool empty(std::initializer_list il) noexcept { + return il.size() == 0; + } + + // from https://en.cppreference.com/w/cpp/iterator/data + template + constexpr auto data(C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr auto data(const C& c) -> decltype(c.data()) { + return c.data(); + } + + template + constexpr T* data(T (&array)[N]) noexcept { + return array; + } + + template + constexpr const E* data(std::initializer_list il) noexcept { + return il.begin(); + } + +} // namespace std + +#endif + +#endif // HeterogeneousCore_CUDAUtilities_cuda_cxx17_h From e4ead5ae0908f5fa3a736677e8223639b07d6e8d Mon Sep 17 00:00:00 2001 From: Felice Date: Thu, 5 Mar 2020 12:43:11 +0100 Subject: [PATCH 03/15] Implement vector-like classes for use on GPUs --- .../CUDAUtilities/interface/GPUSimpleVector.h | 138 ++++++++++++++++++ .../CUDAUtilities/interface/GPUVecArray.h | 104 +++++++++++++ .../test/test_GPUSimpleVector.cu | 83 +++++++++++ 3 files changed, 325 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h new file mode 100644 index 0000000000000..0a8df34b5125e --- /dev/null +++ 
b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -0,0 +1,138 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace GPU { + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace GPU + +#endif // 
HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h new file mode 100644 index 0000000000000..d9ae73d6d4002 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -0,0 +1,104 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace GPU { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + +} // namespace GPU + +#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu new file mode 100644 index 0000000000000..2811e3b34598e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu @@ -0,0 +1,83 @@ +// author: Felice Pantaleo, CERN, 2018 +#include +#include +#include + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + +__global__ void vector_pushback(GPU::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->push_back(index); +} + +__global__ void vector_reset(GPU::SimpleVector *foo) { foo->reset(); } + +__global__ void vector_emplace_back(GPU::SimpleVector *foo) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + foo->emplace_back(index); +} + +int main() { + cms::cudatest::requireDevices(); + + auto maxN = 10000; + GPU::SimpleVector *obj_ptr = nullptr; + GPU::SimpleVector *d_obj_ptr = nullptr; + GPU::SimpleVector *tmp_obj_ptr = nullptr; + int *data_ptr = nullptr; + int *d_data_ptr = nullptr; + + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); + cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); + + auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr); + + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector))); + GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + assert(tmp_obj_ptr->size() == 0); + assert(tmp_obj_ptr->capacity() == static_cast(maxN)); + + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector))); + // ... and copy the object to the device. + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + int numBlocks = 5; + int numThreadsPerBlock = 256; + vector_pushback<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); + vector_reset<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == 0); + + vector_emplace_back<<>>(d_obj_ptr); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + + assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); + + cudaCheck(cudaMemcpy(data_ptr, d_data_ptr, obj_ptr->size() * sizeof(int), cudaMemcpyDefault)); + cudaCheck(cudaFreeHost(obj_ptr)); + cudaCheck(cudaFreeHost(data_ptr)); + cudaCheck(cudaFreeHost(tmp_obj_ptr)); + cudaCheck(cudaFree(d_data_ptr)); + cudaCheck(cudaFree(d_obj_ptr)); + std::cout << "TEST PASSED" << std::endl; + return 0; +} From 020fdd1db6d6dd7c104be60ff28bc8ec6778fa24 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 5 Mar 2020 12:43:29 +0100 Subject: [PATCH 04/15] Update BuildFile --- .../CUDAUtilities/test/BuildFile.xml | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index ae6835126f9cf..60961d42999d4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -8,7 +8,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 35ffe1f9b33e9948d889e5f10ca000f341891793 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:07:31 +0100 Subject: [PATCH 05/15] Fix AtomicPairCounter unit test --- HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu index 1c28758fb970e..92806c91337d8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -1,7 +1,7 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h" - +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" __global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { auto i = blockIdx.x * blockDim.x + threadIdx.x; @@ -39,6 +39,8 @@ __global__ void verify(AtomicPairCounter const *dc, uint32_t const *ind, uint32_ #include int main() { + cms::cudatest::requireDevices(); + AtomicPairCounter *dc_d; cudaCheck(cudaMalloc(&dc_d, sizeof(AtomicPairCounter))); cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter))); From bfc0df7596871073706cafcb5279154c788983e8 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:21:29 +0100 Subject: [PATCH 06/15] Rename the cudaCompat namespace to cms::cudacompat --- .../CUDAUtilities/interface/cudaCompat.h | 124 +++++++++--------- .../CUDAUtilities/src/cudaCompat.cc | 12 +- 2 files changed, 70 insertions(+), 66 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h index 32c7d7b481a50..593821fe805ed 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h @@ -13,69 +13,71 @@ #include -namespace cudaCompat { +namespace cms { + namespace cudacompat { #ifndef __CUDA_RUNTIME_H__ - struct dim3 { - uint32_t x, y, z; - }; + struct dim3 { + uint32_t x, y, z; + }; #endif - const dim3 threadIdx = {0, 0, 0}; - const dim3 blockDim = {1, 1, 1}; - - extern thread_local dim3 blockIdx; - extern thread_local dim3 gridDim; - - template - T1 atomicInc(T1* a, T2 b) { - auto 
ret = *a; - if ((*a) < T1(b)) - (*a)++; - return ret; - } - - template - T1 atomicAdd(T1* a, T2 b) { - auto ret = *a; - (*a) += b; - return ret; - } - - template - T1 atomicSub(T1* a, T2 b) { - auto ret = *a; - (*a) -= b; - return ret; - } - - template - T1 atomicMin(T1* a, T2 b) { - auto ret = *a; - *a = std::min(*a, T1(b)); - return ret; - } - template - T1 atomicMax(T1* a, T2 b) { - auto ret = *a; - *a = std::max(*a, T1(b)); - return ret; - } - - inline void __syncthreads() {} - inline void __threadfence() {} - inline bool __syncthreads_or(bool x) { return x; } - inline bool __syncthreads_and(bool x) { return x; } - template - inline T __ldg(T const* x) { - return *x; - } - - inline void resetGrid() { - blockIdx = {0, 0, 0}; - gridDim = {1, 1, 1}; - } - -} // namespace cudaCompat + const dim3 threadIdx = {0, 0, 0}; + const dim3 blockDim = {1, 1, 1}; + + extern thread_local dim3 blockIdx; + extern thread_local dim3 gridDim; + + template + T1 atomicInc(T1* a, T2 b) { + auto ret = *a; + if ((*a) < T1(b)) + (*a)++; + return ret; + } + + template + T1 atomicAdd(T1* a, T2 b) { + auto ret = *a; + (*a) += b; + return ret; + } + + template + T1 atomicSub(T1* a, T2 b) { + auto ret = *a; + (*a) -= b; + return ret; + } + + template + T1 atomicMin(T1* a, T2 b) { + auto ret = *a; + *a = std::min(*a, T1(b)); + return ret; + } + template + T1 atomicMax(T1* a, T2 b) { + auto ret = *a; + *a = std::max(*a, T1(b)); + return ret; + } + + inline void __syncthreads() {} + inline void __threadfence() {} + inline bool __syncthreads_or(bool x) { return x; } + inline bool __syncthreads_and(bool x) { return x; } + template + inline T __ldg(T const* x) { + return *x; + } + + inline void resetGrid() { + blockIdx = {0, 0, 0}; + gridDim = {1, 1, 1}; + } + + } // namespace cudacompat +} // namespace cms // some not needed as done by cuda runtime... 
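+
+// The net effect: "kernel" code written against threadIdx, blockIdx and the
+// atomic* shims above can also be compiled for the host and run serially, one
+// thread per block, stepping blockIdx by hand (for host compilation the
+// namespace is pulled in at the bottom of this header, so no qualification is
+// needed). A minimal, illustrative host-only sketch, not part of the header
+// itself:
+//
+//   void kernel(int* out) {           // would be __global__ when built by nvcc
+//     atomicAdd(&out[blockIdx.x], 1);
+//   }
+//
+//   int out[4] = {0, 0, 0, 0};
+//   gridDim = {4, 1, 1};              // emulate a 4-block, 1-thread launch
+//   for (uint32_t b = 0; b < gridDim.x; ++b) {
+//     blockIdx = {b, 0, 0};
+//     kernel(out);
+//   }
+//   resetGrid();                      // restore the default 1x1x1 grid
+//   // out is now {1, 1, 1, 1}
+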
#ifndef __CUDA_RUNTIME_H__ @@ -95,7 +97,7 @@ namespace cudaCompat { #endif #ifndef __CUDA_ARCH__ -using namespace cudaCompat; +using namespace cms::cudacompat; #endif #endif diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc index 1aad065963ff2..7b8efda8e3811 100644 --- a/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc +++ b/HeterogeneousCore/CUDAUtilities/src/cudaCompat.cc @@ -1,13 +1,15 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -namespace cudaCompat { - thread_local dim3 blockIdx; - thread_local dim3 gridDim; -} // namespace cudaCompat +namespace cms { + namespace cudacompat { + thread_local dim3 blockIdx; + thread_local dim3 gridDim; + } // namespace cudacompat +} // namespace cms namespace { struct InitGrid { - InitGrid() { cudaCompat::resetGrid(); } + InitGrid() { cms::cudacompat::resetGrid(); } }; const InitGrid initGrid; From 85a0b47749426138065b147c044e9c0268561b67 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:41:53 +0100 Subject: [PATCH 07/15] Remove extra semicolon --- HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h index e9fef92e68083..b19cfc55f919d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h +++ b/HeterogeneousCore/CUDAUtilities/test/eigenSoA_t.h @@ -22,7 +22,7 @@ __global__ void testBasicSoA(float* p) { assert(!isPowerOf2(1026)); using M3 = Eigen::Matrix; - ; + __shared__ eigenSoA::MatrixSoA m; int first = threadIdx.x + blockIdx.x * blockDim.x; From 591a0c305d490968b2ccb2f23c9bfefc02f5e0da Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:35:58 +0100 Subject: [PATCH 08/15] Move SimpleVector and VecArray to the cms::cuda namespace --- .../CUDAUtilities/interface/GPUSimpleVector.h | 138 ----------------- .../CUDAUtilities/interface/GPUVecArray.h | 104 ------------- .../CUDAUtilities/interface/SimpleVector.h | 141 ++++++++++++++++++ .../CUDAUtilities/interface/VecArray.h | 106 +++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 2 +- ...PUSimpleVector.cu => test_SimpleVector.cu} | 32 ++-- 6 files changed, 264 insertions(+), 259 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/VecArray.h rename HeterogeneousCore/CUDAUtilities/test/{test_GPUSimpleVector.cu => test_SimpleVector.cu} (61%) diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h deleted file mode 100644 index 0a8df34b5125e..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h - -// author: Felice Pantaleo, CERN, 2018 - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -namespace GPU { - template - struct SimpleVector { - constexpr SimpleVector() = default; - - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } - - inline constexpr int 
push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - __device__ inline T &back() { return m_data[m_size - 1]; } - - __device__ inline const T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - // thread safe version of resize - __device__ int extend(int size = 1) { - auto previousSize = atomicAdd(&m_size, size); - if (previousSize < m_capacity) { - return previousSize; - } else { - atomicSub(&m_size, size); - return -1; - } - } - - __device__ int shrink(int size = 1) { - auto previousSize = atomicSub(&m_size, size); - if (previousSize >= size) { - return previousSize - size; - } else { - atomicAdd(&m_size, size); - return -1; - } - } - - inline constexpr bool empty() const { return m_size <= 0; } - inline constexpr bool full() const { return m_size >= m_capacity; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T *data) { m_data = data; } - - private: - int m_size; - int m_capacity; - - T *m_data; - }; - - // ownership of m_data stays within the caller - template - SimpleVector make_SimpleVector(int capacity, T *data) { - SimpleVector ret; - ret.construct(capacity, data); - return ret; - } - - // ownership of m_data stays within the caller - template - SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new (mem) SimpleVector(); - ret->construct(capacity, data); - return ret; - } - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUSimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h deleted file mode 100644 index d9ae73d6d4002..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h -#define HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h - -// -// Author: Felice Pantaleo, CERN -// - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -namespace GPU { - - template - class VecArray { - public: - using self = VecArray; - using 
value_t = T; - - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } - - template - constexpr int emplace_back_unsafe(Ts &&... args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - inline constexpr T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } - - // thread-safe version of the vector, when used in a CUDA kernel - __device__ int push_back(const T &element) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - template - __device__ int emplace_back(Ts &&... args) { - auto previousSize = atomicAdd(&m_size, 1); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - atomicSub(&m_size, 1); - return -1; - } - } - - inline constexpr T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } - - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } - inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline static constexpr int capacity() { return maxSize; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } - - private: - T m_data[maxSize]; - - int m_size; - }; - -} // namespace GPU - -#endif // HeterogeneousCore_CUDAUtilities_interface_GPUVecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h new file mode 100644 index 0000000000000..0e3caf72d1023 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h @@ -0,0 +1,141 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h +#define HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h + +// author: Felice Pantaleo, CERN, 2018 + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + struct SimpleVector { + constexpr SimpleVector() = default; + + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + template + constexpr int emplace_back_unsafe(Ts &&... 
args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + __device__ inline T &back() { return m_data[m_size - 1]; } + + __device__ inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + // thread safe version of resize + __device__ int extend(int size = 1) { + auto previousSize = atomicAdd(&m_size, size); + if (previousSize < m_capacity) { + return previousSize; + } else { + atomicSub(&m_size, size); + return -1; + } + } + + __device__ int shrink(int size = 1) { + auto previousSize = atomicSub(&m_size, size); + if (previousSize >= size) { + return previousSize - size; + } else { + atomicAdd(&m_size, size); + return -1; + } + } + + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/VecArray.h b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h new file mode 100644 index 0000000000000..08388dc8d4b3e --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/VecArray.h @@ -0,0 +1,106 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_VecArray_h +#define HeterogeneousCore_CUDAUtilities_interface_VecArray_h + +// +// Author: Felice Pantaleo, CERN +// + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +namespace cms { + namespace cuda { + + template + class VecArray { + public: + using self = VecArray; + using value_t = T; + + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; + } + } + + 
template + constexpr int emplace_back_unsafe(Ts &&... args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; + } + } + + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } + + // thread-safe version of the vector, when used in a CUDA kernel + __device__ int push_back(const T &element) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + template + __device__ int emplace_back(Ts &&... args) { + auto previousSize = atomicAdd(&m_size, 1); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + atomicSub(&m_size, 1); + return -1; + } + } + + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } + + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } + + private: + T m_data[maxSize]; + + int m_size; + }; + + } // namespace cuda +} // namespace cms + +#endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 60961d42999d4..5455860900400 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -8,7 +8,7 @@ - + diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu similarity index 61% rename from HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu rename to HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu index 2811e3b34598e..6ecc89f30f158 100644 --- a/HeterogeneousCore/CUDAUtilities/test/test_GPUSimpleVector.cu +++ b/HeterogeneousCore/CUDAUtilities/test/test_SimpleVector.cu @@ -6,18 +6,18 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -__global__ void vector_pushback(GPU::SimpleVector *foo) { +__global__ void vector_pushback(cms::cuda::SimpleVector *foo) { auto index = threadIdx.x + blockIdx.x * blockDim.x; foo->push_back(index); } -__global__ void vector_reset(GPU::SimpleVector *foo) { foo->reset(); } +__global__ void vector_reset(cms::cuda::SimpleVector *foo) { foo->reset(); } -__global__ void 
vector_emplace_back(GPU::SimpleVector *foo) { +__global__ void vector_emplace_back(cms::cuda::SimpleVector *foo) { auto index = threadIdx.x + blockIdx.x * blockDim.x; foo->emplace_back(index); } @@ -26,26 +26,26 @@ int main() { cms::cudatest::requireDevices(); auto maxN = 10000; - GPU::SimpleVector *obj_ptr = nullptr; - GPU::SimpleVector *d_obj_ptr = nullptr; - GPU::SimpleVector *tmp_obj_ptr = nullptr; + cms::cuda::SimpleVector *obj_ptr = nullptr; + cms::cuda::SimpleVector *d_obj_ptr = nullptr; + cms::cuda::SimpleVector *tmp_obj_ptr = nullptr; int *data_ptr = nullptr; int *d_data_ptr = nullptr; - cudaCheck(cudaMallocHost(&obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMallocHost(&obj_ptr, sizeof(cms::cuda::SimpleVector))); cudaCheck(cudaMallocHost(&data_ptr, maxN * sizeof(int))); cudaCheck(cudaMalloc(&d_data_ptr, maxN * sizeof(int))); - auto v = GPU::make_SimpleVector(obj_ptr, maxN, data_ptr); + auto v = cms::cuda::make_SimpleVector(obj_ptr, maxN, data_ptr); - cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(GPU::SimpleVector))); - GPU::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); + cudaCheck(cudaMallocHost(&tmp_obj_ptr, sizeof(cms::cuda::SimpleVector))); + cms::cuda::make_SimpleVector(tmp_obj_ptr, maxN, d_data_ptr); assert(tmp_obj_ptr->size() == 0); assert(tmp_obj_ptr->capacity() == static_cast(maxN)); - cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(GPU::SimpleVector))); + cudaCheck(cudaMalloc(&d_obj_ptr, sizeof(cms::cuda::SimpleVector))); // ... and copy the object to the device. - cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(d_obj_ptr, tmp_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); int numBlocks = 5; int numThreadsPerBlock = 256; @@ -53,14 +53,14 @@ int main() { cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? numBlocks * numThreadsPerBlock : maxN)); vector_reset<<>>(d_obj_ptr); cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == 0); @@ -68,7 +68,7 @@ int main() { cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); + cudaCheck(cudaMemcpy(obj_ptr, d_obj_ptr, sizeof(cms::cuda::SimpleVector), cudaMemcpyDefault)); assert(obj_ptr->size() == (numBlocks * numThreadsPerBlock < maxN ? 
numBlocks * numThreadsPerBlock : maxN)); From f4dc65051b14516993fa023f623f04d31af58cf6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 12:51:06 +0100 Subject: [PATCH 09/15] Add missing dependency --- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 4a8e644a231f6..1e2de51dc0053 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1,6 +1,7 @@ + From 68de0b3d1028d85f7a591dff33dd2a7c5c693934 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 6 Mar 2020 13:14:55 +0100 Subject: [PATCH 10/15] Move HistoContainer, AtomicPairCounter, prefixScan and radixSort to the cms::cuda namespace --- .../interface/AtomicPairCounter.h | 70 ++-- .../CUDAUtilities/interface/HistoContainer.h | 378 +++++++++--------- .../CUDAUtilities/interface/prefixScan.h | 245 ++++++------ .../CUDAUtilities/interface/radixSort.h | 45 ++- .../CUDAUtilities/test/AtomicPairCounter_t.cu | 2 + .../CUDAUtilities/test/HistoContainer_t.cpp | 2 + .../CUDAUtilities/test/HistoContainer_t.cu | 12 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 6 +- .../CUDAUtilities/test/OneToManyAssoc_t.h | 42 +- .../CUDAUtilities/test/prefixScan_t.cu | 8 +- .../CUDAUtilities/test/radixSort_t.cu | 2 + 11 files changed, 423 insertions(+), 389 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h index 8854b02176260..2e191ce551c42 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h +++ b/HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h @@ -5,48 +5,54 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -class AtomicPairCounter { -public: - using c_type = unsigned long long int; +namespace cms { + namespace cuda { - AtomicPairCounter() {} - AtomicPairCounter(c_type i) { counter.ac = i; } + class AtomicPairCounter { + public: + using c_type = unsigned long long int; - __device__ __host__ AtomicPairCounter& operator=(c_type i) { - counter.ac = i; - return *this; - } + AtomicPairCounter() {} + AtomicPairCounter(c_type i) { counter.ac = i; } - struct Counters { - uint32_t n; // in a "One to Many" association is the number of "One" - uint32_t m; // in a "One to Many" association is the total number of associations - }; + __device__ __host__ AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } - union Atomic2 { - Counters counters; - c_type ac; - }; + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; - static constexpr c_type incr = 1UL << 32; + union Atomic2 { + Counters counters; + c_type ac; + }; - __device__ __host__ Counters get() const { return counter.counters; } + static constexpr c_type incr = 1UL << 32; - // increment n by 1 and m by i. return previous value - __host__ __device__ __forceinline__ Counters add(uint32_t i) { - c_type c = i; - c += incr; - Atomic2 ret; + __device__ __host__ Counters get() const { return counter.counters; } + + // increment n by 1 and m by i. 
return previous value + __host__ __device__ __forceinline__ Counters add(uint32_t i) { + c_type c = i; + c += incr; + Atomic2 ret; #ifdef __CUDA_ARCH__ - ret.ac = atomicAdd(&counter.ac, c); + ret.ac = atomicAdd(&counter.ac, c); #else - ret.ac = counter.ac; - counter.ac += c; + ret.ac = counter.ac; + counter.ac += c; #endif - return ret.counters; - } + return ret.counters; + } + + private: + Atomic2 counter; + }; -private: - Atomic2 counter; -}; + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 67b2b46f45101..88118fcdac277 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -125,216 +125,216 @@ namespace cms { assoc->bulkFinalizeFill(*apc); } - } // namespace cuda -} // namespace cms + // iteratate over N bins left and right of the one containing "v" + template + __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -// iteratate over bins containing all values in window wmin, wmax -template -__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } -} - -template -class HistoContainer { -public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) - if (v & b[i]) { - v >>= s[i]; - r |= s[i]; + // iteratate over bins containing all values in window wmin, wmax + template + __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); } - return r; - } + } - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS; } - static constexpr uint32_t nhists() { return NHISTS; } - static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } - static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + 
constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } - static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - __host__ static size_t wsSize() { + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + + __host__ static size_t wsSize() { #ifdef __CUDACC__ - uint32_t *v = nullptr; - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); - return temp_storage_bytes; + uint32_t *v = nullptr; + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()); + return temp_storage_bytes; #else - return 0; + return 0; #endif - } + } - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1 << nbits()) - 1; - return (t >> shift) & mask; - } + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } - __host__ __device__ void zero() { - for (auto &i : off) - i = 0; - } + __host__ __device__ void zero() { + for (auto &i : off) + i = 0; + } - __host__ __device__ void add(CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { + __host__ __device__ void add(CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { #ifdef __CUDA_ARCH__ - atomicAdd(off + i, co.off[i]); + atomicAdd(off + i, co.off[i]); #else - auto &a = (std::atomic &)(off[i]); - a += co.off[i]; + auto &a = (std::atomic &)(off[i]); + a += co.off[i]; #endif - } - } + } + } - static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); + return atomicAdd(&x, 1); #else - auto &a = (std::atomic &)(x); - return a++; + auto &a = (std::atomic &)(x); + return a++; #endif - } + } - static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { + static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) { #ifdef __CUDA_ARCH__ - return atomicSub(&x, 1); + return atomicSub(&x, 1); #else - auto &a = (std::atomic &)(x); - return a--; + auto &a = (std::atomic &)(x); + return a--; #endif - } - - __host__ __device__ __forceinline__ void countDirect(T b) { - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } - - __device__ __host__ __forceinline__ void 
bulkFinalize(AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } - - __device__ __host__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } - auto first = m + blockDim.x * blockIdx.x + threadIdx.x; - for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { - off[i] = n; - } - } - - __host__ __device__ __forceinline__ void count(T t) { - uint32_t b = bin(t); - assert(b < nbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j) { - uint32_t b = bin(t); - assert(b < nbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - atomicIncrement(off[b]); - } - - __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - assert(b < nbins()); - b += histOff(nh); - assert(b < totbins()); - auto w = atomicDecrement(off[b]); - assert(w > 0); - bins[w - 1] = j; - } - - __device__ __host__ __forceinline__ void finalize(Counter *ws = nullptr) { - assert(off[totbins() - 1] == 0); - blockPrefixScan(off, totbins(), ws); - assert(off[totbins() - 1] == off[totbins() - 2]); - } - - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } - - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } - - Counter off[totbins()]; - index_type bins[capacity()]; -}; - -template -using OneToManyAssoc = HistoContainer; + } + + __host__ __device__ __forceinline__ void countDirect(T b) { + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) { + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } + + __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } + + __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; + if (m >= nbins()) { // overflow! 
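+          // a bulkFill() that overflowed returned a negative index and stored
+          // nothing: freeze the total size at the last valid offset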
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; + } + auto first = m + blockDim.x * blockIdx.x + threadIdx.x; + for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) { + off[i] = n; + } + } + + __host__ __device__ __forceinline__ void count(T t) { + uint32_t b = bin(t); + assert(b < nbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j) { + uint32_t b = bin(t); + assert(b < nbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + atomicIncrement(off[b]); + } + + __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + assert(b < nbins()); + b += histOff(nh); + assert(b < totbins()); + auto w = atomicDecrement(off[b]); + assert(w > 0); + bins[w - 1] = j; + } + + __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) { + assert(off[totbins() - 1] == 0); + blockPrefixScan(off, totbins(), ws); + assert(off[totbins() - 1] == off[totbins() - 2]); + } + + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } + + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + + Counter off[totbins()]; + index_type bins[capacity()]; + }; + + template + using OneToManyAssoc = HistoContainer; + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h index 8b784bdd61bfe..1cf554163f409 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -7,6 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #ifdef __CUDA_ARCH__ + template __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) { // ci and co may be the same @@ -20,10 +21,7 @@ __device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __re } co[i] = x; } -#endif -//same as above may remove -#ifdef __CUDA_ARCH__ template __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) { auto x = c[i]; @@ -36,134 +34,141 @@ __device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) } c[i] = x; } + #endif -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T const* __restrict__ ci, - T* __restrict__ co, - uint32_t size, - T* ws +namespace cms { + namespace cuda { + + // limited to 32*32 elements.... 
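+    // (i.e. up to 1024 entries: each warp scans its 32 values, lane 31 of
+    //  every warp parks the warp total in ws[], the first warp then scans
+    //  ws[], and each value beyond warp 0 adds the running total of the
+    //  warps before it)
+    //
+    // illustrative host-side use, where the scan degenerates to a serial
+    // inclusive sum and ws can be left at its nullptr default:
+    //
+    //   uint32_t in[64];   // e.g. per-bin counts, filled elsewhere
+    //   uint32_t out[64];
+    //   cms::cuda::blockPrefixScan(in, out, 64);
+    //   // out[i] == in[0] + ... + in[i]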
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci, + T* __restrict__ co, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(ci, co, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - co[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(ci, co, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + co[i] += ws[warpId - 1]; + } + __syncthreads(); #else - co[0] = ci[0]; - for (uint32_t i = 1; i < size; ++i) - co[i] = ci[i] + co[i - 1]; + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; #endif -} - -// same as above, may remove -// limited to 32*32 elements.... -template -__device__ __host__ void __forceinline__ blockPrefixScan(T* c, - uint32_t size, - T* ws + } + + // same as above, may remove + // limited to 32*32 elements.... 
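+    // (in-place variant: scans c[] into itself; on the device it needs the
+    //  same 32-entry ws and size <= 1024, on the host it reduces to the
+    //  serial loop c[i] += c[i - 1])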
+ template + __host__ __device__ __forceinline__ void blockPrefixScan(T* c, + uint32_t size, + T* ws #ifndef __CUDA_ARCH__ - = nullptr + = nullptr #endif -) { + ) { #ifdef __CUDA_ARCH__ - assert(ws); - assert(size <= 1024); - assert(0 == blockDim.x % 32); - auto first = threadIdx.x; - auto mask = __ballot_sync(0xffffffff, first < size); - - for (auto i = first; i < size; i += blockDim.x) { - warpPrefixScan(c, i, mask); - auto laneId = threadIdx.x & 0x1f; - auto warpId = i / 32; - assert(warpId < 32); - if (31 == laneId) - ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDim.x < size); - } - __syncthreads(); - if (size <= 32) - return; - if (threadIdx.x < 32) - warpPrefixScan(ws, threadIdx.x, 0xffffffff); - __syncthreads(); - for (auto i = first + 32; i < size; i += blockDim.x) { - auto warpId = i / 32; - c[i] += ws[warpId - 1]; - } - __syncthreads(); + assert(ws); + assert(size <= 1024); + assert(0 == blockDim.x % 32); + auto first = threadIdx.x; + auto mask = __ballot_sync(0xffffffff, first < size); + + for (auto i = first; i < size; i += blockDim.x) { + warpPrefixScan(c, i, mask); + auto laneId = threadIdx.x & 0x1f; + auto warpId = i / 32; + assert(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim.x < size); + } + __syncthreads(); + if (size <= 32) + return; + if (threadIdx.x < 32) + warpPrefixScan(ws, threadIdx.x, 0xffffffff); + __syncthreads(); + for (auto i = first + 32; i < size; i += blockDim.x) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + __syncthreads(); #else - for (uint32_t i = 1; i < size; ++i) - c[i] += c[i - 1]; + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif -} - -// limited to 1024*1024 elements.... -template -__global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { - __shared__ T ws[32]; - // first each block does a scan of size 1024; (better be enough blocks....) - assert(1024 * gridDim.x >= size); - int off = 1024 * blockIdx.x; - if (size - off > 0) - blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); - - // count blocks that finished - __shared__ bool isLastBlockDone; - if (0 == threadIdx.x) { - auto value = atomicAdd(pc, 1); // block counter - isLastBlockDone = (value == (int(gridDim.x) - 1)); - } - - __syncthreads(); - - if (!isLastBlockDone) - return; - - // good each block has done its work and now we are left in last block - - // let's get the partial sums from each block - __shared__ T psum[1024]; - for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { - auto j = 1024 * i + 1023; - psum[i] = (j < size) ? co[j] : T(0); - } - __syncthreads(); - blockPrefixScan(psum, psum, gridDim.x, ws); - - // now it would have been handy to have the other blocks around... - int first = threadIdx.x; // + blockDim.x * blockIdx.x - for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { - auto k = i / 1024; // block - co[i] += psum[k - 1]; - } -} + } + + // limited to 1024*1024 elements.... + template + __global__ void multiBlockPrefixScan(T const* __restrict__ ci, T* __restrict__ co, int32_t size, int32_t* pc) { + __shared__ T ws[32]; + // first each block does a scan of size 1024; (better be enough blocks....) 
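+      // then an atomic counter in *pc (which must be zeroed before the
+      // launch) elects the last block to finish; that block scans the
+      // per-block totals into psum[] and adds psum[k - 1] to every element
+      // of block k's chunk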
+ assert(1024 * gridDim.x >= size); + int off = 1024 * blockIdx.x; + if (size - off > 0) + blockPrefixScan(ci + off, co + off, std::min(1024, size - off), ws); + + // count blocks that finished + __shared__ bool isLastBlockDone; + if (0 == threadIdx.x) { + auto value = atomicAdd(pc, 1); // block counter + isLastBlockDone = (value == (int(gridDim.x) - 1)); + } + + __syncthreads(); + + if (!isLastBlockDone) + return; + + // good each block has done its work and now we are left in last block + + // let's get the partial sums from each block + __shared__ T psum[1024]; + for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) { + auto j = 1024 * i + 1023; + psum[i] = (j < size) ? co[j] : T(0); + } + __syncthreads(); + blockPrefixScan(psum, psum, gridDim.x, ws); + + // now it would have been handy to have the other blocks around... + int first = threadIdx.x; // + blockDim.x * blockIdx.x + for (int i = first + 1024; i < size; i += blockDim.x) { // *gridDim.x) { + auto k = i / 1024; // block + co[i] += psum[k - 1]; + } + } + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index ce2acf2006090..09fcdc8cd5f69 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -2,6 +2,7 @@ #define HeterogeneousCoreCUDAUtilities_radixSort_H #include +#include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -79,8 +80,8 @@ __device__ inline void reorderFloat(T const* a, uint16_t* ind, uint16_t* ind2, u template -__device__ void __forceinline__ -radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { +__device__ __forceinline__ void radixSortImpl( + T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8 * sizeof(T); constexpr int sb = 1 << d; constexpr int ps = int(sizeof(T)) - NS; @@ -217,28 +218,30 @@ radixSortImpl(T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t s template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, dummyReorder); } template ::value && std::is_signed::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { radixSortImpl(a, ind, ind2, size, reorderSigned); } template ::value, T>::type* = nullptr> -__device__ void __forceinline__ radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { +__device__ __forceinline__ void radixSort(T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { using I = int; radixSortImpl((I const*)(a), ind, ind2, size, reorderFloat); } template -__device__ void __forceinline__ -radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { +__device__ __forceinline__ void radixSortMulti(T const* v, + uint16_t* index, + uint32_t const* offsets, + uint16_t* workspace) { extern __shared__ uint16_t ws[]; auto a = v + offsets[blockIdx.x]; @@ -250,17 +253,23 @@ radixSortMulti(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* w 
radixSort(a, ind, ind2, size); } -template -__global__ void __launch_bounds__(256, 4) - radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} +namespace cms { + namespace cuda { -template -__global__ void -// __launch_bounds__(256, 4) -radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { - radixSortMulti(v, index, offsets, workspace); -} + template + __global__ void __launch_bounds__(256, 4) + radixSortMultiWrapper(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + template + __global__ void + // __launch_bounds__(256, 4) + radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) { + radixSortMulti(v, index, offsets, workspace); + } + + } // namespace cuda +} // namespace cms #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu index 92806c91337d8..e671c5c1c2ae8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/AtomicPairCounter_t.cu @@ -3,6 +3,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + __global__ void update(AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= n) diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index cc5541f58ad60..577e9bbb69b57 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -7,6 +7,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + template void go() { std::mt19937 eng; diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index aa7eb245c350d..cf512521e1f4d 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -9,6 +9,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +using namespace cms::cuda; + template void go() { std::mt19937 eng; @@ -16,7 +18,7 @@ void go() { constexpr int N = 12000; T v[N]; - auto v_d = cms::cuda::make_device_unique(N, nullptr); + auto v_d = make_device_unique(N, nullptr); cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice)); @@ -30,10 +32,10 @@ void go() { << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; Hist h; - auto h_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(Hist::wsSize(), nullptr); + auto h_d = make_device_unique(1, nullptr); + auto ws_d = make_device_unique(Hist::wsSize(), nullptr); - auto off_d = cms::cuda::make_device_unique(nParts + 1, nullptr); + auto off_d = make_device_unique(nParts + 1, nullptr); for (int it = 0; it < 5; ++it) { offsets[0] = 0; @@ -68,7 +70,7 @@ void go() { cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), 
-    cms::cuda::fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
+    fillManyFromVector(h_d.get(), ws_d.get(), nParts, v_d.get(), off_d.get(), offsets[10], 256, 0);
     cudaCheck(cudaMemcpy(&h, h_d.get(), sizeof(Hist), cudaMemcpyDeviceToHost));
     assert(0 == h.off[0]);
     assert(offsets[10] == h.size());
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
index 020d69268a420..f08c768b43fa5 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu
@@ -10,6 +10,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 
+using namespace cms::cuda;
+
 template
 __global__ void mykernel(T const* __restrict__ v, uint32_t N) {
   assert(v);
@@ -106,7 +108,7 @@ void go() {
   constexpr int N = 12000;
   T v[N];
 
-  auto v_d = cms::cuda::make_device_unique<T[]>(N, nullptr);
+  auto v_d = make_device_unique<T[]>(N, nullptr);
   assert(v_d.get());
 
   using Hist = HistoContainer;
@@ -125,7 +127,7 @@ void go() {
     assert(v);
     cudaCheck(cudaMemcpy(v_d.get(), v, N * sizeof(T), cudaMemcpyHostToDevice));
     assert(v_d.get());
-    cms::cuda::launch(mykernel, {1, 256}, v_d.get(), N);
+    launch(mykernel, {1, 256}, v_d.get(), N);
   }
 }
diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 92bb359115a02..765ec89e295f2 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -16,6 +16,8 @@
 
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
+using namespace cms::cuda;
+
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
@@ -100,7 +102,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
-  auto current_device = cms::cuda::currentDevice();
+  auto current_device = currentDevice();
 #else
   // make sure cuda emulation is working
   std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
 #endif
@@ -167,11 +169,11 @@
   std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
 
 #ifdef __CUDACC__
-  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
+  auto v_d = make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
   assert(v_d.get());
-  auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
-  auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr);
+  auto a_d = make_device_unique<Assoc[]>(1, nullptr);
+  auto sa_d = make_device_unique<SmallAssoc[]>(1, nullptr);
+  auto ws_d = make_device_unique(Assoc::wsSize(), nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
@@ -180,7 +182,7 @@
   auto v_d = tr.data();
 #endif
 
-  cms::cuda::launchZero(a_d.get(), 0);
+  launchZero(a_d.get(), 0);
 
 #ifdef __CUDACC__
   auto nThreads = 256;
   auto nBlocks = (N + nThreads - 1) / nThreads;
 
   count<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 
-  cms::cuda::launchFinalize(a_d.get(), ws_d.get(), 0);
+  launchFinalize(a_d.get(), ws_d.get(), 0);
 
   verify<<<1, 1>>>(a_d.get());
 
   fill<<<nBlocks, nThreads>>>(v_d.get(), a_d.get(), N);
 #else
   count(v_d, a_d.get(), N);
-  cms::cuda::launchFinalize(a_d.get());
+  launchFinalize(a_d.get());
   verify(a_d.get());
   fill(v_d, a_d.get(), N);
 #endif
@@ -231,7 +233,7 @@ int main() {
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   nBlocks = (N + nThreads - 1) / nThreads;
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), a_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, a_d.get());
   verifyBulk<<<1, 1>>>(a_d.get(), dc_d);
 
   cudaCheck(cudaMemcpy(&la, a_d.get(), sizeof(Assoc), cudaMemcpyDeviceToHost));
@@ -239,19 +241,19 @@
   cudaCheck(cudaMemset(dc_d, 0, sizeof(AtomicPairCounter)));
   fillBulk<<<nBlocks, nThreads>>>(dc_d, v_d.get(), sa_d.get(), N);
-  cms::cuda::finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
+  finalizeBulk<<<nBlocks, nThreads>>>(dc_d, sa_d.get());
   verifyBulk<<<1, 1>>>(sa_d.get(), dc_d);
 #else
   dc_d = &dc;
   fillBulk(dc_d, v_d, a_d.get(), N);
-  cms::cuda::finalizeBulk(dc_d, a_d.get());
+  finalizeBulk(dc_d, a_d.get());
   verifyBulk(a_d.get(), dc_d);
 
   memcpy(&la, a_d.get(), sizeof(Assoc));
 
   AtomicPairCounter sdc(0);
   fillBulk(&sdc, v_d, sa_d.get(), N);
-  cms::cuda::finalizeBulk(&sdc, sa_d.get());
+  finalizeBulk(&sdc, sa_d.get());
   verifyBulk(sa_d.get(), &sdc);
 #endif
@@ -274,14 +276,14 @@
 
   // here verify use of block local counters
 #ifdef __CUDACC__
-  auto m1_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
-  auto m2_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m1_d = make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m2_d = make_device_unique<Multiplicity[]>(1, nullptr);
 #else
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();
 #endif
 
-  cms::cuda::launchZero(m1_d.get(), 0);
-  cms::cuda::launchZero(m2_d.get(), 0);
+  launchZero(m1_d.get(), 0);
+  launchZero(m2_d.get(), 0);
 
 #ifdef __CUDACC__
   nBlocks = (4 * N + nThreads - 1) / nThreads;
   countMulti<<<nBlocks, nThreads>>>(v_d.get(), m1_d.get(), N);
   countMultiLocal<<<nBlocks, nThreads>>>(v_d.get(), m2_d.get(), N);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get(), ws_d.get(), 0);
-  cms::cuda::launchFinalize(m2_d.get(), ws_d.get(), 0);
+  launchFinalize(m1_d.get(), ws_d.get(), 0);
+  launchFinalize(m2_d.get(), ws_d.get(), 0);
   verifyMulti<<<1, Multiplicity::totbins()>>>(m1_d.get(), m2_d.get());
 
   cudaCheck(cudaGetLastError());
@@ -300,8 +302,8 @@
   countMultiLocal(v_d, m2_d.get(), N);
   verifyMulti(m1_d.get(), m2_d.get());
 
-  cms::cuda::launchFinalize(m1_d.get());
-  cms::cuda::launchFinalize(m2_d.get());
+  launchFinalize(m1_d.get());
+  launchFinalize(m2_d.get());
   verifyMulti(m1_d.get(), m2_d.get());
 #endif
   return 0;
diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
index 8bbaf4d066abc..01376c668ef6a 100644
--- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
@@ -6,16 +6,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct format_traits {
 public:
-  static const constexpr char* failed_msg = "failed %d %d %d: %d %d\n";
+  static const constexpr char *failed_msg = "failed %d %d %d: %d %d\n";
 };
 
-template<>
+template <>
 struct format_traits<float> {
 public:
-  static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n";
+  static const constexpr char *failed_msg = "failed %d %d %d: %f %f\n";
 };
 
 template <typename T>
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index febdb9c92b0a7..e96a9d8eae83b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -14,6 +14,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
 
+using namespace cms::cuda;
+
 template <typename T>
 struct RS {
   using type = std::uniform_int_distribution<T>;

From d76ad8e63a90171ace695aaa67713323811e3194 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Sun, 8 Mar 2020 21:12:29 +0100
Subject: [PATCH 11/15] Fix code rule violations

Replace using namespace cms::cuda in test/OneToManyAssoc_t.h.

---
 .../CUDAUtilities/test/OneToManyAssoc_t.h | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
index 765ec89e295f2..9f6631a70b214 100644
--- a/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
+++ b/HeterogeneousCore/CUDAUtilities/test/OneToManyAssoc_t.h
@@ -11,22 +11,18 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-
 #endif
 
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
-
-using namespace cms::cuda;
+using cms::cuda::AtomicPairCounter;
 
 constexpr uint32_t MaxElem = 64000;
 constexpr uint32_t MaxTk = 8000;
 constexpr uint32_t MaxAssocs = 4 * MaxTk;
 
-using Assoc = OneToManyAssoc;
-
-using SmallAssoc = OneToManyAssoc;
-
-using Multiplicity = OneToManyAssoc;
+using Assoc = cms::cuda::OneToManyAssoc;
+using SmallAssoc = cms::cuda::OneToManyAssoc;
+using Multiplicity = cms::cuda::OneToManyAssoc;
 
 using TK = std::array<uint16_t, 4>;
 
 __global__ void countMultiLocal(TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, int32_t n) {
@@ -100,7 +98,7 @@ __global__ void verifyBulk(Assoc const* __restrict__ assoc, AtomicPairCounter co
 int main() {
 #ifdef __CUDACC__
   cms::cudatest::requireDevices();
-  auto current_device = currentDevice();
+  auto current_device = cms::cuda::currentDevice();
 #else
   // make sure cuda emulation is working
   std::cout << "cuda x's " << threadIdx.x << ' ' << blockIdx.x << ' ' << blockDim.x << ' ' << gridDim.x << std::endl;
@@ -169,11 +165,11 @@
   std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
 
 #ifdef __CUDACC__
-  auto v_d = make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
+  auto v_d = cms::cuda::make_device_unique<std::array<uint16_t, 4>[]>(N, nullptr);
   assert(v_d.get());
-  auto a_d = make_device_unique<Assoc[]>(1, nullptr);
-  auto sa_d = make_device_unique<SmallAssoc[]>(1, nullptr);
-  auto ws_d = make_device_unique(Assoc::wsSize(), nullptr);
+  auto a_d = cms::cuda::make_device_unique<Assoc[]>(1, nullptr);
+  auto sa_d = cms::cuda::make_device_unique<SmallAssoc[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique(Assoc::wsSize(), nullptr);
 
   cudaCheck(cudaMemcpy(v_d.get(), tr.data(), N * sizeof(std::array<uint16_t, 4>), cudaMemcpyHostToDevice));
 #else
@@ -276,8 +272,8 @@ int main() {
 
   // here verify use of block local counters
 #ifdef __CUDACC__
-  auto m1_d = make_device_unique<Multiplicity[]>(1, nullptr);
-  auto m2_d = make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m1_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
+  auto m2_d = cms::cuda::make_device_unique<Multiplicity[]>(1, nullptr);
 #else
   auto m1_d = std::make_unique<Multiplicity>();
   auto m2_d = std::make_unique<Multiplicity>();

From 1f91c73d82f431ece465fbf164c4db444465d880 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Sun, 8 Mar 2020 21:16:37 +0100
Subject: [PATCH 12/15] Add an exception for cudaCompat.h

cudaCompat.h relies on defining equivalent symbols to the CUDA
intrinsics in the cms::cudacompat namespace, and pulling them into the
global namespace when compiling device code without CUDA.
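For illustration, a minimal sketch of the pattern this exception covers
(the atomicAdd stand-in below is a simplified assumption, not the actual
contents of cudaCompat.h):

    #ifndef __CUDACC__

    namespace cms {
      namespace cudacompat {
        // serial stand-in for the CUDA intrinsic: with a single "thread"
        // no real atomicity is needed, so a plain read-modify-write suffices
        template <typename T>
        inline T atomicAdd(T* address, T val) {
          T old = *address;
          *address += val;
          return old;
        }
      }  // namespace cudacompat
    }  // namespace cms

    // the header-level "using namespace" that the code rule must tolerate
    using namespace cms::cudacompat;

    #endif  // __CUDACC__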
---
 Utilities/ReleaseScripts/python/cmsCodeRules/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
index 264cf76e5d888..a7b9a11299441 100644
--- a/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
+++ b/Utilities/ReleaseScripts/python/cmsCodeRules/config.py
@@ -33,7 +33,7 @@
 Configuration[ruleName]['description'] = 'Search for "using namespace" or "using std::" in header files'
 Configuration[ruleName]['filesToMatch'] = ['*.h']
-Configuration[ruleName]['exceptPaths'] = []
+Configuration[ruleName]['exceptPaths'] = ['HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h']
 Configuration[ruleName]['skip'] = [comment, function]
 Configuration[ruleName]['filter'] = '(\susing|\Ausing)\s+(namespace|std::)' #should be regular expression
 Configuration[ruleName]['exceptFilter'] = []

From c52b855961d4a7ee0fa985c683765a72153b0815 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 9 Mar 2020 17:09:47 +0100
Subject: [PATCH 13/15] Protect the header to compile only with a CUDA compiler

---
 HeterogeneousCore/CUDAUtilities/interface/radixSort.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
index 09fcdc8cd5f69..e4b2905782825 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h
@@ -1,6 +1,8 @@
 #ifndef HeterogeneousCoreCUDAUtilities_radixSort_H
 #define HeterogeneousCoreCUDAUtilities_radixSort_H
 
+#ifdef __CUDACC__
+
 #include <cstdint>
 #include <type_traits>
 
@@ -263,13 +265,13 @@ namespace cms {
     }
 
     template <typename T, int NS = sizeof(T)>
-    __global__ void
-    // __launch_bounds__(256, 4)
-    radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
+    __global__ void radixSortMultiWrapper2(T const* v, uint16_t* index, uint32_t const* offsets, uint16_t* workspace) {
       radixSortMulti<T, NS>(v, index, offsets, workspace);
     }
 
   }  // namespace cuda
 }  // namespace cms
 
+#endif  // __CUDACC__
+
 #endif  // HeterogeneousCoreCUDAUtilities_radixSort_H

From 62480002ebec6cc6354e2efb14d45e964773f85d Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Thu, 26 Mar 2020 11:05:18 +0100
Subject: [PATCH 14/15] choleskyInversion.h: expand source notice

---
 DataFormats/Math/interface/choleskyInversion.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
index eba2a17648008..a04ad456f96f5 100644
--- a/DataFormats/Math/interface/choleskyInversion.h
+++ b/DataFormats/Math/interface/choleskyInversion.h
@@ -9,6 +9,7 @@
  * fully inlined specialized code to perform the inversion of a
  * positive defined matrix of rank up to 6.
* + * adapted from ROOT::Math::CholeskyDecomp * originally by * @author Manuel Schiller * @date Aug 29 2008 From 8ce96318e31ea00aad63e7e840755cc0492766b4 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 26 Mar 2020 11:37:28 +0100 Subject: [PATCH 15/15] Rename namespace choleskyInversion to math::cholesky --- .../Math/interface/choleskyInversion.h | 623 +++++++++--------- DataFormats/Math/test/CholeskyInvert_t.cpp | 12 +- DataFormats/Math/test/CholeskyInvert_t.cu | 20 +- 3 files changed, 336 insertions(+), 319 deletions(-) diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h index a04ad456f96f5..2cb4105f86bae 100644 --- a/DataFormats/Math/interface/choleskyInversion.h +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -16,319 +16,334 @@ * * */ -namespace choleskyInversion { - - template - inline constexpr void invert11(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - dst(0, 0) = F(1.0) / src(0, 0); - } - - template - inline constexpr void invert22(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0) * src(1, 0) * luc0; - auto luc2 = F(1.0) / (src(1, 1) - luc1); - - auto li21 = luc1 * luc0 * luc2; - - dst(0, 0) = li21 + luc0; - dst(1, 0) = -src(1, 0) * luc0 * luc2; - dst(1, 1) = luc2; - } - - template - inline constexpr void invert33(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); - luc5 = F(1.0) / luc5; - - auto li21 = -luc0 * luc1; - auto li32 = -(luc2 * luc4); - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - - dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; - dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; - dst(1, 1) = luc5 * li32 * li32 + luc2; - dst(2, 0) = luc5 * li31; - dst(2, 1) = luc5 * li32; - dst(2, 2) = luc5; - } - - template - inline constexpr void invert44(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - - dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc9 * li43 * li43 + luc5; - dst(3, 0) = luc9 * li41; - dst(3, 1) = luc9 * li42; - dst(3, 2) = luc9 * li43; - dst(3, 3) = luc9; - } - - template - inline constexpr void 
invert55(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc14 * li54 * li54 + luc9; - dst(4, 0) = luc14 * li51; - dst(4, 1) = luc14 * li52; - dst(4, 2) = luc14 * li53; - dst(4, 3) = luc14 * li54; - dst(4, 4) = luc14; - } - - template - inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { - using F = decltype(src(0, 0)); - auto luc0 = F(1.0) / src(0, 0); - auto luc1 = src(1, 0); - auto luc2 = src(1, 1) - luc0 * luc1 * luc1; - luc2 = F(1.0) / luc2; - auto luc3 = src(2, 0); - auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); - auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); - luc5 = F(1.0) / luc5; - auto luc6 = src(3, 0); - auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); - auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); - auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); - luc9 = F(1.0) / luc9; - auto luc10 = src(4, 0); - auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); - auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); - auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); - auto luc14 = - src(4, 4) - (luc0 * luc10 * 
luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); - luc14 = F(1.0) / luc14; - auto luc15 = src(5, 0); - auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); - auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); - auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); - auto luc19 = - (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); - auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + - luc9 * luc18 * luc18 + luc14 * luc19 * luc19); - luc20 = F(1.0) / luc20; - - auto li21 = -luc1 * luc0; - auto li32 = -luc2 * luc4; - auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; - auto li43 = -(luc8 * luc5); - auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; - auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; - auto li54 = -luc13 * luc9; - auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; - auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; - auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - - luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + - luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * - luc0; - - auto li65 = -luc19 * luc14; - auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; - auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; - auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - - luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + - luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * - luc2; - auto li61 = - (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + - luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + - luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - - luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - - luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + - luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * - luc0; - - dst(0, 0) = - luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; - dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; - dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; - dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; - dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; - dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; - dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; - dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; - dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; - dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; - dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; - dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; - dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; - 
dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; - dst(4, 4) = luc20 * li65 * li65 + luc14; - dst(5, 0) = luc20 * li61; - dst(5, 1) = luc20 * li62; - dst(5, 2) = luc20 * li63; - dst(5, 3) = luc20 * li64; - dst(5, 4) = luc20 * li65; - dst(5, 5) = luc20; - } - - template - inline constexpr void symmetrize11(M& dst) {} - template - inline constexpr void symmetrize22(M& dst) { - dst(0, 1) = dst(1, 0); - } - template - inline constexpr void symmetrize33(M& dst) { - symmetrize22(dst); - dst(0, 2) = dst(2, 0); - dst(1, 2) = dst(2, 1); - } - template - inline constexpr void symmetrize44(M& dst) { - symmetrize33(dst); - dst(0, 3) = dst(3, 0); - dst(1, 3) = dst(3, 1); - dst(2, 3) = dst(3, 2); - } - template - inline constexpr void symmetrize55(M& dst) { - symmetrize44(dst); - dst(0, 4) = dst(4, 0); - dst(1, 4) = dst(4, 1); - dst(2, 4) = dst(4, 2); - dst(3, 4) = dst(4, 3); - } - template - inline constexpr void symmetrize66(M& dst) { - symmetrize55(dst); - dst(0, 5) = dst(5, 0); - dst(1, 5) = dst(5, 1); - dst(2, 5) = dst(5, 2); - dst(3, 5) = dst(5, 3); - dst(4, 5) = dst(5, 4); - } - - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert22(src, dst); +namespace math { + namespace cholesky { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; 
+ auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - 
(luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = + (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - + luc19 * luc13 * luc6 * luc9 * luc14 + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 
* li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + + template + inline constexpr void symmetrize33(M& dst) { symmetrize22(dst); + dst(0, 2) = dst(2, 0); + dst(1, 2) = dst(2, 1); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert33(src, dst); + + template + inline constexpr void symmetrize44(M& dst) { symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + dst(2, 3) = dst(3, 2); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert44(src, dst); + + template + inline constexpr void symmetrize55(M& dst) { symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert55(src, dst); + + template + inline constexpr void symmetrize66(M& dst) { symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); } - }; - template - struct Inverter { - static constexpr void eval(M1 const& src, M2& dst) { - invert66(src, dst); - symmetrize66(dst); - } - }; - // Eigen interface - template - inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { - using M1 = Eigen::DenseBase; - using M2 = Eigen::DenseBase; - Inverter::eval(src, dst); - } + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } -} // namespace 
choleskyInversion + } // namespace cholesky +} // namespace math #endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp index f1bf92353dfc3..c5dea25231988 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cpp +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -74,13 +74,13 @@ void go(bool soa) { if (soa) for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } else for (auto& m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } std::cout << mm[SIZE / 2](1, 1) << std::endl; @@ -100,12 +100,12 @@ void go(bool soa) { #endif for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } else #pragma GCC ivdep for (auto& m : mm) { - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } delta += (std::chrono::high_resolution_clock::now() - start); diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 55df4ed23f20d..558f9296150c7 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -32,7 +32,7 @@ __global__ void invertSOA(double *__restrict__ p, unsigned int n) { return; MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } template @@ -42,7 +42,7 @@ __global__ void invert(M *mm, unsigned int n) { return; auto &m = mm[i]; - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } template @@ -54,7 +54,7 @@ __global__ void invertSeq(M *mm, unsigned int n) { for (auto i = first; i < last; ++i) { auto &m = mm[i]; - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } } @@ -112,13 +112,13 @@ void go(bool soa) { if (soa) for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } else for (auto &m : mm) { - choleskyInversion::invert(m, m); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); + math::cholesky::invert(m, m); } std::cout << mm[SIZE / 2](1, 1) << std::endl; @@ -168,12 +168,12 @@ void go(bool soa) { #pragma GCC ivdep for (unsigned int i = 0; i < SIZE; ++i) { MapMX m(p + i); - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } else #pragma GCC ivdep for (auto &m : mm) { - choleskyInversion::invert(m, m); + math::cholesky::invert(m, m); } delta2 += (std::chrono::high_resolution_clock::now() - start); @@ -193,11 +193,13 @@ int main() { cms::cudatest::requireDevices(); go<2>(false); + go<3>(false); go<4>(false); go<5>(false); go<6>(false); go<2>(true); + go<3>(true); go<4>(true); go<5>(true); go<6>(true);
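
Note: after PATCH 15 the inversion entry point lives in math::cholesky.
A minimal usage sketch (the 3x3 matrix below is illustrative, not taken
from the tests above):

    #include <Eigen/Core>

    #include "DataFormats/Math/interface/choleskyInversion.h"

    int main() {
      // a symmetric positive-definite input, as the Cholesky decomposition requires
      Eigen::Matrix<double, 3, 3> m;
      m << 4.0, 1.0, 0.5,
           1.0, 3.0, 0.2,
           0.5, 0.2, 2.0;
      Eigen::Matrix<double, 3, 3> inv;
      // for rank 3 this resolves at compile time to the fully inlined
      // invert33 + symmetrize33 specialization
      math::cholesky::invert(m, inv);
      return 0;
    }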