From 1e1cf79604251cbd7467f284348427f8c5a9e372 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Fri, 5 Jul 2019 11:59:12 +0200
Subject: [PATCH 01/10] Port the whole pixel workflow to new heterogeneous
 framework (#384)

  - port the whole pixel workflow to new heterogeneous framework
  - implement a legacy cluster to SoA converter for the pixel RecHits
  - update the vertex producer to run on CPU as well as GPU
---
 .../Common/interface/HeterogeneousSoA.h       | 247 ++++++++++++++++++
 .../Common/interface/HostProduct.h            |  40 +++
 2 files changed, 287 insertions(+)
 create mode 100644 CUDADataFormats/Common/interface/HeterogeneousSoA.h
 create mode 100644 CUDADataFormats/Common/interface/HostProduct.h
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
new file mode 100644
index 0000000000000..3ada1f2e1d83a
--- /dev/null
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -0,0 +1,247 @@
+#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
+#define CUDADataFormatsCommonHeterogeneousSoA_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+// a heterogeneous unique pointer...
+template<typename T>
+class HeterogeneousSoA {
+public:
+
+  using Product = T;
+
+  HeterogeneousSoA() = default; // make root happy
+  ~HeterogeneousSoA() = default;
+  HeterogeneousSoA(HeterogeneousSoA&&) = default;
+  HeterogeneousSoA& operator=(HeterogeneousSoA&&) = default;
+
+  explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> && p) : dm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> && p) : hm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(std::unique_ptr<T> && p) : std_ptr(std::move(p)) {}
+
+
+  auto const * get() const {
+    return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get());
+  }
+
+  auto const & operator*() const {
+    return *get();
+  }
+
+  auto const * operator->() const {
+    return get();
+  }
+
+
+  auto * get() {
+    return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get());
+  }
+
+  auto &  operator*() {
+    return *get();
+  }
+
+  auto * operator->() {
+    return get();
+  }
+
+
+
+  // in reality valid only for GPU version...
+  cudautils::host::unique_ptr<T>
+  toHostAsync(cuda::stream_t<>& stream) const {
+    assert(dm_ptr);
+    edm::Service<CUDAService> cs;
+    auto ret = cs->make_host_unique<T>(stream);
+    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream.id()));
+    return ret;
+  }
+
+
+
+private:
+  // a union wan't do it, a variant will not be more efficienct
+  cudautils::device::unique_ptr<T> dm_ptr; //!
+  cudautils::host::unique_ptr<T> hm_ptr; //!
+  std::unique_ptr<T> std_ptr; //!
+
+};
+
+
+/*
+namespace cudaCompat {
+
+  struct GPUTraits {
+
+    template<typename T>
+    using unique_ptr =  cudautils::device::unique_ptr<T>;
+
+    template<typename T>
+    static auto make_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(stream);
+    }
+
+    template<typename T>
+    static auto make_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(size, stream);
+    }
+
+    template<typename T>
+    static auto make_host_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_host_unique<T>(stream);
+    }
+
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(stream);
+    }
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(size, stream);
+    }
+
+
+  };
+
+
+  struct HostTraits {
+
+    template<typename T>
+    using unique_ptr =  cudautils::host::unique_ptr<T>;
+
+    template<typename T>
+    static auto make_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_host_unique<T>(stream);
+    }
+
+
+    template<typename T>
+    static auto make_host_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_host_unique<T>(stream);
+    }
+
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(stream);
+    }
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
+      return cs->make_device_unique<T>(size, stream);
+    }
+
+
+  };
+
+
+  struct CPUTraits {
+
+    template<typename T>
+    using unique_ptr =  std::unique_ptr<T>;
+
+    template<typename T>
+    static auto make_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+      return std::make_unique<T>();
+    }
+
+
+    template<typename T>
+    static auto make_unique(edm::Service<CUDAService>&, size_t size, cuda::stream_t<> &)    {
+      return std::make_unique<T>(size);
+    }
+
+
+    template<typename T>
+    static auto make_host_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+      return std::make_unique<T>();
+    }
+
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+      return std::make_unique<T>();
+    }
+
+    template<typename T>
+    static auto make_device_unique(edm::Service<CUDAService>&, size_t size, cuda::stream_t<> &)    {
+      return std::make_unique<T>(size);
+    }
+
+
+  };
+
+}
+
+
+
+// a heterogeneous unique pointer (of a different sort) ...
+template<typename T, typename Traits>
+class HeterogeneousSoAImpl {
+public:
+
+  template<typename V>
+  using unique_ptr = typename Traits:: template unique_ptr<V>;
+
+
+  HeterogeneousSoAImpl() = default; // make root happy
+  ~HeterogeneousSoAImpl() = default;
+  HeterogeneousSoAImpl(HeterogeneousSoAImpl&&) = default;
+  HeterogeneousSoAImpl& operator=(HeterogeneousSoAImpl&&) = default;
+
+  explicit HeterogeneousSoAImpl(unique_ptr<T> && p) : m_ptr(std::move(p)) {}
+  explicit HeterogeneousSoAImpl(cuda::stream_t<> &stream);
+
+  T const * get() const {
+    return m_ptr.get();
+  }
+  
+  T * get() {
+    return m_ptr.get();
+  }
+
+
+  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<>& stream) const;
+
+private:
+  unique_ptr<T> m_ptr; //!
+
+};
+
+
+
+template<typename T, typename Traits>
+HeterogeneousSoAImpl<T,Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
+  edm::Service<CUDAService> cs;
+  m_ptr = Traits:: template make_unique<T>(cs,stream);
+}
+
+
+// in reality valid only for GPU version...
+template<typename T, typename Traits>
+cudautils::host::unique_ptr<T>
+HeterogeneousSoAImpl<T,Traits>::toHostAsync(cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cs;
+  auto ret = cs->make_host_unique<T>(stream);
+  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream.id()));
+  return ret;
+}
+
+
+template<typename T>
+using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T,cudaCompat::GPUTraits>;
+template<typename T>
+using HeterogeneousSoACPU =  HeterogeneousSoAImpl<T,cudaCompat::CPUTraits>;
+template<typename T>
+using HeterogeneousSoAHost = HeterogeneousSoAImpl<T,cudaCompat::HostTraits>;
+*/
+
+#endif
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
new file mode 100644
index 0000000000000..dfc5cfc866779
--- /dev/null
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -0,0 +1,40 @@
+#ifndef CUDADataFormatsCommonHostProduct_H
+#define CUDADataFormatsCommonHostProduct_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+// a heterogeneous unique pointer...
+template<typename T>
+class HostProduct {
+public:
+
+
+  HostProduct() = default; // make root happy
+  ~HostProduct() = default;
+  HostProduct(HostProduct&&) = default;
+  HostProduct& operator=(HostProduct&&) = default;
+
+  explicit HostProduct(cudautils::host::unique_ptr<T> && p) : hm_ptr(std::move(p)) {}
+  explicit HostProduct(std::unique_ptr<T> && p) : std_ptr(std::move(p)) {}
+
+
+  auto const * get() const {
+    return hm_ptr ? hm_ptr.get() : std_ptr.get();
+  }
+  
+  auto const & operator*() const {
+    return *get();
+  }
+
+  auto const * operator->() const {
+    return get();
+  }
+  
+
+private:
+  cudautils::host::unique_ptr<T> hm_ptr;
+  std::unique_ptr<T> std_ptr;
+
+};
+
+#endif

From bdebe5ec25b3c077aa357983039618dcde45de25 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Sun, 25 Aug 2019 14:04:02 +0200
Subject: [PATCH 02/10] Implement full Pixel SoA workflow on CPU (#385)

---
 CUDADataFormats/Common/interface/HeterogeneousSoA.h | 3 +--
 CUDADataFormats/Common/interface/HostProduct.h      | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 3ada1f2e1d83a..44b36c3add2cc 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -74,7 +74,6 @@ class HeterogeneousSoA {
 };
 
 
-/*
 namespace cudaCompat {
 
   struct GPUTraits {
@@ -242,6 +241,6 @@ template<typename T>
 using HeterogeneousSoACPU =  HeterogeneousSoAImpl<T,cudaCompat::CPUTraits>;
 template<typename T>
 using HeterogeneousSoAHost = HeterogeneousSoAImpl<T,cudaCompat::HostTraits>;
-*/
+
 
 #endif
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index dfc5cfc866779..cab50d402cdb5 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -32,8 +32,8 @@ class HostProduct {
   
 
 private:
-  cudautils::host::unique_ptr<T> hm_ptr;
-  std::unique_ptr<T> std_ptr;
+  cudautils::host::unique_ptr<T> hm_ptr; //!
+  std::unique_ptr<T> std_ptr;  //!
 
 };
 

From d1f001530f0025bf802f7392108c2632cf9c734c Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 10 Sep 2019 16:03:58 -0500
Subject: [PATCH 03/10] Move event and stream caches, and caching allocators
 out from CUDAService (#364)

To reduce dependencies on edm::Service, and to make CUDAService less
of a collection of everything, split off from it:
  - the CUDAEventCache
  - the CUDAStreamCache
  - the caching allocators

Other changes:
  - clean up unnecessary use of CUDAService
  - fix maxCachedFraction, add debug printouts
  - add make_*_unique_uninitialized that avoid the static_assert
---
 .../Common/interface/HeterogeneousSoA.h       | 57 +++++++++----------
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 44b36c3add2cc..3a161f404f44d 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -4,8 +4,6 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
@@ -57,8 +55,7 @@ class HeterogeneousSoA {
   cudautils::host::unique_ptr<T>
   toHostAsync(cuda::stream_t<>& stream) const {
     assert(dm_ptr);
-    edm::Service<CUDAService> cs;
-    auto ret = cs->make_host_unique<T>(stream);
+    auto ret = cudautils::make_host_unique<T>(stream);
     cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream.id()));
     return ret;
   }
@@ -82,29 +79,29 @@ namespace cudaCompat {
     using unique_ptr =  cudautils::device::unique_ptr<T>;
 
     template<typename T>
-    static auto make_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(stream);
+    static auto make_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(stream);
     }
 
     template<typename T>
-    static auto make_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(size, stream);
+    static auto make_unique(size_t size, cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(size, stream);
     }
 
     template<typename T>
-    static auto make_host_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_host_unique<T>(stream);
+    static auto make_host_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_host_unique<T>(stream);
     }
 
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(stream);
+    static auto make_device_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(stream);
     }
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(size, stream);
+    static auto make_device_unique(size_t size, cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(size, stream);
     }
 
 
@@ -117,25 +114,25 @@ namespace cudaCompat {
     using unique_ptr =  cudautils::host::unique_ptr<T>;
 
     template<typename T>
-    static auto make_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_host_unique<T>(stream);
+    static auto make_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_host_unique<T>(stream);
     }
 
 
     template<typename T>
-    static auto make_host_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_host_unique<T>(stream);
+    static auto make_host_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_host_unique<T>(stream);
     }
 
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService> & cs, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(stream);
+    static auto make_device_unique(cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(stream);
     }
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService> & cs, size_t size, cuda::stream_t<> &stream)    {
-      return cs->make_device_unique<T>(size, stream);
+    static auto make_device_unique(size_t size, cuda::stream_t<> &stream)    {
+      return cudautils::make_device_unique<T>(size, stream);
     }
 
 
@@ -148,30 +145,30 @@ namespace cudaCompat {
     using unique_ptr =  std::unique_ptr<T>;
 
     template<typename T>
-    static auto make_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+    static auto make_unique(cuda::stream_t<> &)    {
       return std::make_unique<T>();
     }
 
 
     template<typename T>
-    static auto make_unique(edm::Service<CUDAService>&, size_t size, cuda::stream_t<> &)    {
+    static auto make_unique(size_t size, cuda::stream_t<> &)    {
       return std::make_unique<T>(size);
     }
 
 
     template<typename T>
-    static auto make_host_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+    static auto make_host_unique(cuda::stream_t<> &)    {
       return std::make_unique<T>();
     }
 
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService>&, cuda::stream_t<> &)    {
+    static auto make_device_unique(cuda::stream_t<> &)    {
       return std::make_unique<T>();
     }
 
     template<typename T>
-    static auto make_device_unique(edm::Service<CUDAService>&, size_t size, cuda::stream_t<> &)    {
+    static auto make_device_unique(size_t size, cuda::stream_t<> &)    {
       return std::make_unique<T>(size);
     }
 
@@ -219,8 +216,7 @@ class HeterogeneousSoAImpl {
 
 template<typename T, typename Traits>
 HeterogeneousSoAImpl<T,Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
-  edm::Service<CUDAService> cs;
-  m_ptr = Traits:: template make_unique<T>(cs,stream);
+  m_ptr = Traits:: template make_unique<T>(stream);
 }
 
 
@@ -228,8 +224,7 @@ HeterogeneousSoAImpl<T,Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
 template<typename T, typename Traits>
 cudautils::host::unique_ptr<T>
 HeterogeneousSoAImpl<T,Traits>::toHostAsync(cuda::stream_t<>& stream) const {
-  edm::Service<CUDAService> cs;
-  auto ret = cs->make_host_unique<T>(stream);
+  auto ret = cudautils::make_host_unique<T>(stream);
   cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream.id()));
   return ret;
 }

From 2e939359f81cfeacb45c7e59e7f51b57c40e0302 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 12 Sep 2019 00:22:05 +0200
Subject: [PATCH 04/10] Apply clang-format style formatting

---
 .../Common/interface/HeterogeneousSoA.h       | 203 +++++++-----------
 .../Common/interface/HostProduct.h            |  29 +--
 2 files changed, 83 insertions(+), 149 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 3a161f404f44d..e0e537b9b5c06 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -8,234 +8,179 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 // a heterogeneous unique pointer...
-template<typename T>
+template <typename T>
 class HeterogeneousSoA {
 public:
-
   using Product = T;
 
-  HeterogeneousSoA() = default; // make root happy
+  HeterogeneousSoA() = default;  // make root happy
   ~HeterogeneousSoA() = default;
-  HeterogeneousSoA(HeterogeneousSoA&&) = default;
-  HeterogeneousSoA& operator=(HeterogeneousSoA&&) = default;
-
-  explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> && p) : dm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> && p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> && p) : std_ptr(std::move(p)) {}
-
-
-  auto const * get() const {
-    return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get());
-  }
-
-  auto const & operator*() const {
-    return *get();
-  }
+  HeterogeneousSoA(HeterogeneousSoA &&) = default;
+  HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
 
-  auto const * operator->() const {
-    return get();
-  }
+  explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
 
+  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
 
-  auto * get() {
-    return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get());
-  }
+  auto const &operator*() const { return *get(); }
 
-  auto &  operator*() {
-    return *get();
-  }
+  auto const *operator-> () const { return get(); }
 
-  auto * operator->() {
-    return get();
-  }
+  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
 
+  auto &operator*() { return *get(); }
 
+  auto *operator-> () { return get(); }
 
   // in reality valid only for GPU version...
-  cudautils::host::unique_ptr<T>
-  toHostAsync(cuda::stream_t<>& stream) const {
+  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<> &stream) const {
     assert(dm_ptr);
     auto ret = cudautils::make_host_unique<T>(stream);
     cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream.id()));
     return ret;
   }
 
-
-
 private:
   // a union wan't do it, a variant will not be more efficienct
-  cudautils::device::unique_ptr<T> dm_ptr; //!
-  cudautils::host::unique_ptr<T> hm_ptr; //!
-  std::unique_ptr<T> std_ptr; //!
-
+  cudautils::device::unique_ptr<T> dm_ptr;  //!
+  cudautils::host::unique_ptr<T> hm_ptr;    //!
+  std::unique_ptr<T> std_ptr;               //!
 };
 
-
 namespace cudaCompat {
 
   struct GPUTraits {
+    template <typename T>
+    using unique_ptr = cudautils::device::unique_ptr<T>;
 
-    template<typename T>
-    using unique_ptr =  cudautils::device::unique_ptr<T>;
-
-    template<typename T>
-    static auto make_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_unique(cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
-    template<typename T>
-    static auto make_unique(size_t size, cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_unique(size_t size, cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
 
-    template<typename T>
-    static auto make_host_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_host_unique(cuda::stream_t<> &stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
-
-    template<typename T>
-    static auto make_device_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_device_unique(cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
-    template<typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_device_unique(size_t size, cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
-
-
   };
 
-
   struct HostTraits {
+    template <typename T>
+    using unique_ptr = cudautils::host::unique_ptr<T>;
 
-    template<typename T>
-    using unique_ptr =  cudautils::host::unique_ptr<T>;
-
-    template<typename T>
-    static auto make_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_unique(cuda::stream_t<> &stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
-
-    template<typename T>
-    static auto make_host_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_host_unique(cuda::stream_t<> &stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
-
-    template<typename T>
-    static auto make_device_unique(cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_device_unique(cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
-    template<typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &stream)    {
+    template <typename T>
+    static auto make_device_unique(size_t size, cuda::stream_t<> &stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
-
-
   };
 
-
   struct CPUTraits {
+    template <typename T>
+    using unique_ptr = std::unique_ptr<T>;
 
-    template<typename T>
-    using unique_ptr =  std::unique_ptr<T>;
-
-    template<typename T>
-    static auto make_unique(cuda::stream_t<> &)    {
+    template <typename T>
+    static auto make_unique(cuda::stream_t<> &) {
       return std::make_unique<T>();
     }
 
-
-    template<typename T>
-    static auto make_unique(size_t size, cuda::stream_t<> &)    {
+    template <typename T>
+    static auto make_unique(size_t size, cuda::stream_t<> &) {
       return std::make_unique<T>(size);
     }
 
-
-    template<typename T>
-    static auto make_host_unique(cuda::stream_t<> &)    {
+    template <typename T>
+    static auto make_host_unique(cuda::stream_t<> &) {
       return std::make_unique<T>();
     }
 
-
-    template<typename T>
-    static auto make_device_unique(cuda::stream_t<> &)    {
+    template <typename T>
+    static auto make_device_unique(cuda::stream_t<> &) {
       return std::make_unique<T>();
     }
 
-    template<typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &)    {
+    template <typename T>
+    static auto make_device_unique(size_t size, cuda::stream_t<> &) {
       return std::make_unique<T>(size);
     }
-
-
   };
 
-}
-
-
+}  // namespace cudaCompat
 
 // a heterogeneous unique pointer (of a different sort) ...
-template<typename T, typename Traits>
+template <typename T, typename Traits>
 class HeterogeneousSoAImpl {
 public:
+  template <typename V>
+  using unique_ptr = typename Traits::template unique_ptr<V>;
 
-  template<typename V>
-  using unique_ptr = typename Traits:: template unique_ptr<V>;
-
-
-  HeterogeneousSoAImpl() = default; // make root happy
+  HeterogeneousSoAImpl() = default;  // make root happy
   ~HeterogeneousSoAImpl() = default;
-  HeterogeneousSoAImpl(HeterogeneousSoAImpl&&) = default;
-  HeterogeneousSoAImpl& operator=(HeterogeneousSoAImpl&&) = default;
+  HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
+  HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
 
-  explicit HeterogeneousSoAImpl(unique_ptr<T> && p) : m_ptr(std::move(p)) {}
+  explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
   explicit HeterogeneousSoAImpl(cuda::stream_t<> &stream);
 
-  T const * get() const {
-    return m_ptr.get();
-  }
-  
-  T * get() {
-    return m_ptr.get();
-  }
+  T const *get() const { return m_ptr.get(); }
 
+  T *get() { return m_ptr.get(); }
 
-  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<>& stream) const;
+  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<> &stream) const;
 
 private:
-  unique_ptr<T> m_ptr; //!
-
+  unique_ptr<T> m_ptr;  //!
 };
 
-
-
-template<typename T, typename Traits>
-HeterogeneousSoAImpl<T,Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
-  m_ptr = Traits:: template make_unique<T>(stream);
+template <typename T, typename Traits>
+HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
+  m_ptr = Traits::template make_unique<T>(stream);
 }
 
-
 // in reality valid only for GPU version...
-template<typename T, typename Traits>
-cudautils::host::unique_ptr<T>
-HeterogeneousSoAImpl<T,Traits>::toHostAsync(cuda::stream_t<>& stream) const {
+template <typename T, typename Traits>
+cudautils::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cuda::stream_t<> &stream) const {
   auto ret = cudautils::make_host_unique<T>(stream);
   cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream.id()));
   return ret;
 }
 
-
-template<typename T>
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T,cudaCompat::GPUTraits>;
-template<typename T>
-using HeterogeneousSoACPU =  HeterogeneousSoAImpl<T,cudaCompat::CPUTraits>;
-template<typename T>
-using HeterogeneousSoAHost = HeterogeneousSoAImpl<T,cudaCompat::HostTraits>;
-
+template <typename T>
+using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cudaCompat::GPUTraits>;
+template <typename T>
+using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cudaCompat::CPUTraits>;
+template <typename T>
+using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cudaCompat::HostTraits>;
 
 #endif
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index cab50d402cdb5..17ad98ba403a4 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -4,37 +4,26 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 // a heterogeneous unique pointer...
-template<typename T>
+template <typename T>
 class HostProduct {
 public:
-
-
-  HostProduct() = default; // make root happy
+  HostProduct() = default;  // make root happy
   ~HostProduct() = default;
   HostProduct(HostProduct&&) = default;
   HostProduct& operator=(HostProduct&&) = default;
 
-  explicit HostProduct(cudautils::host::unique_ptr<T> && p) : hm_ptr(std::move(p)) {}
-  explicit HostProduct(std::unique_ptr<T> && p) : std_ptr(std::move(p)) {}
+  explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
+  explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
 
+  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
 
-  auto const * get() const {
-    return hm_ptr ? hm_ptr.get() : std_ptr.get();
-  }
-  
-  auto const & operator*() const {
-    return *get();
-  }
+  auto const& operator*() const { return *get(); }
 
-  auto const * operator->() const {
-    return get();
-  }
-  
+  auto const* operator-> () const { return get(); }
 
 private:
-  cudautils::host::unique_ptr<T> hm_ptr; //!
-  std::unique_ptr<T> std_ptr;  //!
-
+  cudautils::host::unique_ptr<T> hm_ptr;  //!
+  std::unique_ptr<T> std_ptr;             //!
 };
 
 #endif

From 5d3f02666c1437b4a33bbe54573d46c1c5c21908 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Sat, 26 Oct 2019 13:57:43 -0500
Subject: [PATCH 05/10] Replace use of API wrapper stream and event with plain
 CUDA, part 1 (#389)

Replace cuda::stream_t<> with cudaStream_t in client code
Replace cuda::event_t with cudaEvent_t in the client code
Clean up BuildFiles
---
 .../Common/interface/HeterogeneousSoA.h       | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index e0e537b9b5c06..9e44bfdf35969 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -35,10 +35,10 @@ class HeterogeneousSoA {
   auto *operator-> () { return get(); }
 
   // in reality valid only for GPU version...
-  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<> &stream) const {
+  cudautils::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
     assert(dm_ptr);
     auto ret = cudautils::make_host_unique<T>(stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
     return ret;
   }
 
@@ -56,27 +56,27 @@ namespace cudaCompat {
     using unique_ptr = cudautils::device::unique_ptr<T>;
 
     template <typename T>
-    static auto make_unique(cuda::stream_t<> &stream) {
+    static auto make_unique(cudaStream_t stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_unique(size_t size, cuda::stream_t<> &stream) {
+    static auto make_unique(size_t size, cudaStream_t stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
 
     template <typename T>
-    static auto make_host_unique(cuda::stream_t<> &stream) {
+    static auto make_host_unique(cudaStream_t stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_device_unique(cuda::stream_t<> &stream) {
+    static auto make_device_unique(cudaStream_t stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &stream) {
+    static auto make_device_unique(size_t size, cudaStream_t stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
   };
@@ -86,22 +86,22 @@ namespace cudaCompat {
     using unique_ptr = cudautils::host::unique_ptr<T>;
 
     template <typename T>
-    static auto make_unique(cuda::stream_t<> &stream) {
+    static auto make_unique(cudaStream_t stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_host_unique(cuda::stream_t<> &stream) {
+    static auto make_host_unique(cudaStream_t stream) {
       return cudautils::make_host_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_device_unique(cuda::stream_t<> &stream) {
+    static auto make_device_unique(cudaStream_t stream) {
       return cudautils::make_device_unique<T>(stream);
     }
 
     template <typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &stream) {
+    static auto make_device_unique(size_t size, cudaStream_t stream) {
       return cudautils::make_device_unique<T>(size, stream);
     }
   };
@@ -111,27 +111,27 @@ namespace cudaCompat {
     using unique_ptr = std::unique_ptr<T>;
 
     template <typename T>
-    static auto make_unique(cuda::stream_t<> &) {
+    static auto make_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
 
     template <typename T>
-    static auto make_unique(size_t size, cuda::stream_t<> &) {
+    static auto make_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
     }
 
     template <typename T>
-    static auto make_host_unique(cuda::stream_t<> &) {
+    static auto make_host_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
 
     template <typename T>
-    static auto make_device_unique(cuda::stream_t<> &) {
+    static auto make_device_unique(cudaStream_t) {
       return std::make_unique<T>();
     }
 
     template <typename T>
-    static auto make_device_unique(size_t size, cuda::stream_t<> &) {
+    static auto make_device_unique(size_t size, cudaStream_t) {
       return std::make_unique<T>(size);
     }
   };
@@ -151,28 +151,28 @@ class HeterogeneousSoAImpl {
   HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
 
   explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cuda::stream_t<> &stream);
+  explicit HeterogeneousSoAImpl(cudaStream_t stream);
 
   T const *get() const { return m_ptr.get(); }
 
   T *get() { return m_ptr.get(); }
 
-  cudautils::host::unique_ptr<T> toHostAsync(cuda::stream_t<> &stream) const;
+  cudautils::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
 
 private:
   unique_ptr<T> m_ptr;  //!
 };
 
 template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cuda::stream_t<> &stream) {
+HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
   m_ptr = Traits::template make_unique<T>(stream);
 }
 
 // in reality valid only for GPU version...
 template <typename T, typename Traits>
-cudautils::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cuda::stream_t<> &stream) const {
+cudautils::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
   auto ret = cudautils::make_host_unique<T>(stream);
-  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream.id()));
+  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
   return ret;
 }
 

From 41ef278585cfb8b4fa9b4d7c12d8844d2c85021e Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 27 Nov 2019 15:17:05 +0100
Subject: [PATCH 06/10] Drop obsolete heterogenous framework (#416)

---
 CUDADataFormats/Common/interface/HeterogeneousSoA.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 9e44bfdf35969..907b7647a3452 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -1,11 +1,12 @@
 #ifndef CUDADataFormatsCommonHeterogeneousSoA_H
 #define CUDADataFormatsCommonHeterogeneousSoA_H
 
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include <cassert>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 // a heterogeneous unique pointer...
 template <typename T>

From 62b09c0611da3328e2c0799b8393cf2651da8f07 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 17 Jan 2020 09:10:53 -0600
Subject: [PATCH 07/10] Implement changes from the CUDA framework review (#429)

Rename the cudautils namespace to cms::cuda or cms::cudatest, and drop the CUDA prefix from the symbols defined there.

Always record and query the CUDA event, to minimize need for error checking in CUDAScopedContextProduce destructor.

Add comments to highlight the pieces in CachingDeviceAllocator that have been changed wrt. cub.

Various other updates and clean up:
  - enable CUDA for compute capability 3.5.
  - clean up CUDAService, CUDA tests and plugins.
  - add CUDA existence protections to BuildFiles.
  - mark thread-safe static variables with CMS_THREAD_SAFE.
---
 .../Common/interface/HeterogeneousSoA.h       | 40 +++++++++----------
 .../Common/interface/HostProduct.h            |  4 +-
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 907b7647a3452..6fec0026dfaa1 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -19,8 +19,8 @@ class HeterogeneousSoA {
   HeterogeneousSoA(HeterogeneousSoA &&) = default;
   HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
 
-  explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cms::cuda::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
+  explicit HeterogeneousSoA(cms::cuda::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
   explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
 
   auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
@@ -36,17 +36,17 @@ class HeterogeneousSoA {
   auto *operator-> () { return get(); }
 
   // in reality valid only for GPU version...
-  cudautils::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
+  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
     assert(dm_ptr);
-    auto ret = cudautils::make_host_unique<T>(stream);
+    auto ret = cms::cuda::make_host_unique<T>(stream);
     cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
     return ret;
   }
 
 private:
   // a union wan't do it, a variant will not be more efficienct
-  cudautils::device::unique_ptr<T> dm_ptr;  //!
-  cudautils::host::unique_ptr<T> hm_ptr;    //!
+  cms::cuda::device::unique_ptr<T> dm_ptr;  //!
+  cms::cuda::host::unique_ptr<T> hm_ptr;    //!
   std::unique_ptr<T> std_ptr;               //!
 };
 
@@ -54,56 +54,56 @@ namespace cudaCompat {
 
   struct GPUTraits {
     template <typename T>
-    using unique_ptr = cudautils::device::unique_ptr<T>;
+    using unique_ptr = cms::cuda::device::unique_ptr<T>;
 
     template <typename T>
     static auto make_unique(cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(stream);
+      return cms::cuda::make_device_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_unique(size_t size, cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(size, stream);
+      return cms::cuda::make_device_unique<T>(size, stream);
     }
 
     template <typename T>
     static auto make_host_unique(cudaStream_t stream) {
-      return cudautils::make_host_unique<T>(stream);
+      return cms::cuda::make_host_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_device_unique(cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(stream);
+      return cms::cuda::make_device_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(size, stream);
+      return cms::cuda::make_device_unique<T>(size, stream);
     }
   };
 
   struct HostTraits {
     template <typename T>
-    using unique_ptr = cudautils::host::unique_ptr<T>;
+    using unique_ptr = cms::cuda::host::unique_ptr<T>;
 
     template <typename T>
     static auto make_unique(cudaStream_t stream) {
-      return cudautils::make_host_unique<T>(stream);
+      return cms::cuda::make_host_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_host_unique(cudaStream_t stream) {
-      return cudautils::make_host_unique<T>(stream);
+      return cms::cuda::make_host_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_device_unique(cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(stream);
+      return cms::cuda::make_device_unique<T>(stream);
     }
 
     template <typename T>
     static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cudautils::make_device_unique<T>(size, stream);
+      return cms::cuda::make_device_unique<T>(size, stream);
     }
   };
 
@@ -158,7 +158,7 @@ class HeterogeneousSoAImpl {
 
   T *get() { return m_ptr.get(); }
 
-  cudautils::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
 
 private:
   unique_ptr<T> m_ptr;  //!
@@ -171,8 +171,8 @@ HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
 
 // in reality valid only for GPU version...
 template <typename T, typename Traits>
-cudautils::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
-  auto ret = cudautils::make_host_unique<T>(stream);
+cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<T>(stream);
   cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
   return ret;
 }
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index 17ad98ba403a4..aefd7b61f9781 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -12,7 +12,7 @@ class HostProduct {
   HostProduct(HostProduct&&) = default;
   HostProduct& operator=(HostProduct&&) = default;
 
-  explicit HostProduct(cudautils::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
+  explicit HostProduct(cms::cuda::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
   explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
 
   auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
@@ -22,7 +22,7 @@ class HostProduct {
   auto const* operator-> () const { return get(); }
 
 private:
-  cudautils::host::unique_ptr<T> hm_ptr;  //!
+  cms::cuda::host::unique_ptr<T> hm_ptr;  //!
   std::unique_ptr<T> std_ptr;             //!
 };
 

From f9e4e0fbd908338103bd4b04f982613ca3ebdeea Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 25 Mar 2020 00:28:04 +0100
Subject: [PATCH 08/10] Integrate the comments from the upstream PRs (#442)

Clean up the Patatrack code base following the comments received during the integration into the upstream release.

Currently tracks the changes introduced due to
   - cms-sw#29109: Patatrack integration - trivial changes (1/N)
   - cms-sw#29110: Patatrack integration - common tools (2/N)

List of changes:
 * Remove unused files
 * Fix compilation warnings
 * Fix AtomicPairCounter unit test
 * Rename the cudaCompat namespace to cms::cudacompat
 * Remove extra semicolon
 * Move SimpleVector and VecArray to the cms::cuda namespace
 * Add missing dependency
 * Move HistoContainer, AtomicPairCounter, prefixScan and radixSort to the cms::cuda namespace
 * Remove rule exception for HeterogeneousCore
 * Fix code rule violations:
    - replace using namespace cms::cuda in test/OneToManyAssoc_t.h .
    - add an exception for cudaCompat.h:
      cudaCompat relies on defining equivalent symbols to the CUDA
      intrinsics in the cms::cudacompat namespace, and pulling them in the
      global namespace when compiling device code without CUDA.
* Protect the headers to compile only with a CUDA compiler
---
 .../Common/interface/HeterogeneousSoA.h       | 184 +++++++++---------
 1 file changed, 93 insertions(+), 91 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 6fec0026dfaa1..46a21510d0520 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -50,94 +50,96 @@ class HeterogeneousSoA {
   std::unique_ptr<T> std_ptr;               //!
 };
 
-namespace cudaCompat {
-
-  struct GPUTraits {
-    template <typename T>
-    using unique_ptr = cms::cuda::device::unique_ptr<T>;
-
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
-    }
-
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
-    }
-  };
-
-  struct HostTraits {
-    template <typename T>
-    using unique_ptr = cms::cuda::host::unique_ptr<T>;
-
-    template <typename T>
-    static auto make_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_host_unique(cudaStream_t stream) {
-      return cms::cuda::make_host_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(stream);
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t stream) {
-      return cms::cuda::make_device_unique<T>(size, stream);
-    }
-  };
-
-  struct CPUTraits {
-    template <typename T>
-    using unique_ptr = std::unique_ptr<T>;
-
-    template <typename T>
-    static auto make_unique(cudaStream_t) {
-      return std::make_unique<T>();
-    }
-
-    template <typename T>
-    static auto make_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
-    }
-
-    template <typename T>
-    static auto make_host_unique(cudaStream_t) {
-      return std::make_unique<T>();
-    }
-
-    template <typename T>
-    static auto make_device_unique(cudaStream_t) {
-      return std::make_unique<T>();
-    }
-
-    template <typename T>
-    static auto make_device_unique(size_t size, cudaStream_t) {
-      return std::make_unique<T>(size);
-    }
-  };
-
-}  // namespace cudaCompat
+namespace cms {
+  namespace cudacompat {
+
+    struct GPUTraits {
+      template <typename T>
+      using unique_ptr = cms::cuda::device::unique_ptr<T>;
+
+      template <typename T>
+      static auto make_unique(cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_unique(size_t size, cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(size, stream);
+      }
+
+      template <typename T>
+      static auto make_host_unique(cudaStream_t stream) {
+        return cms::cuda::make_host_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_device_unique(cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_device_unique(size_t size, cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(size, stream);
+      }
+    };
+
+    struct HostTraits {
+      template <typename T>
+      using unique_ptr = cms::cuda::host::unique_ptr<T>;
+
+      template <typename T>
+      static auto make_unique(cudaStream_t stream) {
+        return cms::cuda::make_host_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_host_unique(cudaStream_t stream) {
+        return cms::cuda::make_host_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_device_unique(cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(stream);
+      }
+
+      template <typename T>
+      static auto make_device_unique(size_t size, cudaStream_t stream) {
+        return cms::cuda::make_device_unique<T>(size, stream);
+      }
+    };
+
+    struct CPUTraits {
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T>;
+
+      template <typename T>
+      static auto make_unique(cudaStream_t) {
+        return std::make_unique<T>();
+      }
+
+      template <typename T>
+      static auto make_unique(size_t size, cudaStream_t) {
+        return std::make_unique<T>(size);
+      }
+
+      template <typename T>
+      static auto make_host_unique(cudaStream_t) {
+        return std::make_unique<T>();
+      }
+
+      template <typename T>
+      static auto make_device_unique(cudaStream_t) {
+        return std::make_unique<T>();
+      }
+
+      template <typename T>
+      static auto make_device_unique(size_t size, cudaStream_t) {
+        return std::make_unique<T>(size);
+      }
+    };
+
+  }  // namespace cudacompat
+}  // namespace cms
 
 // a heterogeneous unique pointer (of a different sort) ...
 template <typename T, typename Traits>
@@ -178,10 +180,10 @@ cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cuda
 }
 
 template <typename T>
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cudaCompat::GPUTraits>;
+using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cms::cudacompat::GPUTraits>;
 template <typename T>
-using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cudaCompat::CPUTraits>;
+using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cms::cudacompat::CPUTraits>;
 template <typename T>
-using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cudaCompat::HostTraits>;
+using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cms::cudacompat::HostTraits>;
 
 #endif

From 88a1751855bca00bf93e9f44fd34d4d49e38be76 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jun 2020 08:54:56 +0200
Subject: [PATCH 09/10] Update HCAL local reconstruction on GPUs (#470)

Move common ROOT dictionaries to a dedicated new package,
CUDADataFormats/StdDictionaries .

Remove unnecessary dictionary declarations.

Determine the default module label automatically for templated and
non-templated EDProducers and ESProducer, and remove the "name()"
static method previously used to distinguish their template arguments.

Use Event::emplace instead of Event:put where relevant.

Protect the use of CUDA API calls in module constructors and
destructors, checking that the CUDAService is available and enabled.

Move the definition of EventSetup records to a package/library that does
not define plugins.
---
 CUDADataFormats/StdDictionaries/BuildFile.xml      |  5 +++++
 CUDADataFormats/StdDictionaries/src/classes.h      |  4 ++++
 .../StdDictionaries/src/classes_def.xml            | 14 ++++++++++++++
 3 files changed, 23 insertions(+)
 create mode 100644 CUDADataFormats/StdDictionaries/BuildFile.xml
 create mode 100644 CUDADataFormats/StdDictionaries/src/classes.h
 create mode 100644 CUDADataFormats/StdDictionaries/src/classes_def.xml

diff --git a/CUDADataFormats/StdDictionaries/BuildFile.xml b/CUDADataFormats/StdDictionaries/BuildFile.xml
new file mode 100644
index 0000000000000..0a1542b3b05c6
--- /dev/null
+++ b/CUDADataFormats/StdDictionaries/BuildFile.xml
@@ -0,0 +1,5 @@
+<use name="rootcore"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<export>
+  <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/StdDictionaries/src/classes.h b/CUDADataFormats/StdDictionaries/src/classes.h
new file mode 100644
index 0000000000000..9c4e1d810c3c2
--- /dev/null
+++ b/CUDADataFormats/StdDictionaries/src/classes.h
@@ -0,0 +1,4 @@
+#include <cstddef>
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
diff --git a/CUDADataFormats/StdDictionaries/src/classes_def.xml b/CUDADataFormats/StdDictionaries/src/classes_def.xml
new file mode 100644
index 0000000000000..7060ec91efa1a
--- /dev/null
+++ b/CUDADataFormats/StdDictionaries/src/classes_def.xml
@@ -0,0 +1,14 @@
+<lcgdict>
+    <class name="std::vector<std::byte, cms::cuda::HostAllocator<std::byte, 0>>" />
+
+    <class name="std::vector<uint8_t, cms::cuda::HostAllocator<uint8_t, 0>>" />
+    <class name="std::vector<uint16_t, cms::cuda::HostAllocator<uint16_t, 0>>" />
+    <class name="std::vector<uint32_t, cms::cuda::HostAllocator<uint32_t, 0>>" />
+
+    <class name="std::vector<int8_t, cms::cuda::HostAllocator<int8_t, 0>>" />
+    <class name="std::vector<int16_t, cms::cuda::HostAllocator<int16_t, 0>>" />
+    <class name="std::vector<int32_t, cms::cuda::HostAllocator<int32_t, 0>>" />
+
+    <class name="std::vector<float, cms::cuda::HostAllocator<float, 0>>" />
+    <class name="std::vector<double, cms::cuda::HostAllocator<double, 0>>" />
+</lcgdict>

From 1fa9b32046741778e071db517e071a08cf750c2c Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 31 Jul 2020 11:29:39 +0200
Subject: [PATCH 10/10] Apply code formatting (#526)

---
 CUDADataFormats/Common/interface/HeterogeneousSoA.h | 4 ++--
 CUDADataFormats/Common/interface/HostProduct.h      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
index 46a21510d0520..3f2a551bc320f 100644
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
@@ -27,13 +27,13 @@ class HeterogeneousSoA {
 
   auto const &operator*() const { return *get(); }
 
-  auto const *operator-> () const { return get(); }
+  auto const *operator->() const { return get(); }
 
   auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
 
   auto &operator*() { return *get(); }
 
-  auto *operator-> () { return get(); }
+  auto *operator->() { return get(); }
 
   // in reality valid only for GPU version...
   cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index aefd7b61f9781..63a152298e42b 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -19,7 +19,7 @@ class HostProduct {
 
   auto const& operator*() const { return *get(); }
 
-  auto const* operator-> () const { return get(); }
+  auto const* operator->() const { return get(); }
 
 private:
   cms::cuda::host::unique_ptr<T> hm_ptr;  //!