Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

Expand All @@ -21,15 +22,15 @@ class HeterogeneousSoA {

explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto const &operator*() const { return *get(); }

auto const *operator-> () const { return get(); }

auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto &operator*() { return *get(); }

Expand All @@ -47,12 +48,14 @@ class HeterogeneousSoA {
// a union won't do it, a variant will not be more efficient
cudautils::device::unique_ptr<T> dm_ptr; //!
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

namespace cudaCompat {

struct GPUTraits {
static constexpr const char * name = "GPU";

template <typename T>
using unique_ptr = cudautils::device::unique_ptr<T>;

Expand Down Expand Up @@ -83,6 +86,8 @@ namespace cudaCompat {
};

struct HostTraits {
static constexpr const char * name = "HOST";

template <typename T>
using unique_ptr = cudautils::host::unique_ptr<T>;

Expand All @@ -108,32 +113,45 @@ namespace cudaCompat {
};

struct CPUTraits {
static constexpr const char * name = "CPU";

template <typename T>
using unique_ptr = std::unique_ptr<T>;
using unique_ptr = cudautils::cpu::unique_ptr<T>;;


template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique() {
return cudautils::make_cpu_unique<T>(0);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_unique(size_t size) {
return cudautils::make_cpu_unique<T>(size,0);
}

template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_host_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}
};

Expand Down
86 changes: 86 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h

#include <memory>
#include <functional>

#include <cstdlib>
#include <cuda_runtime.h>
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from a look at the file, I think #include <cuda_runtime.h> could be removed ?


namespace cudautils {
namespace cpu {
namespace impl {
// Additional layer of types to distinguish from device:: and host::unique_ptr
// Deleter for memory obtained with ::malloc(); exists as a distinct type so
// cpu::unique_ptr cannot be confused with the device:: and host:: flavours.
// It only frees the storage — no destructor is run (allocation is restricted
// to trivially-constructible types by make_cpu_unique).
class CPUDeleter {
public:
  CPUDeleter() = default;

  void operator()(void *ptr) { ::free(ptr); }
};
} // namespace impl

// cpu::unique_ptr<T>: a std::unique_ptr whose deleter releases the storage
// with ::free(), matching the ::malloc() done by make_cpu_unique().
template <typename T>
using unique_ptr = std::unique_ptr<T, impl::CPUDeleter>;

namespace impl {
// Return-type selector for make_cpu_unique (same idiom as std::make_unique):
// scalar T exposes `non_array`, T[] exposes `unbounded_array`, and T[N]
// exposes only an empty `bounded_array` tag so the bounded-array form can
// only match the deleted overloads below.
template <typename T>
struct make_cpu_unique_selector {
using non_array = cudautils::cpu::unique_ptr<T>;
};
// Unbounded array: make_cpu_unique<T[]>(n, stream).
template <typename T>
struct make_cpu_unique_selector<T[]> {
using unbounded_array = cudautils::cpu::unique_ptr<T[]>;
};
// Bounded arrays T[N] are intentionally unsupported.
template <typename T, size_t N>
struct make_cpu_unique_selector<T[N]> {
struct bounded_array {};
};
} // namespace impl
} // namespace cpu

// Allocate a single T with ::malloc() and wrap it in a cpu::unique_ptr.
// The memory is NOT initialized (malloc, no constructor call), hence the
// restriction to trivially-constructible types; the cudaStream_t parameter
// is ignored and exists only for signature symmetry with the device/host
// allocators. NOTE(review): the original span was interrupted by pasted
// review-page text; body reconstructed. malloc failure is not checked —
// presumably consistent with the other allocators; confirm.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique(cudaStream_t) {
  static_assert(std::is_trivially_constructible<T>::value,
                "Allocating with non-trivial constructor on the cpu memory is not supported");
  void *mem = ::malloc(sizeof(T));
  return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
                                                                    cpu::impl::CPUDeleter()};
}

// Allocate an uninitialized array of n elements with ::malloc() and return
// it as a cpu::unique_ptr<T[]>; only trivially-constructible element types
// are accepted since no constructors are run. The cudaStream_t is unused.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique(size_t n, cudaStream_t) {
  using element_type = typename std::remove_extent<T>::type;
  static_assert(std::is_trivially_constructible<element_type>::value,
                "Allocating with non-trivial constructor on the cpu memory is not supported");
  auto *storage = reinterpret_cast<element_type *>(::malloc(n * sizeof(element_type)));
  return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{storage, cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique(Args &&...) = delete;

// Same as make_cpu_unique but without the trivial-constructor static_assert:
// the "_uninitialized" suffix makes the raw, unconstructed-memory semantics
// explicit at the call site. The cudaStream_t is unused.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique_uninitialized(cudaStream_t) {
  auto *storage = reinterpret_cast<T *>(::malloc(sizeof(T)));
  return typename cpu::impl::make_cpu_unique_selector<T>::non_array{storage, cpu::impl::CPUDeleter()};
}

// Array counterpart of make_cpu_unique_uninitialized: n elements of raw
// ::malloc storage, no constructor calls and no triviality check.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) {
  using element_type = typename std::remove_extent<T>::type;
  auto *storage = reinterpret_cast<element_type *>(::malloc(n * sizeof(element_type)));
  return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{storage, cpu::impl::CPUDeleter()};
}

// As with make_cpu_unique, bounded arrays (T[N]) are rejected at compile
// time via this deleted overload.
template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique_uninitialized(Args &&...) =
    delete;
} // namespace cudautils

#endif
6 changes: 5 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
*/

#ifndef __CUDACC__
#define CUDA_KERNELS_ON_CPU
#endif

#ifdef CUDA_KERNELS_ON_CPU

#include <algorithm>
#include <cstdint>
Expand Down Expand Up @@ -98,6 +102,6 @@ namespace cudaCompat {
using namespace cudaCompat;
#endif

#endif
#endif // CUDA_KERNELS_ON_CPU

#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
12 changes: 10 additions & 2 deletions HeterogeneousCore/CUDAUtilities/interface/launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,14 @@ namespace cudautils {
} // namespace detail

// wrappers for cudaLaunchKernel

inline
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add the inline because it makes sense on its own

void launch(void (*kernel)(), LaunchParameters config) {
#ifdef CUDA_KERNELS_ON_CPU
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but I really, really do not want to add a dependency on #ifdefs etc. here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will find a less intrusive solution

kernel();
#else
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
#endif
}

template <typename F, typename... Args>
Expand All @@ -107,6 +111,9 @@ namespace cudautils {
std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
#endif
launch(F* kernel, LaunchParameters config, Args&&... args) {
#ifdef CUDA_KERNELS_ON_CPU
kernel(args...);
#else
using function_type = detail::kernel_traits<F>;
typename function_type::argument_type_tuple args_copy(args...);

Expand All @@ -116,10 +123,11 @@ namespace cudautils {
detail::pointer_setter<size>()(pointers, args_copy);
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
#endif
}

// wrappers for cudaLaunchCooperativeKernel

inline
void launch_cooperative(void (*kernel)(), LaunchParameters config) {
cudaCheck(cudaLaunchCooperativeKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
Expand Down
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="gpuLaunch_t">
</bin>

<bin file="Launch_t.cpp" name="cpuLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU"/>
</bin>

<bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector">
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>
Expand Down
31 changes: 31 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"



#include <cuda_runtime.h>
#include<cstdio>

#undef __global__
#define __global__ inline __attribute__((always_inline))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is needed to avoid multiple definition of the same symbol

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alternative is to have also the c++ definition in its own .cc (not inlined)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this sample case, wouldn't it be enough to have everything in the .cu file ?

Copy link

@fwyzard fwyzard Dec 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, of course that does not work...

What we are trying to do for Cupla and Alpaka is to have the whole implementation in something like test/implement/Launch_t.cc, and then let scram build two versions by having

test/Launch_t.cpp

#define CUDA_KERNELS_ON_CPU
#include "implement/Launch_t.cc"

test/Launch_t.cu

#include "implement/Launch_t.cc"

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the idea of the test is to have a single file compiled by gcc twice (see buildfile)
to test that indeed we can launch kernels from gcc and that the same code will run instead on cpu if CUDA_KERNELS_ON_CPU is defined (in this case as a compiler option).
of course for cuda we need the additional .cu file to compile the device code.
For symmetry one can claim that cpu kernels should be compiled in their on cc (as at the end I do in the vertex producer together with a minimal driver).
Still for cpu the code must be forced inlined to avoid multiple symbols.

so in my opinion (at least with this model)

#define __global__ inline __attribute__((always_inline))

in case of cpu code will be required (and apparently does not harm cuda code).
This is done in cudaCompact.h. I tried to keep this specific test as self-included as possible.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and that is my previous standard: one .h, one .cc one .cu.
here I tried one .cc and one .cu the latter with ONLY the kernel, no driver code.
I want to test launching from code compiled with gcc (having in mind that both cpu and gpu code shall resides in the same "load units", which is not the case in this test I agree).
I can build two tests (or three) to see what is needed to have both gpu and cpu code compiled, loaded and then run in the same executable (with eventually the driver code compiled by gcc even for the gpu case).



// Minimal kernel exercised by both the CPU and GPU launch tests. In the
// CPU build, __global__ is redefined above to `inline always_inline` so this
// header-style definition does not produce duplicate symbols at link time
// (see the redefinition at the top of this file).
__global__
void hello(float k) {

printf("hello %f\n",k);

}




// Driver: launches `hello` once through cudautils::launch. With
// CUDA_KERNELS_ON_CPU defined, launch() calls the kernel directly as a plain
// function; otherwise it goes through cudaLaunchKernel (see launch.h).
int main() {

// NOTE(review): presumably skips/aborts when no CUDA device is available in
// the GPU build — confirm behavior of requireCUDADevices in the CPU build.
requireCUDADevices();

// 1x1 grid/block configuration; 3.14 is the kernel argument.
cudautils::launch(hello,{1, 1},3.14);
cudaDeviceSynchronize();

return 0;
}
10 changes: 10 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/Launch_t.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include<cstdio>

// CUDA-compiled definition of the same `hello` kernel used by Launch_t.cpp:
// this .cu file carries only the device code, with no driver/main.
__global__
void hello(float k) {

printf("hello %f\n",k);

}


31 changes: 31 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "catch.hpp"

#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

// Unit tests for cudautils::make_cpu_unique / cpu::unique_ptr.
TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") {
  SECTION("Single element") {
    auto ptr = cudautils::make_cpu_unique<int>(0);
    REQUIRE(ptr != nullptr);
  }

  SECTION("Reset") {
    auto ptr = cudautils::make_cpu_unique<int>(0);
    REQUIRE(ptr != nullptr);

    ptr.reset();
    REQUIRE(ptr.get() == nullptr);
  }

  SECTION("Multiple elements") {
    // Fixed: was make_host_unique, which is not declared by the header under
    // test (cpu_unique_ptr.h) and belongs to the host allocator instead.
    auto ptr = cudautils::make_cpu_unique<int[]>(10, 0);
    REQUIRE(ptr != nullptr);
  }

  SECTION("Large allocation and release") {
    constexpr size_t maxSize = 1 << 30;  // 2^30 chars, i.e. 1 GiB
    auto ptr = cudautils::make_cpu_unique<char[]>(maxSize + 1, 0);
    REQUIRE(ptr != nullptr);
    ptr.reset();
    // Fixed: the original asserted ptr != nullptr AFTER reset(), which can
    // never hold — a reset unique_ptr is null by definition.
    REQUIRE(ptr.get() == nullptr);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// in principle we can use "nhits" to heuristically dimension the workspace...
// overkill to use template here (std::make_unique would suffice)
// device_isOuterHitOfCell_ = Traits:: template make_unique<GPUCACell::OuterHitOfCell[]>(cs, std::max(1U,nhits), stream);
device_isOuterHitOfCell_.reset(
(GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
device_isOuterHitOfCell_ = Traits:: template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U,nhits),stream);
assert(device_isOuterHitOfCell_.get());
gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
nhits,
Expand All @@ -31,8 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellTracks_,
device_theCellTracksContainer_.get());

// device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_));
device_theCells_ = Traits:: template make_unique<GPUCACell[]>(m_params.maxNumberOfDoublets_,stream);
if (0 == nhits)
return; // protect against empty events

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
}

PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique<pixelTrack::TrackSoA>(0));

auto* soa = tracks.get();
assert(soa);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent

assert(tracks);

ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin));
ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make<cudaCompat::GPUTraits>(ctx.stream(), tracks, m_ptMin,m_OnGPU));

} else {
auto const* tracks = iEvent.get(tokenCPUTrack_).get();
Expand All @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent
std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
*/

iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin));
cudaStream_t stream=0;
iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make<cudaCompat::CPUTraits>(stream,tracks, m_ptMin,m_OnGPU));
}
}

Expand Down
Loading