Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CUDADataFormats/Common/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<!-- This package is built only when the toolchain provides CUDA support
     for the compiler in use (the "cuda-gcc-support" tool). -->
<iftool name="cuda-gcc-support">
  <use name="HeterogeneousCore/CUDAUtilities"/>

  <export>
    <!-- NOTE(review): name="1" appears to be the SCRAM shorthand for the
         default, package-named library - confirm against SCRAM conventions. -->
    <lib name="1"/>
  </export>
</iftool>
60 changes: 60 additions & 0 deletions CUDADataFormats/Common/interface/Product.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#ifndef CUDADataFormats_Common_Product_h
#define CUDADataFormats_Common_Product_h

#include <memory>

#include "CUDADataFormats/Common/interface/ProductBase.h"

namespace edm {
  template <typename T>
  class Wrapper;
}

namespace cms {
  namespace cuda {
    namespace impl {
      class ScopedContextGetterBase;
    }

    /**
     * The purpose of this class is to wrap CUDA data to edm::Event in a
     * way which forces correct use of various utilities.
     *
     * The non-default construction has to be done with cms::cuda::ScopedContext
     * (in order to properly register the CUDA event).
     *
     * The default constructor is needed only for the ROOT dictionary generation.
     *
     * The CUDA event is in practice needed only for stream-stream
     * synchronization, but someone with long-enough lifetime has to own
     * it. Here is a somewhat natural place. If overhead is too much, we
     * can use them only where synchronization between streams is needed.
     */
    template <typename T>
    class Product : public ProductBase {
    public:
      Product() = default;  // Needed only for ROOT dictionary generation

      // The product is move-only (like ProductBase): copying is disabled.
      Product(const Product&) = delete;
      Product& operator=(const Product&) = delete;
      Product(Product&&) = default;
      Product& operator=(Product&&) = default;

    private:
      // Only the ScopedContext machinery and the edm/ROOT wrapper may
      // construct a non-default Product or reach the payload.
      friend class impl::ScopedContextGetterBase;
      friend class ScopedContextProduce;
      friend class edm::Wrapper<Product<T>>;

      // Construct from an already-made payload; the stream and event are
      // handed over (moved) from the producing context.
      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
          : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}

      // Construct the payload in place by forwarding the trailing arguments
      // to T's constructor.
      template <typename... Args>
      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
          : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}

      // The wrapped device data ("//!" marks the member transient for ROOT).
      T data_;  //!
    };
  }  // namespace cuda
}  // namespace cms

#endif
93 changes: 93 additions & 0 deletions CUDADataFormats/Common/interface/ProductBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#ifndef CUDADataFormats_Common_ProductBase_h
#define CUDADataFormats_Common_ProductBase_h

#include <atomic>
#include <memory>

#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"

namespace cms {
  namespace cuda {
    namespace impl {
      class ScopedContextBase;
    }

    /**
     * Base class for all instantiations of CUDA<T> to hold the
     * non-T-dependent members.
     */
    class ProductBase {
    public:
      ProductBase() = default;  // Needed only for ROOT dictionary generation
      // Synchronizes on event_ (if any) before destruction; see ProductBase.cc.
      ~ProductBase();

      // Copying is disabled; the custom move operations are needed because
      // std::atomic is neither copyable nor movable, so the flag value is
      // transferred explicitly with load().
      ProductBase(const ProductBase&) = delete;
      ProductBase& operator=(const ProductBase&) = delete;
      ProductBase(ProductBase&& other)
          : stream_{std::move(other.stream_)},
            event_{std::move(other.event_)},
            mayReuseStream_{other.mayReuseStream_.load()},
            device_{other.device_} {}
      ProductBase& operator=(ProductBase&& other) {
        stream_ = std::move(other.stream_);
        event_ = std::move(other.event_);
        mayReuseStream_ = other.mayReuseStream_.load();
        device_ = other.device_;
        return *this;
      }

      // A default-constructed (or moved-from) product has a null stream
      // and is therefore not valid.
      bool isValid() const { return stream_.get() != nullptr; }
      // True once the work recorded in event_ has completed (ProductBase.cc).
      bool isAvailable() const;

      // The CUDA device the product is associated with (-1 when
      // default-constructed).
      int device() const { return device_; }

      // cudaStream_t is a pointer to a thread-safe object, for which a
      // mutable access is needed even if the cms::cuda::ScopedContext itself
      // would be const. Therefore it is ok to return a non-const
      // pointer from a const method here.
      cudaStream_t stream() const { return stream_.get(); }

      // cudaEvent_t is a pointer to a thread-safe object, for which a
      // mutable access is needed even if the cms::cuda::ScopedContext itself
      // would be const. Therefore it is ok to return a non-const
      // pointer from a const method here.
      cudaEvent_t event() const { return event_.get(); }

    protected:
      explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
          : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}

    private:
      friend class impl::ScopedContextBase;
      friend class ScopedContextProduce;

      // The following function is intended to be used only from ScopedContext
      const SharedStreamPtr& streamPtr() const { return stream_; }

      // Atomically flips mayReuseStream_ from true to false; exactly one
      // caller can win the exchange and hence reuse the stream.
      bool mayReuseStream() const {
        bool expected = true;
        bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
        // If the current thread is the one flipping the flag, it may
        // reuse the stream.
        return changed;
      }

      // The cudaStream_t is really shared among edm::Event products, so
      // using shared_ptr also here ("//!" marks members transient for ROOT)
      SharedStreamPtr stream_;  //!
      // shared_ptr because of caching in cms::cuda::EventCache
      SharedEventPtr event_;  //!

      // This flag tells whether the CUDA stream may be reused by a
      // consumer or not. The goal is to have a "chain" of modules to
      // queue their work to the same stream.
      mutable std::atomic<bool> mayReuseStream_ = true;  //!

      // The CUDA device associated with this product
      int device_ = -1;  //!
    };
  }  // namespace cuda
}  // namespace cms

#endif
29 changes: 29 additions & 0 deletions CUDADataFormats/Common/src/ProductBase.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "CUDADataFormats/Common/interface/ProductBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"

namespace cms::cuda {
// Returns true once the asynchronous work recorded in event_ has completed.
// A default-constructed product has no event and is never available.
bool ProductBase::isAvailable() const {
  return event_ and eventWorkHasCompleted(event_.get());
}

ProductBase::~ProductBase() {
  // Make sure that the production of the product in the GPU is
  // complete before destructing the product. This is to make sure
  // that the EDM stream does not move to the next event before all
  // asynchronous processing of the current is complete.

  // TODO: a callback notifying a WaitingTaskHolder (or similar)
  // would avoid blocking the CPU, but would also require more work.

  // Nothing to wait for if the product was default-constructed or moved from.
  if (not event_) {
    return;
  }
  // Intentionally not checking the return value to avoid throwing
  // exceptions. If this call would fail, we should get failures
  // elsewhere as well.
  cudaEventSynchronize(event_.get());
}
} // namespace cms::cuda
7 changes: 7 additions & 0 deletions CUDADataFormats/Common/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<!-- Unit tests are built only when the toolchain provides CUDA support. -->
<iftool name="cuda-gcc-support">
  <!-- All test*.cc files in this directory are compiled into one test binary. -->
  <bin file="test*.cc" name="testCUDADataFormatsCommon">
    <use name="HeterogeneousCore/CUDACore"/>
    <use name="catch2"/>
    <use name="cuda"/>
  </bin>
</iftool>
68 changes: 68 additions & 0 deletions CUDADataFormats/Common/test/test_Product.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include "catch.hpp"

#include "CUDADataFormats/Common/interface/Product.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"

#include <cuda_runtime_api.h>

namespace cms::cudatest {
  // Test-only factory for cms::cuda::ScopedContextProduce: builds the context
  // directly from a device number, a cached stream and, optionally, a cached
  // event, bypassing the normal framework path.
  class TestScopedContext {
  public:
    static cuda::ScopedContextProduce make(int dev, bool createEvent) {
      // Take an event from the cache only when requested; otherwise hand an
      // empty SharedEventPtr to the context.
      auto event = createEvent ? cms::cuda::getEventCache().get() : cms::cuda::SharedEventPtr{};
      return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
    }
  };
}  // namespace cms::cudatest

// Catch2 re-runs the whole TEST_CASE body once per leaf SECTION, so the
// setup code between sections executes again for each of them.
TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") {
  // No GPU needed here: a default-constructed Product is simply invalid.
  SECTION("Default constructed") {
    auto foo = cms::cuda::Product<int>();
    REQUIRE(!foo.isValid());

    // Product is move-only; moving an invalid product must also work.
    auto bar = std::move(foo);
  }

  // The remaining sections need a CUDA device; bail out gracefully when
  // none is available.
  if (not cms::cudatest::testDevices()) {
    return;
  }

  constexpr int defaultDevice = 0;
  cudaCheck(cudaSetDevice(defaultDevice));
  {
    // Produce a Product<int> through a (test-only) ScopedContextProduce.
    auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
    std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
    auto& data = *dataPtr;

    // A context-made product carries the context's device, stream and event.
    SECTION("Construct from cms::cuda::ScopedContext") {
      REQUIRE(data.isValid());
      REQUIRE(data.device() == defaultDevice);
      REQUIRE(data.stream() == ctx.stream());
      REQUIRE(data.event() != nullptr);
    }

    // Moving must transfer validity: the source becomes invalid.
    SECTION("Move constructor") {
      auto data2 = cms::cuda::Product<int>(std::move(data));
      REQUIRE(data2.isValid());
      REQUIRE(!data.isValid());
    }

    SECTION("Move assignment") {
      cms::cuda::Product<int> data2;
      data2 = std::move(data);
      REQUIRE(data2.isValid());
      REQUIRE(!data.isValid());
    }
  }

  cudaCheck(cudaSetDevice(defaultDevice));
  cudaCheck(cudaDeviceSynchronize());
  // Note: CUDA resources are cleaned up by the destructors of the global cache objects
}
2 changes: 2 additions & 0 deletions CUDADataFormats/Common/test/test_main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Ask Catch2 to generate main() for the test binary; per the Catch2 docs this
// must be defined in exactly one translation unit, before including catch.hpp.
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
25 changes: 24 additions & 1 deletion FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "tbb/task_arena.h"

namespace edm {

class WaitingTask;
class WaitingTaskHolder;

Expand Down Expand Up @@ -72,5 +71,29 @@ namespace edm {
WaitingTask* m_task;
std::shared_ptr<tbb::task_arena> m_arena;
};

// Wraps a callable f taking a WaitingTaskWithArenaHolder into a nullary
// lambda. Any exception escaping f is caught and forwarded to the holder
// via doneWaiting(), so the waiting task is always notified; the catch-all
// is acceptable here because the exception is propagated, not swallowed.
template <typename F>
auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) {
  return [func = std::forward<F>(f), holder = std::move(h)]() mutable {
    try {
      func(holder);
    } catch (...) {
      holder.doneWaiting(std::current_exception());
    }
  };
}

// Builds a WaitingTask (allocated via iAlloc) around the (holder, f) pair:
// an upstream exception is delivered straight to the holder, otherwise the
// exception-safe wrapper produced by make_lambda_with_holder() runs f.
template <typename ALLOC, typename F>
auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) {
  auto runner = make_lambda_with_holder(h, std::forward<F>(f));
  return make_waiting_task(std::forward<ALLOC>(iAlloc),
                           [holder = h, func = std::move(runner)](std::exception_ptr const* excptr) mutable {
                             if (excptr != nullptr) {
                               holder.doneWaiting(*excptr);
                             } else {
                               func();
                             }
                           });
}
} // namespace edm
#endif
14 changes: 14 additions & 0 deletions HeterogeneousCore/CUDACore/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!-- HeterogeneousCore/CUDACore is built only when CUDA is supported by the
     compiler in use. -->
<iftool name="cuda-gcc-support">
  <use name="FWCore/Concurrency"/>
  <use name="FWCore/Framework"/>
  <use name="FWCore/ServiceRegistry"/>
  <use name="FWCore/ParameterSet"/>
  <use name="CUDADataFormats/Common"/>
  <use name="DataFormats/Provenance"/>
  <use name="HeterogeneousCore/CUDAServices"/>
  <use name="cuda"/>

  <export>
    <!-- NOTE(review): name="1" appears to be the SCRAM shorthand for the
         default, package-named library - confirm against SCRAM conventions. -->
    <lib name="1"/>
  </export>
</iftool>
Loading