cms-patatrack · makortel · Aug 23, 2021 · Aug 23, 2021 · Aug 23, 2021 · Aug 31, 2021
diff --git a/src/cudadev/CUDACore/AcquireContext.cc b/src/cudadev/CUDACore/AcquireContext.cc
@@ -0,0 +1,5 @@
+#include "CUDACore/AcquireContext.h"
+
+namespace cms::cuda {
+  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
+}
diff --git a/src/cudadev/CUDACore/AcquireContext.h b/src/cudadev/CUDACore/AcquireContext.h
@@ -0,0 +1,45 @@
+#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
+#define HeterogeneousCore_CUDACore_AcquireContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/TaskContext.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class AcquireContext : public impl::EDGetterContextBase {
+  public:
+    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
+    ~AcquireContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f));
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    impl::FwkContextHolderHelper holderHelper_;
+  };
+
+  template <typename F>
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
+    AcquireContext context(streamID, std::move(holder));
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/AnalyzeContext.h b/src/cudadev/CUDACore/AnalyzeContext.h
@@ -0,0 +1,20 @@
+#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
+#define HeterogeneousCore_CUDACore_AnalyzeContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in analyze()
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * and enforce that those get done in a proper way in RAII fashion.
+   */
+  class AnalyzeContext : public impl::EDGetterContextBase {
+  public:
+    /// Constructor to (possibly) re-use a CUDA stream
+    explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/Context.h b/src/cudadev/CUDACore/Context.h
@@ -0,0 +1,46 @@
+#ifndef HeterogeneousCore_CUDAUtilities_Context_h
+#define HeterogeneousCore_CUDAUtilities_Context_h
+
+#include "CUDACore/allocate_device.h"
+#include "CUDACore/allocate_host.h"
+
+namespace cms::cuda {
+  class HostAllocatorContext {
+  public:
+    explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {}
+
+    void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); }
+
+    void free_host(void *ptr) const { cms::cuda::free_host(ptr); }
+
+  private:
+    cudaStream_t stream_;
+  };
+
+  class DeviceAllocatorContext {
+  public:
+    explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {}
+
+    void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); }
+
+    void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); }
+
+  private:
+    cudaStream_t stream_;
+  };
+
+  class Context {
+  public:
+    explicit Context(cudaStream_t stream) : stream_(stream) {}
+
+    cudaStream_t stream() const { return stream_; }
+
+    operator HostAllocatorContext() const { return HostAllocatorContext(stream()); }
+    operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); }
+
+  private:
+    cudaStream_t stream_;
+  };
+}  // namespace cms::cuda
+
+#endif
diff --git a/src/cudadev/CUDACore/ContextState.h b/src/cudadev/CUDACore/ContextState.h
@@ -26,6 +26,9 @@ namespace cms {
       friend class ScopedContextAcquire;
       friend class ScopedContextProduce;
       friend class ScopedContextTask;
+      friend class AcquireContext;
+      friend class ProduceContext;
+      friend class TaskContext;
 
       void set(int device, SharedStreamPtr stream) {
         throwIfStream();

diff --git a/src/cudadev/CUDACore/EDGetterContextBase.cc b/src/cudadev/CUDACore/EDGetterContextBase.cc
@@ -0,0 +1,27 @@
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda::impl {
+  void EDGetterContextBase::synchronizeStreams(int dataDevice,
+                                               cudaStream_t dataStream,
+                                               bool available,
+                                               cudaEvent_t dataEvent) {
+    if (dataDevice != device()) {
+      // Eventually replace with prefetch to current device (assuming unified memory works)
+      // If we won't go to unified memory, need to figure out something else...
+      throw std::runtime_error("Handling data from multiple devices is not yet supported");
+    }
+
+    if (dataStream != stream()) {
+      // Different streams, need to synchronize
+      if (not available) {
+        // Event not yet occurred, so need to add synchronization
+        // here. Sychronization is done by making the CUDA stream to
+        // wait for an event, so all subsequent work in the stream
+        // will run only after the event has "occurred" (i.e. data
+        // product became available).
+        cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+      }
+    }
+  }
+}
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.h b/src/cudadev/CUDACore/EDGetterContextBase.h
@@ -0,0 +1,45 @@
+#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
+#define HeterogeneousCore_CUDACore_EDGetterContextBase_h
+
+#include "CUDACore/ESProductNew.h"
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/Product.h"
+#include "Framework/EDGetToken.h"
+#include "Framework/EventSetup.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for Context classes that should be
+   * able to read Event Data products
+   */
+  class EDGetterContextBase : public FwkContextBase {
+  public:
+    template <typename T>
+    const T& get(const Product<T>& data) {
+      if (not isInitialized()) {
+        initialize(data);
+      }
+      synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+      return data.data_;
+    }
+
+    template <typename T>
+    const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+      return get(iEvent.get(token));
+    }
+
+    template <typename T>
+    const T& getData(const edm::EventSetup& iSetup) {
+      return iSetup.get<cms::cudaNew::ESProduct<T>>().get(device(), stream());
+    }
+
+  protected:
+    template <typename... Args>
+    EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}
+
+  private:
+    void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc
@@ -0,0 +1,17 @@
+#include "CUDACore/EDProducer.h"
+
+namespace cms::cuda {
+  void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::acquire(edm::Event const& event,
+                                        edm::EventSetup const& eventSetup,
+                                        edm::WaitingTaskWithArenaHolder holder) {
+    runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
@@ -0,0 +1,27 @@
+#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
+#define HeterogeneousCore_CUDACore_stream_EDProducer_h
+
+#include "Framework/EDProducer.h"
+#include "CUDACore/AcquireContext.h"
+#include "CUDACore/ProduceContext.h"
+
+namespace cms::cuda {
+  class EDProducer : public edm::EDProducer {
+  public:
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+
+  class SynchronizingEDProducer : public edm::EDProducerExternalWork {
+  public:
+    void acquire(edm::Event const& event,
+                 edm::EventSetup const& eventSetup,
+                 edm::WaitingTaskWithArenaHolder holder) override;
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+
+    virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+}  // namespace cms::cuda
+
+#endif
diff --git a/src/cudadev/CUDACore/ESContext.cc b/src/cudadev/CUDACore/ESContext.cc
@@ -0,0 +1,11 @@
+#include "CUDACore/ESContext.h"
+#include "CUDACore/StreamCache.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda {
+  ESContext::ESContext(int device) : currentDevice_(device) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+    stream_ = getStreamCache().get();
+  }
+
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/ESContext.h b/src/cudadev/CUDACore/ESContext.h
@@ -0,0 +1,85 @@
+#ifndef HeterogeneousCore_CUDACore_ESContext_h
+#define HeterogeneousCore_CUDACore_ESContext_h
+
+#include "CUDACore/Context.h"
+#include "CUDACore/ESProductNew.h"
+#include "CUDACore/SharedStreamPtr.h"
+#include "CUDACore/StreamCache.h"
+
+namespace cms::cuda {
+  class ESContext {
+  public:
+    explicit ESContext(int device);
+
+    int device() const { return currentDevice_; }
+
+    cudaStream_t stream() const { return stream_.get(); }
+
+    operator HostAllocatorContext() { return HostAllocatorContext(stream()); }
+    operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); }
+    operator Context() { return Context(stream()); }
+
+  private:
+    int currentDevice_;
+    SharedStreamPtr stream_;
+  };
+
+  namespace impl {
+    template <typename T>
+    class RunForEachDevice {
+    public:
+      RunForEachDevice(T&& data) : data_(std::forward<T>(data)) {}
+
+      template <typename F>
+      [[nodiscard]] auto forEachDevice(F const& func) {
+        using RetType = decltype(func(std::declval<const T&>(), std::declval<ESContext&>()));
+        static_assert(not std::is_same_v<RetType, void>,
+                      "Function must return a value corresponding the ESProduct for one device");
+        auto product = std::make_unique<cms::cudaNew::ESProduct<RetType>>();
+        auto const& cref = data_;
+        for (std::size_t i = 0; i < product->size(); ++i) {
+          ESContext ctx(i);
+          product->emplace(i, func(cref, ctx), ctx.stream());
+        }
+        product->setHostData(std::move(data_));
+        return product;
+      }
+
+    private:
+      T data_;
+    };
+  }  // namespace impl
+
+  template <typename F>
+  [[nodiscard]] auto runForHost(F&& func) {
+    using RetType = decltype(func(std::declval<HostAllocatorContext&>()));
+    static_assert(not std::is_same_v<RetType, void>,
+                  "Function must return an intermediate value passed to the functor argument of runOnEachDevice()");
+    // TODO: temporarily use a "random stream" for the pinned host
+    // memory allocation until we figure out a way to use the caching
+    // allocator without requiring a stream. The
+    // cms::cudaNew::ESProduct will take care of releasing any
+    // allocated host memory only after all asynchronous work issued
+    // in RunForEachDevice::forEachDevice() has completed.
+    cudaCheck(cudaSetDevice(0));
+    auto stream = getStreamCache().get();
+    HostAllocatorContext ctx(stream.get());
+    return impl::RunForEachDevice<RetType>(func(ctx));
+  }
+
+  template <typename F>
+  [[nodiscard]] auto runForEachDevice(F&& func) {
+    using RetType = decltype(func(std::declval<ESContext&>()));
+    static_assert(not std::is_same_v<RetType, void>,
+                  "Function must return a value corresponding the ESProduct for noe device");
+    auto product = std::make_unique<cms::cudaNew::ESProduct<RetType>>();
+    for (int i = 0; i < product.size(); ++i) {
+      ESContext ctx(i);
+      product->emplace(i, func(ctx), ctx.stream());
+    }
+    return product;
+  }
+
+}  // namespace cms::cuda
+
+#endif