diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
new file mode 100644
index 0000000000000..e7a5ba74d80be
--- /dev/null
+++ b/CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h
new file mode 100644
index 0000000000000..41bb8356e67cf
--- /dev/null
+++ b/CUDADataFormats/Common/interface/Product.h
@@ -0,0 +1,60 @@
+#ifndef CUDADataFormats_Common_Product_h
+#define CUDADataFormats_Common_Product_h
+
+#include <memory>
+
+#include "CUDADataFormats/Common/interface/ProductBase.h"
+
+namespace edm {
+ template <typename T>
+ class Wrapper;
+}
+
+namespace cms {
+ namespace cuda {
+ namespace impl {
+ class ScopedContextGetterBase;
+ }
+
+ /**
+ * The purpose of this class is to wrap CUDA data to edm::Event in a
+ * way which forces correct use of various utilities.
+ *
+ * The non-default construction has to be done with cms::cuda::ScopedContext
+ * (in order to properly register the CUDA event).
+ *
+ * The default constructor is needed only for the ROOT dictionary generation.
+ *
+ * The CUDA event is in practice needed only for stream-stream
+ * synchronization, but someone with long-enough lifetime has to own
+ * it. Here is a somewhat natural place. If overhead is too much, we
+ * can use them only where synchronization between streams is needed.
+ */
+ template <typename T>
+ class Product : public ProductBase {
+ public:
+ Product() = default; // Needed only for ROOT dictionary generation
+
+ Product(const Product&) = delete;
+ Product& operator=(const Product&) = delete;
+ Product(Product&&) = default;
+ Product& operator=(Product&&) = default;
+
+ private:
+ friend class impl::ScopedContextGetterBase;
+ friend class ScopedContextProduce;
+ friend class edm::Wrapper<Product<T>>;
+
+ explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
+ : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
+
+ template <typename... Args>
+ explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
+ : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
+
+ T data_; //!
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h
new file mode 100644
index 0000000000000..efe2242903bd0
--- /dev/null
+++ b/CUDADataFormats/Common/interface/ProductBase.h
@@ -0,0 +1,93 @@
+#ifndef CUDADataFormats_Common_ProductBase_h
+#define CUDADataFormats_Common_ProductBase_h
+
+#include <atomic>
+#include <memory>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
+
+namespace cms {
+ namespace cuda {
+ namespace impl {
+ class ScopedContextBase;
+ }
+
+ /**
+ * Base class for all instantiations of Product<T> to hold the
+ * non-T-dependent members.
+ */
+ class ProductBase {
+ public:
+ ProductBase() = default; // Needed only for ROOT dictionary generation
+ ~ProductBase();
+
+ ProductBase(const ProductBase&) = delete;
+ ProductBase& operator=(const ProductBase&) = delete;
+ ProductBase(ProductBase&& other)
+ : stream_{std::move(other.stream_)},
+ event_{std::move(other.event_)},
+ mayReuseStream_{other.mayReuseStream_.load()},
+ device_{other.device_} {}
+ ProductBase& operator=(ProductBase&& other) {
+ stream_ = std::move(other.stream_);
+ event_ = std::move(other.event_);
+ mayReuseStream_ = other.mayReuseStream_.load();
+ device_ = other.device_;
+ return *this;
+ }
+
+ bool isValid() const { return stream_.get() != nullptr; }
+ bool isAvailable() const;
+
+ int device() const { return device_; }
+
+ // cudaStream_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the cms::cuda::ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaStream_t stream() const { return stream_.get(); }
+
+ // cudaEvent_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the cms::cuda::ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaEvent_t event() const { return event_.get(); }
+
+ protected:
+ explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
+ : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
+
+ private:
+ friend class impl::ScopedContextBase;
+ friend class ScopedContextProduce;
+
+ // The following function is intended to be used only from ScopedContext
+ const SharedStreamPtr& streamPtr() const { return stream_; }
+
+ bool mayReuseStream() const {
+ bool expected = true;
+ bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
+ // If the current thread is the one flipping the flag, it may
+ // reuse the stream.
+ return changed;
+ }
+
+ // The cudaStream_t is really shared among edm::Event products, so
+ // using shared_ptr also here
+ SharedStreamPtr stream_; //!
+ // shared_ptr because of caching in cms::cuda::EventCache
+ SharedEventPtr event_; //!
+
+ // This flag tells whether the CUDA stream may be reused by a
+ // consumer or not. The goal is to have a "chain" of modules to
+ // queue their work to the same stream.
+ mutable std::atomic<bool> mayReuseStream_ = true; //!
+
+ // The CUDA device associated with this product
+ int device_ = -1; //!
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc
new file mode 100644
index 0000000000000..8e1cf64b17122
--- /dev/null
+++ b/CUDADataFormats/Common/src/ProductBase.cc
@@ -0,0 +1,29 @@
+#include "CUDADataFormats/Common/interface/ProductBase.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+
+namespace cms::cuda {
+ bool ProductBase::isAvailable() const {
+ // if default-constructed, the product is not available
+ if (not event_) {
+ return false;
+ }
+ return eventWorkHasCompleted(event_.get());
+ }
+
+ ProductBase::~ProductBase() {
+ // Make sure that the production of the product in the GPU is
+ // complete before destructing the product. This is to make sure
+ // that the EDM stream does not move to the next event before all
+ // asynchronous processing of the current is complete.
+
+ // TODO: a callback notifying a WaitingTaskHolder (or similar)
+ // would avoid blocking the CPU, but would also require more work.
+ //
+ // Intentionally not checking the return value to avoid throwing
+ // exceptions. If this call would fail, we should get failures
+ // elsewhere as well.
+ if (event_) {
+ cudaEventSynchronize(event_.get());
+ }
+ }
+} // namespace cms::cuda
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
new file mode 100644
index 0000000000000..30b84c9ce2f22
--- /dev/null
+++ b/CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc
new file mode 100644
index 0000000000000..d70ff3e63e903
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_Product.cc
@@ -0,0 +1,68 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+
+#include <cuda_runtime_api.h>
+
+namespace cms::cudatest {
+ class TestScopedContext {
+ public:
+ static cuda::ScopedContextProduce make(int dev, bool createEvent) {
+ cms::cuda::SharedEventPtr event;
+ if (createEvent) {
+ event = cms::cuda::getEventCache().get();
+ }
+ return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
+ }
+ };
+} // namespace cms::cudatest
+
+TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") {
+ SECTION("Default constructed") {
+ auto foo = cms::cuda::Product<int>();
+ REQUIRE(!foo.isValid());
+
+ auto bar = std::move(foo);
+ }
+
+ if (not cms::cudatest::testDevices()) {
+ return;
+ }
+
+ constexpr int defaultDevice = 0;
+ cudaCheck(cudaSetDevice(defaultDevice));
+ {
+ auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ auto& data = *dataPtr;
+
+ SECTION("Construct from cms::cuda::ScopedContext") {
+ REQUIRE(data.isValid());
+ REQUIRE(data.device() == defaultDevice);
+ REQUIRE(data.stream() == ctx.stream());
+ REQUIRE(data.event() != nullptr);
+ }
+
+ SECTION("Move constructor") {
+ auto data2 = cms::cuda::Product<int>(std::move(data));
+ REQUIRE(data2.isValid());
+ REQUIRE(!data.isValid());
+ }
+
+ SECTION("Move assignment") {
+ cms::cuda::Product<int> data2;
+ data2 = std::move(data);
+ REQUIRE(data2.isValid());
+ REQUIRE(!data.isValid());
+ }
+ }
+
+ cudaCheck(cudaSetDevice(defaultDevice));
+ cudaCheck(cudaDeviceSynchronize());
+ // Note: CUDA resources are cleaned up by the destructors of the global cache objects
+}
diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/CUDADataFormats/Common/test/test_main.cc
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
index efc7d9e6cde0b..ccd99247e501e 100644
--- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
+++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
@@ -25,7 +25,6 @@
#include "tbb/task_arena.h"
namespace edm {
-
class WaitingTask;
class WaitingTaskHolder;
@@ -72,5 +71,29 @@ namespace edm {
WaitingTask* m_task;
std::shared_ptr<tbb::task_arena> m_arena;
};
+
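+ // Wraps the user functor so that any exception thrown while it runs is
+ // reported to the WaitingTaskWithArenaHolder instead of escaping.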
+ template <typename F>
+ auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) {
+ return [holder = std::move(h), func = std::forward<F>(f)]() mutable {
+ try {
+ func(holder);
+ } catch (...) {
+ holder.doneWaiting(std::current_exception());
+ }
+ };
+ }
+
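+ // Creates a WaitingTask that forwards an incoming exception to the holder,
+ // and otherwise runs the user functor (with the holder) via make_lambda_with_holder().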
+ template <typename ALLOC, typename F>
+ auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) {
+ return make_waiting_task(
+ std::forward<ALLOC>(iAlloc),
+ [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable {
+ if (excptr) {
+ holder.doneWaiting(*excptr);
+ return;
+ }
+ func();
+ });
+ }
} // namespace edm
#endif
diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml
new file mode 100644
index 0000000000000..a10567db16edb
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/BuildFile.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
new file mode 100644
index 0000000000000..75d9cf4c227a7
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/README.md
@@ -0,0 +1,1003 @@
+# CUDA algorithms in CMSSW
+
+## Outline
+
+* [Introduction](#introduction)
+ * [Design goals](#design-goals)
+ * [Overall guidelines](#overall-guidelines)
+* [Sub-packages](#sub-packages)
+* [Examples](#examples)
+ * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output)
+ * [Producer with CUDA output](#producer-with-cuda-output)
+ * [Producer with CUDA input](#producer-with-cuda-input)
+ * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork)
+ * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](#producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork)
+ * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork)
+ * [Analyzer with CUDA input](#analyzer-with-cuda-input)
+ * [Configuration](#configuration)
+ * [GPU-only configuration](#gpu-only-configuration)
+ * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules)
+* [More details](#more-details)
+ * [Device choice](#device-choice)
+ * [Data model](#data-model)
+ * [CUDA EDProducer](#cuda-edproducer)
+ * [Class declaration](#class-declaration)
+ * [Memory allocation](#memory-allocation)
+ * [Caching allocator](#caching-allocator)
+ * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr)
+ * [CUDA API](#cuda-api)
+ * [Setting the current device](#setting-the-current-device)
+ * [Getting input](#getting-input)
+ * [Calling the CUDA kernels](#calling-the-cuda-kernels)
+ * [Putting output](#putting-output)
+ * [`ExternalWork` extension](#externalwork-extension)
+ * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks)
+ * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu)
+ * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams)
+ * [CUDA ESProduct](#cuda-esproduct)
+
+## Introduction
+
+This page documents the CUDA integration within CMSSW.
+
+### Design goals
+
+1. Provide a mechanism for a chain of modules to share a resource
+ * Resource can be e.g. CUDA device memory or a CUDA stream
+2. Minimize data movements between the CPU and the device
+3. Support multiple devices
+4. Allow the same job configuration to be used on all hardware combinations
+
+### Overall guidelines
+
+1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e.
+ * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc.
+ * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event
+ * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator))
+ * Avoid `assert()` in device functions, or use `#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"`
+ * With the latter the `assert()` calls in CUDA code are disabled by
+ default, but can be enabled by defining a `GPU_DEBUG` macro
+ (before the aforementioned include); see the sketch after this list
+2. Synchronization needs should be fulfilled with
+ [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork)
+ extension to EDProducers
+ * `ExternalWork` can be used to replace one synchronization point
+ (e.g. between device kernels and copying a known amount of data
+ back to CPU).
+ * For further synchronization points (e.g. copying data whose
+ amount is known only at the device side), split the work to
+ multiple `ExternalWork` producers. This approach has the added
+ benefit that e.g. data transfers to CPU become on-demand automatically
+ * A general breakdown of the possible steps:
+ * Convert input legacy CPU data format to CPU SoA
+ * Transfer input CPU SoA to GPU
+ * Launch kernels
+ * Transfer the number of output elements to CPU
+ * Transfer the output data from GPU to CPU SoA
+ * Convert the output SoA to legacy CPU data formats
+3. Within `acquire()`/`produce()`, the current CUDA device is set
+ implicitly and the CUDA stream is provided by the system (with
+ `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`)
+ * It is strongly recommended to use the provided CUDA stream for all operations
+ * If that is not feasible for some reason, the provided CUDA
+ stream must synchronize with the work queued on other CUDA
+ streams (with CUDA events and `cudaStreamWaitEvent()`)
+4. Outside of `acquire()`/`produce()`, CUDA API functions may be
+ called only if `CUDAService::enabled()` returns `true`.
+ * With point 3 it follows that in these cases multiple devices have
+ to be dealt with explicitly, as well as CUDA streams
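+
+As an illustration of the `assert()` guideline above, a device function could
+use `cuda_assert.h` roughly as follows (a minimal sketch; the kernel and its
+arguments are made up for this example):
+
+```cpp
+// Define GPU_DEBUG before the include to enable device-side assert();
+// without it the assert() calls compile to no-ops.
+#define GPU_DEBUG
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+__global__ void checkAllPositive(float const* input, int size) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    assert(input[i] >= 0.f);  // active only when GPU_DEBUG is defined
+  }
+}
+```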
+
+## Sub-packages
+* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components
+* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
+* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
+* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations
+* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data
+
+## Examples
+
+### Isolated producer (no CUDA input nor output)
+
+```cpp
+class IsolatedProducerCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ IsolatedProducerGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<InputData> inputToken_;
+ edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
+
+ auto const& inputData = iEvent.get(inputToken_);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Real life is likely more complex than this simple example. Here
+ // getResult() returns some data in CPU memory that is passed
+ // directly to the OutputData constructor.
+ iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA output
+
+```cpp
+class ProducerOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ ProducerOutputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<InputData> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ // Sets the current device and creates a CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
+
+ auto const& inputData = iEvent.get(inputToken_);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work has finished
+void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+### Producer with CUDA input
+
+```cpp
+class ProducerInputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
+ edm::EDPutTokenT<OutputData> outputToken_;
+};
+...
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Input data from another producer
+ auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+ // or
+ auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, otherInputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // Real life is likely more complex than this simple example. Here
+ // getResult() returns some data in CPU memory that is passed
+ // directly to the OutputData constructor.
+ iEvent.emplace(outputToken_, gpuAlgo_.getResult());
+}
+```
+
+See [further below](#setting-the-current-device) for the conditions
+when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note
+that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is
+allowed, it will just always create a new CUDA stream.
+
+
+### Producer with CUDA input and output (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and also use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)
+
+
+### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)
+
+```cpp
+class ProducerInputOutputCUDA: public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ ...
+ void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
+ ...
+private:
+ void addMoreWork(cms::cuda::ScopedContextTask& ctx);
+
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+ cms::cuda::ContextState ctxState_;
+};
+...
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and also use the same CUDA stream
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAcquire::stream()
+ gpuAlgo_.makeAsync(inputData, ctx.stream());
+
+ // Push a functor on top of "a stack of tasks" to be run as a next
+ // task after the work queued above before produce(). In this case ctx
+ // is a context constructed by the calling TBB task, and therefore the
+ // current device and CUDA stream have been already set up. The ctx
+ // internally holds the WaitingTaskWithArenaHolder for the next task.
+
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ addMoreWork(ctx);
+ });
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished,
+ // and saves the device and CUDA stream to ctxState_
+}
+
+// Called after the asynchronous work queued in acquire() has finished
+void ProducerInputOutputCUDA::addMoreWork(cms::cuda::ScopedContextTask& ctx) {
+ // Current device and CUDA stream have already been set
+
+ // Queues more asynchronous data transfer and kernels to the CUDA
+ // stream returned by cms::cuda::ScopedContextTask::stream()
+ gpuAlgo_.makeMoreAsync(ctx.stream());
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+
+// Called after the asynchronous work queued in addMoreWork() has finished
+void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
+ // Sets again the current device, uses the CUDA stream created in the acquire()
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+
+ // Now getResult() returns data in GPU memory that is passed to the
+ // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
+ // OutputData to cms::cuda::Product. cms::cuda::Product stores also
+ // the current device and the CUDA stream since those will be needed
+ // in the consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc)
+
+
+### Producer with CUDA input and output (without ExternalWork)
+
+If the producer does not need to transfer anything back to CPU (like
+the number of output elements), the `ExternalWork` extension is not
+needed as there is no need to synchronize.
+
+```cpp
+class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
+public:
+ ...
+ void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override;
+ ...
+private:
+ ...
+ ProducerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
+};
+...
+void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextProduce ctx{inputDataWrapped};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also
+ // returns data in GPU memory that is passed to the constructor of
+ // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to
+ // cms::cuda::Product. cms::cuda::Product stores also the current
+ // device and the CUDA stream since those will be needed in the
+ // consumer side.
+ ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream()));
+
+ // Destructor of ctx queues a callback to the CUDA stream notifying
+ // waitingTaskHolder when the queued asynchronous work has finished
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc)
+
+
+### Analyzer with CUDA input
+
+Analyzer with CUDA input is similar to [producer with CUDA
+input](#producer-with-cuda-input). Note that currently we do not have
+a mechanism for portable configurations with analyzers (like
+[`SwitchProducer`](#automatic-switching-between-cpu-and-gpu-modules)
+for producers). This means that a configuration with a CUDA analyzer
+can only run on a machine with CUDA device(s).
+
+```cpp
+class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> {
+public:
+ ...
+ void analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) override;
+ ...
+private:
+ ...
+ AnalyzerInputGPUAlgo gpuAlgo_;
+ edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
+ edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
+};
+...
+void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) {
+ cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
+
+ // Set the current device to the same that was used to produce
+ // InputData, and possibly use the same CUDA stream
+ cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped};
+
+ // Grab the real input data. Checks that the input data is on the
+ // current device. If the input data was produced in a different CUDA
+ // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream
+ // synchronization point with CUDA event and cudaStreamWaitEvent()
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ // Input data from another producer
+ auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
+ // or
+ auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
+
+
+ // Queues asynchronous data transfers and kernels to the CUDA stream
+ // returned by cms::cuda::ScopedContextAnalyze::stream()
+ gpuAlgo_.analyzeAsync(inputData, otherInputData, ctx.stream());
+}
+```
+
+[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc)
+
+
+### Configuration
+
+#### GPU-only configuration
+
+For a GPU-only configuration there is nothing special to be done, just
+construct the Paths/Sequences/Tasks from the GPU modules.
+
+#### Automatic switching between CPU and GPU modules
+
+The `SwitchProducer` mechanism can be used to switch automatically
+between CPU and GPU modules based on the availability of GPUs on the
+machine where the configuration is done. The framework decides at the
+beginning of the job which of the modules to run for a given module
+label.
+
+The framework requires that the modules in the switch produce the
+same types of output products (the closer the actual results are the
+better, but the framework cannot enforce that). This means that for a
+chain of GPU modules, it is the module that transforms the SoA data
+format back to the legacy data formats (possibly, but not necessarily,
+transferring the SoA data from GPU to CPU) that should be switched
+with the legacy CPU module. The rest of the GPU modules should be
+placed in a `Task`, in which case the framework runs them only if their
+output is needed by another module.
+
+```python
+from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
+process.foo = SwitchProducerCUDA(
+ cpu = cms.EDProducer("FooProducer"), # legacy CPU
+ cuda = cms.EDProducer("FooProducerFromCUDA",
+ src = cms.InputTag("fooCUDA")
+ )
+)
+process.fooCUDA = cms.EDProducer("FooProducerCUDA")
+
+process.fooTaskCUDA = cms.Task(process.fooCUDA)
+process.fooTask = cms.Task(
+ process.foo,
+ process.fooTaskCUDA
+)
+```
+
+For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py).
+
+
+
+
+
+## More details
+
+### Device choice
+
+As discussed above, with `SwitchProducer` the choice between CPU and
+GPU modules is done at the beginning of the job.
+
+For multi-GPU setup the device is chosen in the first CUDA module in a
+chain of modules by one of the constructors of
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`
+```cpp
+// In ExternalWork acquire()
+cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
+
+// In normal produce() (or filter())
+cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+```
+As the device choice is currently a static assignment of EDM streams to
+devices, the EDM stream ID is needed. The logic will likely evolve in the
+future to be more dynamic, and the device choice will likely have to be
+made for the full event.
+
+### Data model
+
+The "GPU data product" should be a class/struct containing smart
+pointer(s) to device data (see [Memory allocation](#memory-allocation)).
+When putting the data to event, the data is wrapped to
+`cms::cuda::Product` template, which holds
+* the GPU data product
+ * must be moveable, but no other restrictions
+* the current device where the data was produced, and the CUDA stream the data was produced with
+* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
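+
+A minimal sketch of a possible GPU data product (the class name and members
+are made up for this example) could look like this:
+
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+class FooGPU {
+public:
+  FooGPU() = default;
+  FooGPU(cms::cuda::device::unique_ptr<float[]> values, int size)
+      : values_(std::move(values)), size_(size) {}
+
+  float const* values() const { return values_.get(); }
+  int size() const { return size_; }
+
+private:
+  cms::cuda::device::unique_ptr<float[]> values_;  // device memory
+  int size_ = 0;
+};
+```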
+
+Note that the `cms::cuda::Product` wrapper can be constructed only with
+`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained
+from it only with
+`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
+as described further below. When putting the data product directly to
+`edm::Event`, also `cms::cuda::ScopedContextProduce::emplace()` can be used.
+
+The GPU data products that depend on the CUDA runtime should be placed
+under `CUDADataFormats` package, using the same name for sub-package
+that would be used in `DataFormats`. Everything else, e.g. SoA for
+CPU, should go under `DataFormats` as usual.
+
+
+### CUDA EDProducer
+
+#### Class declaration
+
+The CUDA producers are normal EDProducers. The `ExternalWork`
+extension should be used if a synchronization between the GPU and CPU
+is needed, e.g. when transferring data from GPU to CPU.
+
+#### Memory allocation
+
+##### Caching allocator
+
+The memory allocations should be done dynamically with the following functions
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+
+cms::cuda::device::unique_ptr<float[]> device_buffer = cms::cuda::make_device_unique<float[]>(50, cudaStream);
+cms::cuda::host::unique_ptr<float[]> host_buffer = cms::cuda::make_host_unique<float[]>(50, cudaStream);
+```
+
+in the `acquire()` and `produce()` functions. The same
+`cudaStream_t` object that is used for transfers and kernels
+should be passed to the allocator.
+
+The allocator is based on `cub::CachingDeviceAllocator`. The memory is
+guaranteed to be reserved
+* for the host: up to the destructor of the `unique_ptr`
+* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
+
+##### Non-cached pinned host `unique_ptr`
+
+In producers transferring data to the GPU one may want to use pinned host
+memory allocated with `cudaHostAllocWriteCombined`. As of now we don't
+want to add the flag dimension to the caching allocator. The CUDA
+API wrapper library does not support allocation flags, so we add our
+own `unique_ptr` for that.
+
+```cpp
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
+
+cms::cuda::host::noncached::unique_ptr<float[]> host_buffer = cms::cuda::make_host_noncached_unique<float[]>(50, flags);
+```
+The `flags` argument is passed directly to `cudaHostAlloc()`.
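+
+For example, a write-combined buffer for host-to-device transfers could be
+allocated along these lines (a sketch illustrating the flag usage):
+
+```cpp
+auto host_buffer = cms::cuda::make_host_noncached_unique<float[]>(50, cudaHostAllocWriteCombined);
+```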
+
+##### CUDA API
+
+The `cudaMalloc()` etc may be used outside of the event loop, but that
+should be limited to only relatively small allocations in order to
+allow as much re-use of device memory as possible.
+
+If really needed, the `cudaMalloc()` etc may be used also within the
+event loop, but then the cost of allocation and implicit
+synchronization should be explicitly amortized e.g. by caching.
+
+#### Setting the current device
+
+A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in
+`acquire()` (`cms::cuda::ScopedContextProduce` in `produce()` if not using
+`ExternalWork`) either with `edm::StreamID`, or with a
+`cms::cuda::Product` read as an input.
+
+```cpp
+// From edm::StreamID
+cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
+// or
+cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+
+
+// From cms::cuda::Product
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+cms::cuda::ScopedContextAcquire ctx{cclus, ...};
+// or
+cms::cuda::ScopedContextProduce ctx{cclus};
+```
+
+A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a
+`cms::cuda::Product` read as an input.
+
+```cpp
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+cms::cuda::ScopedContextAnalyze ctx{cclus};
+```
+
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and do the following
+* Sets the current device for the current scope
+ - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
+ - If constructed from the `cms::cuda::Product`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product`
+ * The CUDA stream is reused if this producer is the first consumer
+ of the `cms::cuda::Product`, otherwise a new CUDA stream is created.
+ This approach is a simple compromise that automatically expresses the
+ work of parallel producers in different CUDA streams, while at the
+ same time allowing a chain of producers to queue their work to the
+ same CUDA stream.
+* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
+* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor)
+* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
+* Needed to get `cms::cuda::Product` from the event
+ * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product` to the event
+
+In case of multiple input products, from possibly different CUDA
+streams and/or CUDA devices, this approach gives the developer full
+control over which of them the kernels of the algorithm are run in.
+
+#### Getting input
+
+The real product (`T`) can be obtained from `cms::cuda::Product` only with
+the help of
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
+
+```cpp
+// From cms::cuda::Product
+cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
+GPUClusters const& clus = ctx.get(cclus);
+
+// Directly from Event
+GPUClusters const& clus = ctx.get(iEvent, srcToken_);
+```
+
+This step is needed to
+* check that the data are on the same CUDA device
+ * if not, throw an exception (with unified memory could prefetch instead)
+* if the CUDA streams are different, synchronize between them
+
+#### Calling the CUDA kernels
+
+It is usually best to wrap the CUDA kernel calls in a separate class,
+and then call methods of that class from the EDProducer. The only
+requirement is that the operations are queued to the CUDA stream
+obtained from the
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
+
+```cpp
+gpuAlgo.makeClustersAsync(..., ctx.stream());
+```
+
+If necessary, different CUDA streams may be used internally, but they
+should be made to synchronize with the provided CUDA stream with
+CUDA events and `cudaStreamWaitEvent()`.
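+
+A minimal sketch of that synchronization (assuming `internalStream` and
+`event` are managed by the algorithm class itself):
+
+```cpp
+// Queue kernels to the algorithm's internal stream...
+someKernel<<<blocks, threads, 0, internalStream>>>(...);
+// ...record an event on it, and make the provided stream wait for that event
+cudaCheck(cudaEventRecord(event, internalStream));
+cudaCheck(cudaStreamWaitEvent(ctx.stream(), event, 0));
+```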
+
+
+#### Putting output
+
+The GPU data needs to be wrapped into the `cms::cuda::Product<T>` template with
+`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()`
+
+```cpp
+GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
+std::unique_ptr<cms::cuda::Product<GPUClusters>> ret = ctx.wrap(clusters);
+iEvent.put(std::move(ret));
+
+// or with one line
+iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
+
+// or avoid one unique_ptr with emplace
+edm::EDPutTokenT<cms::cuda::Product<GPUClusters>> putToken_ = produces<cms::cuda::Product<GPUClusters>>(); // in constructor
+...
+ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
+```
+
+This step is needed to
+* store the current device and CUDA stream into `cms::cuda::Product`
+* record the CUDA event needed for CUDA stream synchronization
+
+#### `ExternalWork` extension
+
+Everything above works both with and without `ExternalWork`.
+
+Without `ExternalWork` the `EDProducer`s act similar to TBB
+flowgraph's "streaming node". In other words, they just queue more
+asynchronous work to the CUDA stream in their `produce()`.
+
+The `ExternalWork` is needed when one would otherwise call
+`cudaStreamSynchronize()`. For example transferring something to CPU
+needed for downstream DQM, or queueing more asynchronous work. With
+`ExternalWork` an `acquire()` method needs to be implemented that gets
+an `edm::WaitingTaskWithArenaHolder` parameter. The
+`edm::WaitingTaskWithArenaHolder` should then be passed to the
+constructor of `cms::cuda::ScopedContextAcquire` along the following lines:
+
+```cpp
+void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(token_);
+ cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
+ ...
+```
+
+When constructed this way, `cms::cuda::ScopedContextAcquire` registers a
+callback function to the CUDA stream in its destructor to call
+`waitingTaskHolder.doneWaiting()`.
+
+A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its
+`produce()`. The device and CUDA stream are transferred via
+`cms::cuda::ContextState` member variable:
+
+```cpp
+class FooProducerCUDA ... {
+ ...
+ cms::cuda::ContextState ctxState_;
+};
+
+void FooProducerCUDA::acquire(...) {
+ ...
+ cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
+ ...
+}
+
+void FooProducerCUDA::produce(...) {
+ ...
+ cms::cuda::ScopedContextProduce ctx{ctxState_};
+}
+```
+
+The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
+the destructor, and `cms::cuda::ScopedContextProduce` then restores the
+context.
+
+#### Module-internal chain of CPU and GPU tasks
+
+Technically `ExternalWork` works such that the framework calls
+`acquire()` with an `edm::WaitingTaskWithArenaHolder` that holds an
+`edm::WaitingTask` (which inherits from `tbb::task`) for calling
+`produce()`, with `std::shared_ptr`-like semantics: the task is spawned
+when the reference count hits `0`. It is also possible to create a longer chain
+of such tasks, alternating between CPU and GPU work. This mechanism
+can also be used to re-run (part of) the GPU work.
+
+The "next tasks" to run are essentially structured as a stack, such
+that
+- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask::pushNextTask()`
+ pushes a new functor on top of the stack
+- Completion of both the asynchronous work and the queueing function
+ pops the top task of the stack and enqueues it (so that TBB
+ eventually runs the task)
+ * Technically the task is made eligible to run when all copies of
+ `edm::WaitingTaskWithArenaHolder` of the acquire() (or "previous"
+ function) have either been destructed or their `doneWaiting()` has
+ been called
+ * The code calling `acquire()` or the functor holds one copy of
+ `edm::WaitingTaskWithArenaHolder` so it is guaranteed that the
+ next function will not run before the earlier one has finished
+
+
+Below is an example how to push a functor on top of the stack of tasks
+to run next (following the example of the previous section)
+```cpp
+void FooProducerCUDA::acquire(...) {
+ ...
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ...
+ });
+ ...
+}
+```
+
+In this case the `ctx` argument to the function is a
+`cms::cuda::ScopedContextTask` object constructed by the TBB task calling the
+user-given function. It follows that the current device and CUDA
+stream have been set up already. The `pushNextTask()` can be called
+many times. On each invocation the `pushNextTask()` pushes a new task
+on top of the stack (i.e. in front of the chain). It follows that in
+```cpp
+void FooProducerCUDA::acquire(...) {
+ ...
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 1
+ });
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 2
+ });
+ ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
+ ... // function 3
+ });
+ ...
+}
+```
+the functions will be run in the order 3, 2, 1.
+
+**Note** that the `CUDAService` is **not** available (nor is any other
+service) in these intermediate tasks. In the near future memory
+allocations etc. will be made possible by taking them out from the
+`CUDAService`.
+
+The `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask` also have a
+more generic member function, `replaceWaitingTaskHolder()`, that can
+be used to replace the currently-held
+`edm::WaitingTaskWithArenaHolder` (the one that will get notified by the
+callback function) with anything. In this case the caller is
+responsible for creating the task(s) and setting up the chain of them.
+
+
+#### Transferring GPU data to CPU
+
+The GPU->CPU data transfer needs synchronization to ensure that the CPU
+memory has all the data before it is put into the event. This means that
+`ExternalWork` needs to be used as follows (a sketch is shown after the list):
+* In `acquire()`
+ * (allocate CPU memory buffers)
+ * Queue all GPU->CPU transfers asynchronously
+* In `produce()`
+ * If needed, read additional CPU products (e.g. from `edm::Ref`s)
+ * Reformat data back to legacy data formats
+ * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()`
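+
+A minimal sketch of such a transfer producer (the class, algorithm, and data
+format names are made up for this example, building on the `FooGPU` sketch
+from the [Data model](#data-model) section; `hostBuffer_` and `size_` are
+assumed to be member variables):
+
+```cpp
+void FooTransferToHost::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<FooGPU> const& inputWrapped = iEvent.get(inputToken_);
+  cms::cuda::ScopedContextAcquire ctx{inputWrapped, std::move(waitingTaskHolder)};
+  auto const& input = ctx.get(inputWrapped);
+
+  // Allocate a pinned host buffer and queue the GPU->CPU copy asynchronously
+  size_ = input.size();
+  hostBuffer_ = cms::cuda::make_host_unique<float[]>(size_, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(hostBuffer_.get(), input.values(), size_ * sizeof(float),
+                            cudaMemcpyDeviceToHost, ctx.stream()));
+}
+
+void FooTransferToHost::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+  // The copy queued in acquire() has finished by now; convert the buffer
+  // back to the legacy data format. No ScopedContextProduce is needed here.
+  iEvent.emplace(outputToken_, makeLegacyFoo(hostBuffer_.get(), size_));
+  hostBuffer_.reset();
+}
+```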
+
+#### Synchronizing between CUDA streams
+
+In case the producer needs input data that were produced in two (or
+more) CUDA streams, these streams have to be synchronized. Here this
+synchronization is achieved with CUDA events.
+
+Each `cms::cuda::Product` also contains a CUDA event object. The call to
+`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA
+stream. This means that when all work queued to the CUDA stream up to
+that point has been finished, the CUDA event becomes *occurred*. Then,
+in
+`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
+if the `cms::cuda::Product` to get from has a different CUDA stream than
+the
+`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`,
+`cudaStreamWaitEvent(stream, event)` is called. This means that all
+subsequent work queued to the CUDA stream will wait for the CUDA event
+to become occurred. Therefore this subsequent work can assume that the
+CUDA product being obtained exists.
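+
+Conceptually this boils down to the following CUDA calls (a simplified sketch
+of what the framework classes do internally, not code to be written by the
+module developer):
+
+```cpp
+// In the producer: ScopedContextProduce::wrap()/emplace() records the CUDA
+// event in the CUDA stream that was used to produce the data
+cudaCheck(cudaEventRecord(event, producerStream));
+
+// In the consumer: ScopedContext*::get() makes the consumer's CUDA stream
+// wait for that event if the two CUDA streams differ
+if (consumerStream != producerStream) {
+  cudaCheck(cudaStreamWaitEvent(consumerStream, event, 0));
+}
+```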
+
+
+### CUDA ESProduct
+
+Conditions data can be transferred to the device with the following
+pattern.
+
+1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload")
+2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory
+3. The wrapper should have a function returning the payload on the
+ device memory. The function should transfer the data to the device
+ asynchronously with the help of `cms::cuda::ESProduct`.
+
+#### Example
+
+```cpp
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+
+// Declare the struct for the payload to be transferred. Here the
+// example is an array with (potentially) dynamic size. Note that all of
+// below becomes simpler if the array has compile-time size.
+struct ESProductExampleCUDA {
+ float *someData;
+ unsigned int size;
+};
+
+// Declare the wrapper ESProduct. The corresponding ESProducer should
+// produce objects of this type.
+class ESProductExampleCUDAWrapper {
+public:
+ // Constructor takes the standard CPU ESProduct, and transforms the
+ // necessary data to array(s) in pinned host memory
+ ESProductExampleCUDAWrapper(ESProductExample const&);
+
+ // Deallocates all pinned host memory
+ ~ESProductExampleCUDAWrapper();
+
+ // Function to return the actual payload on the memory of the current device
+ ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const;
+
+private:
+ // Holds the data in pinned CPU memory
+ float *someData_;
+ unsigned int size_;
+
+ // Helper struct to hold all information that has to be allocated and
+ // deallocated per device
+ struct GPUData {
+ // Destructor should free all member pointers
+ ~GPUData();
+ // internal pointers are on device, struct itself is on CPU
+ ESProductExampleCUDA *esproductHost = nullptr;
+ // internal pointers and struct are on device
+ ESProductExampleCUDA *esproductDevice = nullptr;
+ };
+
+ // Helper that takes care of complexity of transferring the data to
+ // multiple devices
+ cms::cuda::ESProduct<GPUData> gpuData_;
+};
+
+ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) {
+ cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS));
+ // fill someData_ and size_ from cpuProduct
+}
+
+ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const {
+ // cms::cuda::ESProduct essentially holds an array of GPUData objects,
+ // one per device. If the data have already been transferred to the
+ // current device (or the transfer has been queued), the helper just
+ // returns a reference to that GPUData object. Otherwise, i.e. data are
+ // not yet on the current device, the helper calls the lambda to do the
+ // necessary memory allocations and to queue the transfers.
+ auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) {
+ // Allocate memory. Currently this can be with the CUDA API,
+ // sometime we'll migrate to the caching allocator. Assumption is
+ // that IOV changes are rare enough that adding global synchronization
+ // points is not that bad (for now).
+
+ // Allocate the payload object on pinned host memory.
+ cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA)));
+ // Allocate the payload array(s) on device memory.
+ cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS));
+
+ // Allocate the payload object on the device memory.
+ cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductExampleCUDA)));
+
+ // Complete the host-side information on the payload
+ data.esproductHost->size = this->size_;
+
+
+ // Transfer the payload, first the array(s) ...
+ cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData_, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream));
+ // ... and then the payload object
+ cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproductHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream));
+ });
+
+ // Returns the payload object on the memory of the current device
+ return data.esproductDevice;
+}
+
+// Destructor frees all member pointers
+ESProductExampleCUDAWrapper::GPUData::~GPUData() {
+ if(esproductHost != nullptr) {
+ cudaCheck(cudaFree(esproductHost->someData));
+ cudaCheck(cudaFreeHost(esproductHost));
+ }
+ cudaCheck(cudaFree(esproductDevice));
+}
+
+```
diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h
new file mode 100644
index 0000000000000..4de13c643a79f
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ContextState.h
@@ -0,0 +1,61 @@
+#ifndef HeterogeneousCore_CUDACore_ContextState_h
+#define HeterogeneousCore_CUDACore_ContextState_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+
+#include <memory>
+
+namespace cms {
+ namespace cuda {
+ /**
+ * The purpose of this class is to deliver the device and CUDA stream
+ * information from ExternalWork's acquire() to produce() via a
+ * member/StreamCache variable.
+ */
+ class ContextState {
+ public:
+ ContextState() = default;
+ ~ContextState() = default;
+
+ ContextState(const ContextState&) = delete;
+ ContextState& operator=(const ContextState&) = delete;
+ ContextState(ContextState&&) = delete;
+ ContextState& operator=(ContextState&& other) = delete;
+
+ private:
+ friend class ScopedContextAcquire;
+ friend class ScopedContextProduce;
+ friend class ScopedContextTask;
+
+ void set(int device, SharedStreamPtr stream) {
+ throwIfStream();
+ device_ = device;
+ stream_ = std::move(stream);
+ }
+
+ int device() const { return device_; }
+
+ const SharedStreamPtr& streamPtr() const {
+ throwIfNoStream();
+ return stream_;
+ }
+
+ SharedStreamPtr releaseStreamPtr() {
+ throwIfNoStream();
+ // This function needs to effectively reset stream_ (i.e. stream_
+ // must be empty after this function). This behavior ensures that
+ // the SharedStreamPtr is not held for an inadvertently long time
+ // (i.e. into the next event), and is checked at run time.
+ return std::move(stream_);
+ }
+
+ void throwIfStream() const;
+ void throwIfNoStream() const;
+
+ SharedStreamPtr stream_;
+ int device_;
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
new file mode 100644
index 0000000000000..485d694e5b522
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ESProduct.h
@@ -0,0 +1,103 @@
+#ifndef HeterogeneousCore_CUDACore_ESProduct_h
+#define HeterogeneousCore_CUDACore_ESProduct_h
+
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+
+#include "FWCore/Utilities/interface/thread_safety_macros.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+
+namespace cms {
+ namespace cuda {
+ template <typename T>
+ class ESProduct {
+ public:
+ ESProduct() : gpuDataPerDevice_(deviceCount()) {
+ for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
+ gpuDataPerDevice_[i].m_event = getEventCache().get();
+ }
+ }
+ ~ESProduct() = default;
+
+ // transferAsync should be a function of (T&, cudaStream_t)
+ // which enqueues asynchronous transfers (possibly kernels as well)
+ // to the CUDA stream
+ template <typename F>
+ const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
+ auto device = currentDevice();
+
+ auto& data = gpuDataPerDevice_[device];
+
+ // If GPU data has already been filled, we can return it
+ // immediately
+ if (not data.m_filled.load()) {
+ // It wasn't, so need to fill it
+ std::scoped_lock lk{data.m_mutex};
+
+ if (data.m_filled.load()) {
+ // Other thread marked it filled while we were locking the mutex, so we're free to return it
+ return data.m_data;
+ }
+
+ if (data.m_fillingStream != nullptr) {
+ // Someone else is filling
+
+ // Check first if the recorded event has occurred
+ if (eventWorkHasCompleted(data.m_event.get())) {
+ // It was, so the data is accessible from all CUDA streams on
+ // the device. Set the 'filled' flag for all subsequent calls and
+ // return the value
+ auto should_be_false = data.m_filled.exchange(true);
+ assert(not should_be_false);
+ data.m_fillingStream = nullptr;
+ } else if (data.m_fillingStream != cudaStream) {
+ // Filling is still going on. For another CUDA stream, add a
+ // wait on the CUDA stream and return the value. Subsequent
+ // work queued on the stream will wait for the event to
+ // occur (i.e. for the transfer to finish).
+ cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
+ "Failed to make a stream to wait for an event");
+ }
+ // else: filling is still going on, but on the same CUDA
+ // stream (which would be a bit strange but is fine), so we can
+ // just return, as all subsequent work should be enqueued to the
+ // same CUDA stream (or the stream must be explicitly synchronized
+ // by the caller)
+ } else {
+ // Now we can be sure that the data is not yet on the GPU, and
+ // this thread is the first to try that.
+ transferAsync(data.m_data, cudaStream);
+ assert(data.m_fillingStream == nullptr);
+ data.m_fillingStream = cudaStream;
+ // Now the filling has been enqueued to the cudaStream, so we
+ // can return the GPU data immediately, since all subsequent
+ // work must be either enqueued to the cudaStream, or the cudaStream
+ // must be synchronized by the caller
+ }
+ }
+
+ return data.m_data;
+ }
+
+ private:
+ struct Item {
+ mutable std::mutex m_mutex;
+ CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
+ // non-null if some thread is already filling (cudaStream_t is just a pointer)
+ CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
+ mutable std::atomic<bool> m_filled = false; // easy check if data has been filled already or not
+ CMS_THREAD_GUARD(m_mutex) mutable T m_data;
+ };
+
+ std::vector<Item> gpuDataPerDevice_;
+ };
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
new file mode 100644
index 0000000000000..cdc3e2dd2c620
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
@@ -0,0 +1,242 @@
+#ifndef HeterogeneousCore_CUDACore_ScopedContext_h
+#define HeterogeneousCore_CUDACore_ScopedContext_h
+
+#include
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/EDPutToken.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
+
+namespace cms {
+ namespace cudatest {
+ class TestScopedContext;
+ }
+
+ namespace cuda {
+
+ namespace impl {
+ // This class is intended to be derived by other ScopedContext*, not for general use
+ class ScopedContextBase {
+ public:
+ int device() const { return currentDevice_; }
+
+ // cudaStream_t is a pointer to a thread-safe object, for which a
+ // mutable access is needed even if the ScopedContext itself
+ // would be const. Therefore it is ok to return a non-const
+ // pointer from a const method here.
+ cudaStream_t stream() const { return stream_.get(); }
+ const SharedStreamPtr& streamPtr() const { return stream_; }
+
+ protected:
+ // The constructors set the current device, but the device
+ // is not set back to the previous value in the destructor. This
+ // should be sufficient (and a tiny bit faster) as all CUDA API
+ // functions relying on the current device should be called from
+ // the scope where this context is. The current device doesn't
+ // really matter between modules (or across TBB tasks).
+ explicit ScopedContextBase(edm::StreamID streamID);
+
+ explicit ScopedContextBase(const ProductBase& data);
+
+ explicit ScopedContextBase(int device, SharedStreamPtr stream);
+
+ private:
+ int currentDevice_;
+ SharedStreamPtr stream_;
+ };
+
+ class ScopedContextGetterBase : public ScopedContextBase {
+ public:
+ template <typename T>
+ const T& get(const Product<T>& data) {
+ synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+ return data.data_;
+ }
+
+ template <typename T>
+ const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+ return get(iEvent.get(token));
+ }
+
+ protected:
+ template <typename... Args>
+ ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward<Args>(args)...) {}
+
+ void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+ };
+
+ class ScopedContextHolderHelper {
+ public:
+ ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
+
+ template <typename F>
+ void pushNextTask(F&& f, ContextState const* state);
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ waitingTaskHolder_ = std::move(waitingTaskHolder);
+ }
+
+ void enqueueCallback(int device, cudaStream_t stream);
+
+ private:
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+ };
+ } // namespace impl
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
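+ //
+ // Sketch of a typical acquire() body (srcToken_, InputData and the kernel
+ // are illustrative assumptions, not part of this change):
+ //
+ //   auto const& wrapped = iEvent.get(srcToken_);   // cms::cuda::Product<InputData>
+ //   cms::cuda::ScopedContextAcquire ctx{wrapped, std::move(waitingTaskHolder)};
+ //   auto const& input = ctx.get(wrapped);          // adds stream synchronization if needed
+ //   myKernel<<<blocks, threads, 0, ctx.stream()>>>(input.d_ptr);
+ //   // ~ScopedContextAcquire() enqueues a callback that calls doneWaiting()
+ //   // once the asynchronous work on the stream has finished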
+ class ScopedContextAcquire : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to create a new CUDA stream (no need for context beyond acquire())
+ explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {}
+
+ /// Constructor to create a new CUDA stream, and the context is needed after acquire()
+ explicit ScopedContextAcquire(edm::StreamID streamID,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder,
+ ContextState& state)
+ : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
+
+ /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire())
+ explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {}
+
+ /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire()
+ explicit ScopedContextAcquire(const ProductBase& data,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder,
+ ContextState& state)
+ : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
+
+ ~ScopedContextAcquire();
+
+ template <typename F>
+ void pushNextTask(F&& f) {
+ if (contextState_ == nullptr)
+ throwNoState();
+ holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+ }
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+ }
+
+ private:
+ void throwNoState();
+
+ impl::ScopedContextHolderHelper holderHelper_;
+ ContextState* contextState_ = nullptr;
+ };
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
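+ //
+ // Sketch of a produce() body in a non-ExternalWork module (outputToken_,
+ // OutputData and launchKernels() are illustrative assumptions):
+ //
+ //   cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
+ //   OutputData output = launchKernels(ctx.stream());       // queue asynchronous work
+ //   ctx.emplace(iEvent, outputToken_, std::move(output));  // stored as cms::cuda::Product<OutputData>
+ //   // ~ScopedContextProduce() records the CUDA event used for stream synchronization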
+ class ScopedContextProduce : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to create a new CUDA stream (non-ExternalWork module)
+ explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {}
+
+ /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
+ explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {}
+
+ /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+ explicit ScopedContextProduce(ContextState& state)
+ : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
+
+ /// Record the CUDA event, all asynchronous work must have been queued before the destructor
+ ~ScopedContextProduce();
+
+ template <typename T>
+ std::unique_ptr<Product<T>> wrap(T data) {
+ // make_unique doesn't work because of private constructor
+ return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
+ }
+
+ template <typename T, typename... Args>
+ auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+ return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
+ }
+
+ private:
+ friend class cudatest::TestScopedContext;
+
+ // This constructor is only meant for testing
+ explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event)
+ : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+ // create the CUDA Event upfront to catch possible errors from its creation
+ SharedEventPtr event_ = getEventCache().get();
+ };
+
+ /**
+ * The aim of this class is to do necessary per-task "initialization" in tasks created in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
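+ //
+ // ScopedContextTask objects are created by ScopedContextAcquire::pushNextTask();
+ // a sketch (the lambda body and launchMoreKernels() are illustrative assumptions):
+ //
+ //   ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx2) {
+ //     // runs after the work queued so far on the stream has finished,
+ //     // re-using the same device and CUDA stream
+ //     launchMoreKernels(ctx2.stream());
+ //   });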
+ class ScopedContextTask : public impl::ScopedContextBase {
+ public:
+ /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+ explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+ : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards
+ holderHelper_{std::move(waitingTaskHolder)},
+ contextState_{state} {}
+
+ ~ScopedContextTask();
+
+ template <typename F>
+ void pushNextTask(F&& f) {
+ holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
+ }
+
+ void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+ }
+
+ private:
+ impl::ScopedContextHolderHelper holderHelper_;
+ ContextState const* contextState_;
+ };
+
+ /**
+ * The aim of this class is to do necessary per-event "initialization" in analyze()
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+ class ScopedContextAnalyze : public impl::ScopedContextGetterBase {
+ public:
+ /// Constructor to (possibly) re-use a CUDA stream
+ explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
+ };
+
+ namespace impl {
+ template <typename F>
+ void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
+ replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
+ edm::make_waiting_task_with_holder(tbb::task::allocate_root(),
+ std::move(waitingTaskHolder_),
+ [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
+ func(ScopedContextTask{state, std::move(h)});
+ })});
+ }
+ } // namespace impl
+ } // namespace cuda
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
new file mode 100644
index 0000000000000..ded114e2fddfe
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py
@@ -0,0 +1,34 @@
+import FWCore.ParameterSet.Config as cms
+
+_cuda_enabled_cached = None
+
+def _switch_cuda():
+ global _cuda_enabled_cached
+ if _cuda_enabled_cached is None:
+ import os
+ _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0)
+ return (_cuda_enabled_cached, 2)
+
+class SwitchProducerCUDA(cms.SwitchProducer):
+ def __init__(self, **kargs):
+ super(SwitchProducerCUDA,self).__init__(
+ dict(cpu = cms.SwitchProducer.getCpu(),
+ cuda = _switch_cuda),
+ **kargs
+ )
+cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA")
+
+if __name__ == "__main__":
+ import unittest
+
+ class TestSwitchProducerCUDA(unittest.TestCase):
+ def testPickle(self):
+ import pickle
+ sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar"))
+ pkl = pickle.dumps(sp)
+ unpkl = pickle.loads(pkl)
+ self.assertEqual(unpkl.cpu.type_(), "Foo")
+ self.assertEqual(unpkl.cuda.type_(), "Bar")
+
+ unittest.main()
+
diff --git a/HeterogeneousCore/CUDACore/src/ContextState.cc b/HeterogeneousCore/CUDACore/src/ContextState.cc
new file mode 100644
index 0000000000000..0670f01d472f3
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/ContextState.cc
@@ -0,0 +1,16 @@
+#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+namespace cms::cuda {
+ void ContextState::throwIfStream() const {
+ if (stream_) {
+ throw cms::Exception("LogicError") << "Trying to set ContextState, but it already had a valid state";
+ }
+ }
+
+ void ContextState::throwIfNoStream() const {
+ if (not stream_) {
+ throw cms::Exception("LogicError") << "Trying to get ContextState, but it did not have a valid state";
+ }
+ }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
new file mode 100644
index 0000000000000..7461ebbee9f0d
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -0,0 +1,118 @@
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace {
+ struct CallbackData {
+ edm::WaitingTaskWithArenaHolder holder;
+ int device;
+ };
+
+ void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
+ std::unique_ptr<CallbackData> guard{reinterpret_cast<CallbackData*>(data)};
+ edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
+ int device = guard->device;
+ if (status == cudaSuccess) {
+ LogTrace("ScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream "
+ << streamId;
+ waitingTaskHolder.doneWaiting(nullptr);
+ } else {
+ // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+ try {
+ auto error = cudaGetErrorName(status);
+ auto message = cudaGetErrorString(status);
+ throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
+ << " error " << error << ": " << message;
+ } catch (cms::Exception&) {
+ waitingTaskHolder.doneWaiting(std::current_exception());
+ }
+ }
+ }
+} // namespace
+
+namespace cms::cuda {
+ namespace impl {
+ ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ stream_ = getStreamCache().get();
+ }
+
+ ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ if (data.mayReuseStream()) {
+ stream_ = data.streamPtr();
+ } else {
+ stream_ = getStreamCache().get();
+ }
+ }
+
+ ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
+ : currentDevice_(device), stream_(std::move(stream)) {
+ cudaCheck(cudaSetDevice(currentDevice_));
+ }
+
+ ////////////////////
+
+ void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
+ cudaStream_t dataStream,
+ bool available,
+ cudaEvent_t dataEvent) {
+ if (dataDevice != device()) {
+ // Eventually replace with prefetch to current device (assuming unified memory works)
+ // If we won't go to unified memory, need to figure out something else...
+ throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+ }
+
+ if (dataStream != stream()) {
+ // Different streams, need to synchronize
+ if (not available) {
+ // Event not yet occurred, so need to add synchronization
+ // here. Synchronization is done by making the CUDA stream
+ // wait for an event, so all subsequent work in the stream
+ // will run only after the event has "occurred" (i.e. the data
+ // product became available).
+ cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+ }
+ }
+ }
+
+ void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
+ cudaCheck(
+ cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
+ }
+ } // namespace impl
+
+ ////////////////////
+
+ ScopedContextAcquire::~ScopedContextAcquire() {
+ holderHelper_.enqueueCallback(device(), stream());
+ if (contextState_) {
+ contextState_->set(device(), std::move(streamPtr()));
+ }
+ }
+
+ void ScopedContextAcquire::throwNoState() {
+ throw cms::Exception("LogicError")
+ << "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with "
+ "ContextState, but that was not the case";
+ }
+
+ ////////////////////
+
+ ScopedContextProduce::~ScopedContextProduce() {
+ // Intentionally not checking the return value to avoid throwing
+ // exceptions. If this call would fail, we should get failures
+ // elsewhere as well.
+ cudaEventRecord(event_.get(), stream());
+ }
+
+ ////////////////////
+
+ ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.cc b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
new file mode 100644
index 0000000000000..7312760be7d84
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
@@ -0,0 +1,18 @@
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda {
+ int chooseDevice(edm::StreamID id) {
+ edm::Service<CUDAService> cudaService;
+
+ // For starters, we "statically" assign the device based on
+ // edm::Stream number. This is suboptimal if the number of
+ // edm::Streams is not a multiple of the number of CUDA devices
+ // (and even then there is no load balancing).
+ //
+ // TODO: improve the "assignment" logic
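+ //
+ // As an illustration of this static assignment: with 4 EDM streams and
+ // 2 CUDA devices, streams 0 and 2 map to device 0 and streams 1 and 3
+ // map to device 1.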
+ return id % cudaService->numberOfDevices();
+ }
+} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.h b/HeterogeneousCore/CUDACore/src/chooseDevice.h
new file mode 100644
index 0000000000000..ab642325aaecf
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseDevice.h
@@ -0,0 +1,10 @@
+#ifndef HeterogeneousCore_CUDACore_chooseDevice_h
+#define HeterogeneousCore_CUDACore_chooseDevice_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace cms::cuda {
+ int chooseDevice(edm::StreamID id);
+}
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
new file mode 100644
index 0000000000000..474f3cee1b3e8
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
new file mode 100644
index 0000000000000..b0a66e2b777e4
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu
@@ -0,0 +1,134 @@
+/**
+ * The purpose of this test program is to ensure that the logic for
+ * CUDA event use in cms::cuda::Product and cms::cuda::ScopedContext
+ * works as intended.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+namespace {
+ constexpr int ARRAY_SIZE = 20000000;
+ constexpr int NLOOPS = 10;
+} // namespace
+
+__global__ void kernel_looping(float *point, unsigned int num) {
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+ for (int iloop = 0; iloop < NLOOPS; ++iloop) {
+ for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x) {
+ point[offset] += 1;
+ }
+ }
+}
+
+int main() {
+ cms::cudatest::requireDevices();
+
+ constexpr bool debug = false;
+
+ float *dev_points1;
+ float *host_points1;
+ cudaStream_t stream1, stream2;
+ cudaEvent_t event1, event2;
+
+ cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float)));
+ cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float)));
+ cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
+ cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
+ cudaEventCreate(&event1);
+ cudaEventCreate(&event2);
+
+ for (size_t j = 0; j < ARRAY_SIZE; ++j) {
+ host_points1[j] = static_cast(j);
+ }
+
+ cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1));
+ kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE);
+ if (debug)
+ std::cout << "Kernel launched on stream1" << std::endl;
+
+ auto status = cudaStreamQuery(stream1);
+ if (debug)
+ std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ cudaEventRecord(event1, stream1);
+ status = cudaEventQuery(event1);
+ if (debug)
+ std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaErrorNotReady);
+
+ status = cudaStreamQuery(stream2);
+ if (debug)
+ std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ assert(status == cudaSuccess);
+ if (debug) {
+ cudaEventRecord(event2, stream2);
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ }
+
+ cudaStreamWaitEvent(stream2, event1, 0);
+ if (debug)
+ std::cout << "\nStream2 waiting for event1" << std::endl;
+ status = cudaStreamQuery(stream2);
+ if (debug)
+ std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl;
+ assert(status == cudaErrorNotReady);
+ cudaEventRecord(event2, stream2);
+ status = cudaEventQuery(event2);
+ if (debug)
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaErrorNotReady);
+ if (debug) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ status = cudaEventQuery(event2);
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ }
+
+ status = cudaStreamQuery(stream1);
+ if (debug) {
+ std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess)
+ << std::endl;
+ std::cout << "Synchronizing stream1" << std::endl;
+ }
+ assert(status == cudaErrorNotReady);
+ cudaStreamSynchronize(stream1);
+ if (debug)
+ std::cout << "Synchronized stream1" << std::endl;
+
+ status = cudaEventQuery(event1);
+ if (debug)
+ std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaSuccess);
+ status = cudaEventQuery(event2);
+ if (debug)
+ std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess)
+ << std::endl;
+ assert(status == cudaSuccess);
+
+ cudaFree(dev_points1);
+ cudaFreeHost(host_points1);
+ cudaStreamDestroy(stream1);
+ cudaStreamDestroy(stream2);
+ cudaEventDestroy(event1);
+ cudaEventDestroy(event2);
+
+ return 0;
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
new file mode 100644
index 0000000000000..7a6543a667c3d
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
@@ -0,0 +1,134 @@
+#include "catch.hpp"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "FWCore/Concurrency/interface/WaitingTask.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
+
+#include "test_ScopedContextKernels.h"
+
+namespace cms::cudatest {
+ class TestScopedContext {
+ public:
+ static cuda::ScopedContextProduce make(int dev, bool createEvent) {
+ cms::cuda::SharedEventPtr event;
+ if (createEvent) {
+ event = cms::cuda::getEventCache().get();
+ }
+ return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
+ }
+ };
+} // namespace cms::cudatest
+
+namespace {
+ std::unique_ptr<cms::cuda::Product<int*>> produce(int device, int* d, int* h) {
+ auto ctx = cms::cudatest::TestScopedContext::make(device, true);
+ cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
+ cms::cudatest::testScopedContextKernels_single(d, ctx.stream());
+ return ctx.wrap(d);
+ }
+} // namespace
+
+TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") {
+ if (not cms::cudatest::testDevices()) {
+ return;
+ }
+
+ constexpr int defaultDevice = 0;
+ {
+ auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
+
+ SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); }
+
+ SECTION("Wrap T to cms::cuda::Product") {
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ REQUIRE(dataPtr.get() != nullptr);
+ REQUIRE(dataPtr->device() == ctx.device());
+ REQUIRE(dataPtr->stream() == ctx.stream());
+ }
+
+ SECTION("Construct from from cms::cuda::Product") {
+ std::unique_ptr> dataPtr = ctx.wrap(10);
+ const auto& data = *dataPtr;
+
+ cms::cuda::ScopedContextProduce ctx2{data};
+ REQUIRE(cms::cuda::currentDevice() == data.device());
+ REQUIRE(ctx2.stream() == data.stream());
+
+ // Second use of a product should lead to a new stream
+ cms::cuda::ScopedContextProduce ctx3{data};
+ REQUIRE(cms::cuda::currentDevice() == data.device());
+ REQUIRE(ctx3.stream() != data.stream());
+ }
+
+ SECTION("Storing state in cms::cuda::ContextState") {
+ cms::cuda::ContextState ctxstate;
+ { // acquire
+ std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
+ const auto& data = *dataPtr;
+ edm::WaitingTaskWithArenaHolder dummy{
+ edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})};
+ cms::cuda::ScopedContextAcquire ctx2{data, std::move(dummy), ctxstate};
+ }
+
+ { // produce
+ cms::cuda::ScopedContextProduce ctx2{ctxstate};
+ REQUIRE(cms::cuda::currentDevice() == ctx.device());
+ REQUIRE(ctx2.stream() == ctx.stream());
+ }
+ }
+
+ SECTION("Joining multiple CUDA streams") {
+ cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice);
+
+ // Mimic a producer on the first CUDA stream
+ int h_a1 = 1;
+ auto d_a1 = cms::cuda::make_device_unique<int>(nullptr);
+ auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
+
+ // Mimic a producer on the second CUDA stream
+ int h_a2 = 2;
+ auto d_a2 = cms::cuda::make_device_unique<int>(nullptr);
+ auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
+
+ REQUIRE(wprod1->stream() != wprod2->stream());
+
+ // Mimic a third producer "joining" the two streams
+ cms::cuda::ScopedContextProduce ctx2{*wprod1};
+
+ auto prod1 = ctx2.get(*wprod1);
+ auto prod2 = ctx2.get(*wprod2);
+
+ auto d_a3 = cms::cuda::make_device_unique<int>(nullptr);
+ cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
+ cudaCheck(cudaStreamSynchronize(ctx2.stream()));
+ REQUIRE(wprod2->isAvailable());
+ REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event()));
+
+ h_a1 = 0;
+ h_a2 = 0;
+ int h_a3 = 0;
+
+ cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+ cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+ cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+
+ REQUIRE(h_a1 == 2);
+ REQUIRE(h_a2 == 4);
+ REQUIRE(h_a3 == 6);
+ }
+ }
+
+ cudaCheck(cudaSetDevice(defaultDevice));
+ cudaCheck(cudaDeviceSynchronize());
+ // Note: CUDA resources are cleaned up by the destructors of the global cache objects
+}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
new file mode 100644
index 0000000000000..b87f1e20a5f24
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
@@ -0,0 +1,17 @@
+#include "test_ScopedContextKernels.h"
+
+namespace {
+ __global__ void single_mul(int *d) { d[0] = d[0] * 2; }
+
+ __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; }
+} // namespace
+
+namespace cms {
+ namespace cudatest {
+ void testScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); }
+
+ void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) {
+ join_add<<<1, 1, 0, stream>>>(d1, d2, d3);
+ }
+ } // namespace cudatest
+} // namespace cms
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
new file mode 100644
index 0000000000000..dfc55682afc76
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
@@ -0,0 +1,13 @@
+#ifndef HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
+#define HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
+
+#include
+
+namespace cms {
+ namespace cudatest {
+ void testScopedContextKernels_single(int *d, cudaStream_t stream);
+ void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream);
+ } // namespace cudatest
+} // namespace cms
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
new file mode 100644
index 0000000000000..2e1027598a4de
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_main.cc
@@ -0,0 +1,31 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+
+#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h"
+#include "FWCore/PluginManager/interface/PluginManager.h"
+#include "FWCore/PluginManager/interface/standard.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+
+class ServiceRegistryListener : public Catch::TestEventListenerBase {
+public:
+ using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor
+
+ void testRunStarting(Catch::TestRunInfo const& testRunInfo) override {
+ edmplugin::PluginManager::configure(edmplugin::standard::config());
+
+ const std::string config{
+ R"_(import FWCore.ParameterSet.Config as cms
+process = cms.Process('Test')
+process.CUDAService = cms.Service('CUDAService')
+)_"};
+
+ std::unique_ptr<edm::ParameterSet> params;
+ edm::makeParameterSets(config, params);
+ edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params)));
+ operate_.reset(new edm::ServiceRegistry::Operate(tempToken));
+ }
+
+private:
+ std::unique_ptr<edm::ServiceRegistry::Operate> operate_;
+};
+CATCH_REGISTER_LISTENER(ServiceRegistryListener);
diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml
new file mode 100644
index 0000000000000..c2d566baf5c1b
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/BuildFile.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
new file mode 100644
index 0000000000000..68e32b64b4032
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp
new file mode 100644
index 0000000000000..5a65575873116
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp
@@ -0,0 +1,23 @@
+// C++ standard headers
+#include
+#include
+
+// CUDA headers
+#include
+
+// CMSSW headers
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+int main() {
+ int devices = 0;
+ cudaCheck(cudaGetDeviceCount(&devices));
+
+ for (int i = 0; i < devices; ++i) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, i);
+ std::cout << std::setw(4) << i << " " << std::setw(2) << properties.major << "." << properties.minor << " "
+ << properties.name << std::endl;
+ }
+
+ return 0;
+}
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
new file mode 100644
index 0000000000000..eb21f22cd0c5c
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -0,0 +1,31 @@
+#include
+#include
+#include
+#include
+
+#include
+
+int main() {
+ int devices = 0;
+ auto status = cudaGetDeviceCount(&devices);
+ if (status != cudaSuccess) {
+ return EXIT_FAILURE;
+ }
+
+ int minimumMajor = 6; // min minor is implicitly 0
+
+ // This approach (requiring that all devices are supported) is rather
+ // conservative. In principle we could consider just dropping the
+ // unsupported devices. Currently that would be easiest to achieve
+ // in CUDAService, though.
+ for (int i = 0; i < devices; ++i) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, i);
+
+ if ((not(properties.major == 3 and properties.minor == 5)) and properties.major < minimumMajor) {
+ return EXIT_FAILURE;
+ }
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
new file mode 100644
index 0000000000000..5295af75513b0
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDAServices_CUDAService_h
+#define HeterogeneousCore_CUDAServices_CUDAService_h
+
+#include
+#include
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace edm {
+ class ParameterSet;
+ class ActivityRegistry;
+ class ConfigurationDescriptions;
+} // namespace edm
+
+class CUDAService {
+public:
+ CUDAService(edm::ParameterSet const& iConfig);
+ ~CUDAService();
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ bool enabled() const { return enabled_; }
+
+ int numberOfDevices() const { return numberOfDevices_; }
+
+ // major, minor
+ std::pair<int, int> computeCapability(int device) const { return computeCapabilities_.at(device); }
+
+ // Returns the id of the device with the most free memory. If none is found, returns -1.
+ int deviceWithMostFreeMemory() const;
+
+private:
+ int numberOfDevices_ = 0;
+ std::vector<std::pair<int, int>> computeCapabilities_;
+ bool enabled_ = false;
+};
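+
+// A minimal usage sketch (illustrative only; modules would normally go
+// through cms::cuda::ScopedContext* instead of using the service directly):
+//
+//   edm::Service<CUDAService> cs;
+//   if (cs->enabled()) {
+//     for (int d = 0; d < cs->numberOfDevices(); ++d) {
+//       auto [major, minor] = cs->computeCapability(d);
+//       // ... decide whether the device is usable ...
+//     }
+//   }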
+
+#endif
diff --git a/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h
new file mode 100644
index 0000000000000..b563b98b516cf
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h
@@ -0,0 +1,9 @@
+#ifndef HeterogeneousCore_CUDAServices_numberOfCUDADevices_h
+#define HeterogeneousCore_CUDAServices_numberOfCUDADevices_h
+
+// Returns the number of CUDA devices
+// The difference wrt. the standard CUDA function is that if
+// CUDAService is disabled, this function returns 0.
+int numberOfCUDADevices();
+
+#endif
diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
index afcf86afdef75..95857d74e7dfa 100644
--- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml
@@ -1,18 +1,17 @@
-#Skip building plugins by dropping all files for none-AMD64 build
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
new file mode 100644
index 0000000000000..6d8527935e334
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -0,0 +1,107 @@
+#include
+
+#include
+
+#include "DataFormats/Provenance/interface/ModuleDescription.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace edm {
+ class StreamContext;
+}
+
+class CUDAMonitoringService {
+public:
+ CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
+ ~CUDAMonitoringService() = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void postModuleConstruction(edm::ModuleDescription const& desc);
+ void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
+ void postEvent(edm::StreamContext const& sc);
+
+private:
+ int numberOfDevices_ = 0;
+};
+
+CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
+ // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
+ edm::Service<CUDAService> cudaService;
+ if (!cudaService->enabled())
+ return;
+ numberOfDevices_ = cudaService->numberOfDevices();
+
+ if (config.getUntrackedParameter<bool>("memoryConstruction")) {
+ registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
+ }
+ if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
+ registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
+ }
+ if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
+ registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
+ }
+}
+
+void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.addUntracked("memoryConstruction", false)
+ ->setComment("Print memory information for each device after the construction of each module");
+ desc.addUntracked("memoryBeginStream", true)
+ ->setComment("Print memory information for each device after the beginStream() of each module");
+ desc.addUntracked("memoryPerEvent", true)
+ ->setComment("Print memory information for each device after each event");
+
+ descriptions.add("CUDAMonitoringService", desc);
+ descriptions.setComment(
+ "The memory information is the global state of the device. This gets confusing if there are multiple processes "
+ "running on the same device. Probably the information retrieval should be re-thought?");
+}
+
+// activity handlers
+namespace {
+ template <typename T>
+ void dumpUsedMemory(T& log, int num) {
+ int old = 0;
+ cudaCheck(cudaGetDevice(&old));
+ for (int i = 0; i < num; ++i) {
+ size_t freeMemory, totalMemory;
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+ log << "\n"
+ << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20)
+ << " MB total";
+ }
+ cudaCheck(cudaSetDevice(old));
+ }
+} // namespace
+
+void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
+ << mcc.moduleDescription()->moduleName() << ")";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
+ auto log = edm::LogPrint("CUDAMonitoringService");
+ log << "CUDA device memory after event";
+ dumpUsedMemory(log, numberOfDevices_);
+}
+
+DEFINE_FWK_SERVICE(CUDAMonitoringService);
diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
index ec8c4deac4d4d..29fa1ab959025 100644
--- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc
@@ -41,6 +41,7 @@
#include "FWCore/Utilities/interface/Exception.h"
#include "FWCore/Utilities/interface/ProductKindOfType.h"
#include "FWCore/Utilities/interface/TimeOfDay.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
using namespace std::string_literals;
@@ -285,9 +286,8 @@ class NVProfilerService {
std::vector highlightModules_;
const bool showModulePrefetching_;
- bool skipFirstEvent_;
+ const bool skipFirstEvent_;
- unsigned int concurrentStreams_;
 std::atomic<bool> globalFirstEventDone_ = false;
 std::vector<std::atomic<bool>> streamFirstEventDone_;
 std::vector<nvtxRangeId_t> event_; // per-stream event ranges
@@ -295,49 +295,22 @@ class NVProfilerService {
// use a tbb::concurrent_vector rather than an std::vector because its final size is not known
 tbb::concurrent_vector<nvtxRangeId_t> global_modules_; // global per-module events
-private:
- struct Domains {
- nvtxDomainHandle_t global;
- std::vector<nvtxDomainHandle_t> stream;
-
- Domains(NVProfilerService* service) {
- global = nvtxDomainCreate("EDM Global");
- allocate_streams(service->concurrentStreams_);
- }
-
- ~Domains() {
- nvtxDomainDestroy(global);
- for (unsigned int sid = 0; sid < stream.size(); ++sid) {
- nvtxDomainDestroy(stream[sid]);
- }
- }
-
- void allocate_streams(unsigned int streams) {
- stream.resize(streams);
- for (unsigned int sid = 0; sid < streams; ++sid) {
- stream[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str());
- }
- }
- };
-
- // allow access to concurrentStreams_
- friend struct Domains;
-
- tbb::enumerable_thread_specific<Domains> domains_;
-
- nvtxDomainHandle_t global_domain() { return domains_.local().global; }
-
- nvtxDomainHandle_t stream_domain(unsigned int sid) { return domains_.local().stream.at(sid); }
+ nvtxDomainHandle_t global_domain_; // NVTX domain for global EDM transitions
+ std::vector<nvtxDomainHandle_t> stream_domain_; // NVTX domains for per-EDM-stream transitions
};
NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::ActivityRegistry& registry)
 : highlightModules_(config.getUntrackedParameter<std::vector<std::string>>("highlightModules")),
 showModulePrefetching_(config.getUntrackedParameter<bool>("showModulePrefetching")),
- skipFirstEvent_(config.getUntrackedParameter<bool>("skipFirstEvent")),
- concurrentStreams_(0),
- domains_(this) {
+ skipFirstEvent_(config.getUntrackedParameter<bool>("skipFirstEvent")) {
+ // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
+ edm::Service<CUDAService> cudaService;
+
std::sort(highlightModules_.begin(), highlightModules_.end());
+ // create the NVTX domain for global EDM transitions
+ global_domain_ = nvtxDomainCreate("EDM Global");
+
// enables profile collection; if profiling is already enabled it has no effect
if (not skipFirstEvent_) {
cudaProfilerStart();
@@ -491,7 +464,13 @@ NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::Activ
registry.watchPostEventReadFromSource(this, &NVProfilerService::postEventReadFromSource);
}
-NVProfilerService::~NVProfilerService() { cudaProfilerStop(); }
+NVProfilerService::~NVProfilerService() {
+ for (unsigned int sid = 0; sid < stream_domain_.size(); ++sid) {
+ nvtxDomainDestroy(stream_domain_[sid]);
+ }
+ nvtxDomainDestroy(global_domain_);
+ cudaProfilerStop();
+}
void NVProfilerService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
edm::ParameterSetDescription desc;
@@ -517,17 +496,20 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) {
out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, "
<< bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams()
<< " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads";
- nvtxDomainMark(global_domain(), out.str().c_str());
+ nvtxDomainMark(global_domain_, out.str().c_str());
- concurrentStreams_ = bounds.maxNumberOfStreams();
- for (auto& domain : domains_) {
- domain.allocate_streams(concurrentStreams_);
+ auto concurrentStreams = bounds.maxNumberOfStreams();
+ // create the NVTX domains for per-EDM-stream transitions
+ stream_domain_.resize(concurrentStreams);
+ for (unsigned int sid = 0; sid < concurrentStreams; ++sid) {
+ stream_domain_[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str());
}
- event_.resize(concurrentStreams_);
- stream_modules_.resize(concurrentStreams_);
+
+ event_.resize(concurrentStreams);
+ stream_modules_.resize(concurrentStreams);
if (skipFirstEvent_) {
globalFirstEventDone_ = false;
- std::vector<std::atomic<bool>> tmp(concurrentStreams_);
+ std::vector<std::atomic<bool>> tmp(concurrentStreams);
for (auto& element : tmp)
std::atomic_init(&element, false);
streamFirstEventDone_ = std::move(tmp);
@@ -536,86 +518,86 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) {
void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes,
edm::ProcessContext const& pc) {
- nvtxDomainMark(global_domain(), "preBeginJob");
+ nvtxDomainMark(global_domain_, "preBeginJob");
// FIXME this probably works only in the absence of subprocesses
// size() + 1 because pathsAndConsumes.allModules() does not include the source
unsigned int modules = pathsAndConsumes.allModules().size() + 1;
global_modules_.resize(modules, nvtxInvalidRangeId);
- for (unsigned int sid = 0; sid < concurrentStreams_; ++sid) {
+ for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) {
stream_modules_[sid].resize(modules, nvtxInvalidRangeId);
}
}
void NVProfilerService::postBeginJob() {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainMark(global_domain(), "postBeginJob");
+ nvtxDomainMark(global_domain_, "postBeginJob");
}
}
void NVProfilerService::postEndJob() {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainMark(global_domain(), "postEndJob");
+ nvtxDomainMark(global_domain_, "postEndJob");
}
}
void NVProfilerService::preSourceEvent(edm::StreamID sid) {
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "source");
+ nvtxDomainRangePush(stream_domain_[sid], "source");
}
}
void NVProfilerService::postSourceEvent(edm::StreamID sid) {
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preSourceLumi(edm::LuminosityBlockIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "source lumi");
+ nvtxDomainRangePush(global_domain_, "source lumi");
}
}
void NVProfilerService::postSourceLumi(edm::LuminosityBlockIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preSourceRun(edm::RunIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "source run");
+ nvtxDomainRangePush(global_domain_, "source run");
}
}
void NVProfilerService::postSourceRun(edm::RunIndex index) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preOpenFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), ("open file "s + lfn).c_str());
+ nvtxDomainRangePush(global_domain_, ("open file "s + lfn).c_str());
}
}
void NVProfilerService::postOpenFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preCloseFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), ("close file "s + lfn).c_str());
+ nvtxDomainRangePush(global_domain_, ("close file "s + lfn).c_str());
}
}
void NVProfilerService::postCloseFile(std::string const& lfn, bool) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
@@ -626,7 +608,7 @@ void NVProfilerService::preModuleBeginStream(edm::StreamContext const& sc, edm::
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " begin stream";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -634,7 +616,7 @@ void NVProfilerService::postModuleBeginStream(edm::StreamContext const& sc, edm:
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -646,7 +628,7 @@ void NVProfilerService::preModuleEndStream(edm::StreamContext const& sc, edm::Mo
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " end stream";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -654,124 +636,124 @@ void NVProfilerService::postModuleEndStream(edm::StreamContext const& sc, edm::M
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
void NVProfilerService::preGlobalBeginRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global begin run");
+ nvtxDomainRangePush(global_domain_, "global begin run");
}
}
void NVProfilerService::postGlobalBeginRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preGlobalEndRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global end run");
+ nvtxDomainRangePush(global_domain_, "global end run");
}
}
void NVProfilerService::postGlobalEndRun(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preStreamBeginRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream begin run");
+ nvtxDomainRangePush(stream_domain_[sid], "stream begin run");
}
}
void NVProfilerService::postStreamBeginRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preStreamEndRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream end run");
+ nvtxDomainRangePush(stream_domain_[sid], "stream end run");
}
}
void NVProfilerService::postStreamEndRun(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preGlobalBeginLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global begin lumi");
+ nvtxDomainRangePush(global_domain_, "global begin lumi");
}
}
void NVProfilerService::postGlobalBeginLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preGlobalEndLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePush(global_domain(), "global end lumi");
+ nvtxDomainRangePush(global_domain_, "global end lumi");
}
}
void NVProfilerService::postGlobalEndLumi(edm::GlobalContext const& gc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
- nvtxDomainRangePop(global_domain());
+ nvtxDomainRangePop(global_domain_);
}
}
void NVProfilerService::preStreamBeginLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePush(stream_domain(sid), "stream begin lumi");
+ nvtxDomainRangePush(stream_domain_[sid], "stream begin lumi");
}
}
void NVProfilerService::postStreamBeginLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preStreamEndLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
- nvtxDomainRangePush(stream_domain(sid), "stream end lumi");
+ nvtxDomainRangePush(stream_domain_[sid], "stream end lumi");
}
void NVProfilerService::postStreamEndLumi(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangePop(stream_domain(sid));
+ nvtxDomainRangePop(stream_domain_[sid]);
}
}
void NVProfilerService::preEvent(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- event_[sid] = nvtxDomainRangeStartColor(stream_domain(sid), "event", nvtxDarkGreen);
+ event_[sid] = nvtxDomainRangeStartColor(stream_domain_[sid], "event", nvtxDarkGreen);
}
}
void NVProfilerService::postEvent(edm::StreamContext const& sc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainRangeEnd(stream_domain(sid), event_[sid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], event_[sid]);
event_[sid] = nvtxInvalidRangeId;
} else {
streamFirstEventDone_[sid] = true;
@@ -787,7 +769,7 @@ void NVProfilerService::postEvent(edm::StreamContext const& sc) {
void NVProfilerService::prePathEvent(edm::StreamContext const& sc, edm::PathContext const& pc) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainMark(global_domain(), ("before path "s + pc.pathName()).c_str());
+ nvtxDomainMark(global_domain_, ("before path "s + pc.pathName()).c_str());
}
}
@@ -796,7 +778,7 @@ void NVProfilerService::postPathEvent(edm::StreamContext const& sc,
edm::HLTPathStatus const& hlts) {
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
- nvtxDomainMark(global_domain(), ("after path "s + pc.pathName()).c_str());
+ nvtxDomainMark(global_domain_, ("after path "s + pc.pathName()).c_str());
}
}
@@ -807,7 +789,7 @@ void NVProfilerService::preModuleEventPrefetching(edm::StreamContext const& sc,
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " prefetching";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label));
}
}
@@ -815,7 +797,7 @@ void NVProfilerService::postModuleEventPrefetching(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -826,14 +808,14 @@ void NVProfilerService::preModuleConstruction(edm::ModuleDescription const& desc
global_modules_.grow_to_at_least(mid + 1);
auto const& label = desc.moduleLabel();
auto const& msg = label + " construction";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleConstruction(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -843,14 +825,14 @@ void NVProfilerService::preModuleBeginJob(edm::ModuleDescription const& desc) {
auto mid = desc.id();
auto const& label = desc.moduleLabel();
auto const& msg = label + " begin job";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleBeginJob(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -860,14 +842,14 @@ void NVProfilerService::preModuleEndJob(edm::ModuleDescription const& desc) {
auto mid = desc.id();
auto const& label = desc.moduleLabel();
auto const& msg = label + " end job";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleEndJob(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -879,7 +861,7 @@ void NVProfilerService::preModuleEventAcquire(edm::StreamContext const& sc, edm:
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " acquire";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -887,7 +869,7 @@ void NVProfilerService::postModuleEventAcquire(edm::StreamContext const& sc, edm
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -898,7 +880,7 @@ void NVProfilerService::preModuleEvent(edm::StreamContext const& sc, edm::Module
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColor(label));
}
}
@@ -906,7 +888,7 @@ void NVProfilerService::postModuleEvent(edm::StreamContext const& sc, edm::Modul
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -919,7 +901,7 @@ void NVProfilerService::preModuleEventDelayedGet(edm::StreamContext const& sc, e
auto const & label = mcc.moduleDescription()->moduleLabel();
auto const & msg = label + " delayed get";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColorLight(label));
}
*/
}
@@ -929,7 +911,7 @@ void NVProfilerService::postModuleEventDelayedGet(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
*/
@@ -943,7 +925,7 @@ void NVProfilerService::preEventReadFromSource(edm::StreamContext const& sc, edm
auto const & label = mcc.moduleDescription()->moduleLabel();
auto const & msg = label + " read from source";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label));
}
*/
}
@@ -953,7 +935,7 @@ void NVProfilerService::postEventReadFromSource(edm::StreamContext const& sc, ed
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
*/
@@ -966,7 +948,7 @@ void NVProfilerService::preModuleStreamBeginRun(edm::StreamContext const& sc, ed
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream begin run";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -974,7 +956,7 @@ void NVProfilerService::postModuleStreamBeginRun(edm::StreamContext const& sc, e
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -986,7 +968,7 @@ void NVProfilerService::preModuleStreamEndRun(edm::StreamContext const& sc, edm:
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream end run";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -994,7 +976,7 @@ void NVProfilerService::postModuleStreamEndRun(edm::StreamContext const& sc, edm
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1006,7 +988,7 @@ void NVProfilerService::preModuleStreamBeginLumi(edm::StreamContext const& sc, e
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream begin lumi";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -1014,7 +996,7 @@ void NVProfilerService::postModuleStreamBeginLumi(edm::StreamContext const& sc,
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1026,7 +1008,7 @@ void NVProfilerService::preModuleStreamEndLumi(edm::StreamContext const& sc, edm
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " stream end lumi";
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId);
- stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label));
+ stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label));
}
}
@@ -1034,7 +1016,7 @@ void NVProfilerService::postModuleStreamEndLumi(edm::StreamContext const& sc, ed
auto sid = sc.streamID();
if (not skipFirstEvent_ or streamFirstEventDone_[sid]) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]);
+ nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]);
stream_modules_[sid][mid] = nvtxInvalidRangeId;
}
}
@@ -1044,14 +1026,14 @@ void NVProfilerService::preModuleGlobalBeginRun(edm::GlobalContext const& gc, ed
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global begin run";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalBeginRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1061,14 +1043,14 @@ void NVProfilerService::preModuleGlobalEndRun(edm::GlobalContext const& gc, edm:
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global end run";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalEndRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1078,14 +1060,14 @@ void NVProfilerService::preModuleGlobalBeginLumi(edm::GlobalContext const& gc, e
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global begin lumi";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalBeginLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1095,14 +1077,14 @@ void NVProfilerService::preModuleGlobalEndLumi(edm::GlobalContext const& gc, edm
auto mid = mcc.moduleDescription()->id();
auto const& label = mcc.moduleDescription()->moduleLabel();
auto const& msg = label + " global end lumi";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) {
if (not skipFirstEvent_ or globalFirstEventDone_) {
auto mid = mcc.moduleDescription()->id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
@@ -1113,14 +1095,14 @@ void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc
global_modules_.grow_to_at_least(mid + 1);
auto const& label = desc.moduleLabel();
auto const& msg = label + " construction";
- global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label));
+ global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label));
}
}
void NVProfilerService::postSourceConstruction(edm::ModuleDescription const& desc) {
if (not skipFirstEvent_) {
auto mid = desc.id();
- nvtxDomainRangeEnd(global_domain(), global_modules_[mid]);
+ nvtxDomainRangeEnd(global_domain_, global_modules_[mid]);
global_modules_[mid] = nvtxInvalidRangeId;
}
}
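For reference, the calls renamed in the hunks above follow the standard NVTX domain/range pattern. Below is a minimal, self-contained sketch using the plain NVTX C API; the service itself goes through its own thin wrappers and member domains, so the domain name, label and colour here are purely illustrative:

    #include <nvToolsExt.h>

    void profiledRegion() {
      // create a named domain once; ranges pushed on it are grouped together in the profiler
      nvtxDomainHandle_t domain = nvtxDomainCreateA("ExampleDomain");

      // describe the range: an ASCII label plus an ARGB colour
      nvtxEventAttributes_t attr = {};
      attr.version = NVTX_VERSION;
      attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
      attr.message.ascii = "example range";
      attr.colorType = NVTX_COLOR_ARGB;
      attr.color = 0xff00ff00;  // opaque green

      nvtxDomainRangePushEx(domain, &attr);  // open the range
      // ... region of interest ...
      nvtxDomainRangePop(domain);            // close the range

      nvtxDomainDestroy(domain);
    }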
diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc
new file mode 100644
index 0000000000000..120ca10cb13ce
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc
@@ -0,0 +1,4 @@
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+DEFINE_FWK_SERVICE_MAKER(CUDAService, edm::serviceregistry::ParameterSetMaker<CUDAService>);
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
new file mode 100644
index 0000000000000..bde3e26382976
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
@@ -0,0 +1,10 @@
+#! /bin/bash -e
+
+# move to the .../src directory
+cd $CMSSW_BASE/src/
+
+# check out all packages containing .cu files
+git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg
+
+# rebuild all checked out packages
+scram b -j
diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
new file mode 100644
index 0000000000000..f3335f4cd409f
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+TOOL=$CMSSW_BASE/config/toolbox/$SCRAM_ARCH/tools/selected/cuda.xml
+
+# enumerate the supported streaming multiprocessor (sm) compute capabilities
+DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
+CAPS=$(echo $DOTS | sed -e's#\.*##g')
+
+# remove existing capabilities
+sed -i $TOOL -e'\##d'
+
+# add support for the capabilities found on this machine
+for CAP in $CAPS; do
+ sed -i $TOOL -e"\##a\ "
+done
+
+# reconfigure the cuda.xml tool
+scram setup cuda
+
+echo "SCRAM configured to support CUDA streaming multiprocessor architectures $DOTS"
diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py
new file mode 100644
index 0000000000000..331ddd30f73bd
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import sys
+import argparse
+
+def main(opts):
+ device = []
+ host = []
+
+    device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes")
+    host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes")
+
+ f = open(opts.file)
+ for line in f:
+ m = device_re.search(line)
+ if m:
+ device.append(m.group("bytes"))
+ continue
+ m = host_re.search(line)
+ if m:
+ host.append(m.group("bytes"))
+ f.close()
+
+ print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device))
+ print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host))
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file.
+
+To use, run the job once with "process.CUDAService.allocator.debug =
+True" and direct the output to a file. Then run this script by passing
+the file as an argument, and copy the output of this script back to
+the configuration file.""")
+ parser.add_argument("file", type=str, help="Log file to parse")
+ opts = parser.parse_args()
+ main(opts)
diff --git a/HeterogeneousCore/CUDAServices/scripts/nvprof-remote b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote
new file mode 100644
index 0000000000000..3b010c005291f
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote
@@ -0,0 +1,23 @@
+#! /bin/bash
+
+# find the CMSSW release
+if [ -z "$CMSSW_BASE" ]; then
+ export CMSSW_BASE=$(readlink -f $(dirname $0)/../..)
+fi
+
+# load the CMS environment
+source $(< "$CMSSW_BASE"/config/scram_basedir)/cmsset_default.sh
+
+# load the CMSSW release environment
+eval `cd "$CMSSW_BASE"; scram runtime -sh 2> /dev/null`
+
+# log the commands being run
+{
+ date
+ echo "cwd: $PWD"
+ echo "cmd: $0 $@"
+ echo
+} >> $CMSSW_BASE/tmp/nvprof.log
+
+# run the CUDA profiler
+nvprof "$@"
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
new file mode 100644
index 0000000000000..5d1bd30264186
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -0,0 +1,389 @@
+#include
+#include
+#include
+
+#include
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
+#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
+
+void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
+ // read the current device
+ int device;
+ cudaCheck(cudaGetDevice(&device));
+ // try to set the requested limit
+ auto result = cudaDeviceSetLimit(limit, request);
+ if (cudaErrorUnsupportedLimit == result) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
+ return;
+ }
+ // read back the limit value
+ size_t value;
+ cudaCheck(cudaDeviceGetLimit(&value, limit));
+ if (cudaSuccess != result) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
+ << request << ", current value is " << value;
+ } else if (value != request) {
+ edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
+ << " instead of requested " << request;
+ }
+}
+
+constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
+ switch (major * 10 + minor) {
+ // Fermi architecture
+ case 20: // SM 2.0: GF100 class
+ return 32;
+ case 21: // SM 2.1: GF10x class
+ return 48;
+
+ // Kepler architecture
+ case 30: // SM 3.0: GK10x class
+ case 32: // SM 3.2: GK10x class
+ case 35: // SM 3.5: GK11x class
+ case 37: // SM 3.7: GK21x class
+ return 192;
+
+ // Maxwell architecture
+ case 50: // SM 5.0: GM10x class
+ case 52: // SM 5.2: GM20x class
+ case 53: // SM 5.3: GM20x class
+ return 128;
+
+ // Pascal architecture
+ case 60: // SM 6.0: GP100 class
+ return 64;
+ case 61: // SM 6.1: GP10x class
+ case 62: // SM 6.2: GP10x class
+ return 128;
+
+ // Volta architecture
+ case 70: // SM 7.0: GV100 class
+ case 72: // SM 7.2: GV11b class
+ return 64;
+
+ // Turing architecture
+ case 75: // SM 7.5: TU10x class
+ return 64;
+
+ // unknown architecture, return a default value
+ default:
+ return 64;
+ }
+}
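+// Illustration only (not part of the original file): the CUDA core count printed further down
+// is multiProcessorCount times this per-SM figure. For a hypothetical sm_75 device with
+// 40 multiprocessors:
+//   constexpr unsigned int cores = 40 * getCudaCoresPerSM(7, 5);  // 40 * 64 = 2560
+//   static_assert(cores == 2560);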
+
+namespace {
+  template <template <typename> typename UniquePtr, typename Allocate>
+  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
+ if (bufferSizes.empty())
+ return;
+
+ auto streamPtr = cms::cuda::getStreamCache().get();
+
+    std::vector<UniquePtr<char[]> > buffers;
+ buffers.reserve(bufferSizes.size());
+ for (auto size : bufferSizes) {
+ buffers.push_back(allocate(size, streamPtr.get()));
+ }
+ }
+
+  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
+ int device;
+ cudaCheck(cudaGetDevice(&device));
+ for (int i = 0; i < numberOfDevices; ++i) {
+ cudaCheck(cudaSetDevice(i));
+      preallocate<cms::cuda::device::unique_ptr>(
+          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
+ bufferSizes);
+ }
+ cudaCheck(cudaSetDevice(device));
+ }
+
+  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
+    preallocate<cms::cuda::host::unique_ptr>(
+        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
+ bufferSizes);
+ }
+} // namespace
+
+/// Constructor
+CUDAService::CUDAService(edm::ParameterSet const& config) {
+  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
+ if (not configEnabled) {
+ edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
+ return;
+ }
+
+ auto status = cudaGetDeviceCount(&numberOfDevices_);
+ if (cudaSuccess != status) {
+ edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
+ << "Disabling the CUDAService.";
+ return;
+ }
+ edm::LogInfo log("CUDAService");
+ computeCapabilities_.reserve(numberOfDevices_);
+ log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
+
+  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
+  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
+  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
+  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
+  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
+  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
+
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ // read information about the compute device.
+ // see the documentation of cudaGetDeviceProperties() for more information.
+ cudaDeviceProp properties;
+ cudaCheck(cudaGetDeviceProperties(&properties, i));
+ log << "CUDA device " << i << ": " << properties.name << '\n';
+
+ // compute capabilities
+ log << " compute capability: " << properties.major << "." << properties.minor << " (sm_"
+ << properties.major << properties.minor << ")\n";
+ computeCapabilities_.emplace_back(properties.major, properties.minor);
+ log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
+ log << " CUDA cores: " << std::setw(28)
+ << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
+ log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
+
+ // compute mode
+ static constexpr const char* computeModeDescription[] = {
+ "default (shared)", // cudaComputeModeDefault
+ "exclusive (single thread)", // cudaComputeModeExclusive
+ "prohibited", // cudaComputeModeProhibited
+ "exclusive (single process)", // cudaComputeModeExclusiveProcess
+ "unknown"};
+ log << " compute mode:" << std::right << std::setw(27)
+ << computeModeDescription[std::min(properties.computeMode,
+                                           static_cast<int>(std::size(computeModeDescription)) - 1)]
+ << '\n';
+
+ // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
+
+ // read the free and total amount of memory available for allocation by the device, in bytes.
+ // see the documentation of cudaMemGetInfo() for more information.
+ size_t freeMemory, totalMemory;
+ cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+ log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
+ << totalMemory / (1 << 20) << " MB total\n";
+ log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
+ log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
+
+ // L1 cache behaviour
+ static constexpr const char* l1CacheModeDescription[] = {
+ "unknown", "local memory", "global memory", "local and global memory"};
+ int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
+ log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
+ log << '\n';
+
+ log << "Other capabilities\n";
+ log << " " << (properties.canMapHostMemory ? "can" : "cannot")
+ << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
+ log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
+ << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
+ log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
+ << " access pageable memory via the host's page tables\n";
+ log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
+ << " access host registered memory at the same virtual address as the host\n";
+ log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
+ << " a unified address space with the host\n";
+ log << " " << (properties.managedMemory ? "supports" : "does not support")
+ << " allocating managed memory on this system\n";
+ log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
+ << " coherently access managed memory concurrently with the host\n";
+ log << " "
+ << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
+ << " directly access managed memory on the device without migration\n";
+ log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
+ << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
+ log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
+ << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
+ log << '\n';
+
+ // set and read the CUDA device flags.
+ // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
+ log << "CUDA flags\n";
+ unsigned int flags;
+ cudaCheck(cudaGetDeviceFlags(&flags));
+ switch (flags & cudaDeviceScheduleMask) {
+ case cudaDeviceScheduleAuto:
+ log << " thread policy: default\n";
+ break;
+ case cudaDeviceScheduleSpin:
+ log << " thread policy: spin\n";
+ break;
+ case cudaDeviceScheduleYield:
+ log << " thread policy: yield\n";
+ break;
+ case cudaDeviceScheduleBlockingSync:
+ log << " thread policy: blocking sync\n";
+ break;
+ default:
+ log << " thread policy: undefined\n";
+ }
+ if (flags & cudaDeviceMapHost) {
+ log << " pinned host memory allocations: enabled\n";
+ } else {
+ log << " pinned host memory allocations: disabled\n";
+ }
+ if (flags & cudaDeviceLmemResizeToMax) {
+ log << " kernel host memory reuse: enabled\n";
+ } else {
+ log << " kernel host memory reuse: disabled\n";
+ }
+ log << '\n';
+
+ // set and read the CUDA resource limits.
+ // see the documentation of cudaDeviceSetLimit() for more information.
+
+ // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
+ // printf() device system call.
+ if (printfFifoSize >= 0) {
+ setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
+ }
+ // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+ if (stackSize >= 0) {
+ setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
+ }
+ // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
+ // and free() device system calls.
+ if (mallocHeapSize >= 0) {
+ setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
+ }
+ if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+ // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
+ // a thread can safely call cudaDeviceSynchronize().
+ if (devRuntimeSyncDepth >= 0) {
+ setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
+ }
+ // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
+ // device runtime launches that can be made from the current device.
+ if (devRuntimePendingLaunchCount >= 0) {
+ setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
+ "cudaLimitDevRuntimePendingLaunchCount",
+ devRuntimePendingLaunchCount);
+ }
+ }
+
+ size_t value;
+ log << "CUDA limits\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
+ log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
+ log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
+ log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
+ if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
+ log << " runtime sync depth: " << std::setw(10) << value << '\n';
+ cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
+ log << " runtime pending launch count: " << std::setw(10) << value << '\n';
+ }
+ log << '\n';
+ }
+ log << "\n";
+
+ // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
+ if constexpr (cms::cuda::allocator::useCaching) {
+ cms::cuda::allocator::getCachingDeviceAllocator();
+ cms::cuda::allocator::getCachingHostAllocator();
+ }
+ cms::cuda::getEventCache().clear();
+ cms::cuda::getStreamCache().clear();
+
+ log << "CUDAService fully initialized";
+ enabled_ = true;
+
+ // Preallocate buffers if asked to
+  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
+  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
+  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
+}
+
+CUDAService::~CUDAService() {
+ if (enabled_) {
+ // Explicitly destruct the allocator before the device resets below
+ if constexpr (cms::cuda::allocator::useCaching) {
+ cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
+ cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+ }
+ cms::cuda::getEventCache().clear();
+ cms::cuda::getStreamCache().clear();
+
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ cudaCheck(cudaSetDevice(i));
+ cudaCheck(cudaDeviceSynchronize());
+ // Explicitly destroys and cleans up all resources associated with the current device in the
+ // current process. Any subsequent API call to this device will reinitialize the device.
+ // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
+ cudaDeviceReset();
+ }
+ }
+}
+
+void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+  desc.addUntracked<bool>("enabled", true);
+
+ edm::ParameterSetDescription limits;
+  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
+      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
+  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
+  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
+      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
+  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
+      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
+  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
+      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
+  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
+ ->setComment(
+ "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
+ "the default value.");
+
+ edm::ParameterSetDescription allocator;
+  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
+      ->setComment("Preallocates buffers of given bytes on all devices");
+  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
+      ->setComment("Preallocates buffers of given bytes on the host");
+  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
+
+ descriptions.add("CUDAService", desc);
+}
+
+int CUDAService::deviceWithMostFreeMemory() const {
+ // save the current device
+ int currentDevice;
+  cudaCheck(cudaGetDevice(&currentDevice));
+
+ size_t maxFreeMemory = 0;
+ int device = -1;
+ for (int i = 0; i < numberOfDevices_; ++i) {
+ size_t freeMemory, totalMemory;
+ cudaSetDevice(i);
+ cudaMemGetInfo(&freeMemory, &totalMemory);
+ edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
+ << totalMemory / (1 << 20) << " MB total memory";
+ if (freeMemory > maxFreeMemory) {
+ maxFreeMemory = freeMemory;
+ device = i;
+ }
+ }
+ // restore the current device
+ cudaCheck(cudaSetDevice(currentDevice));
+ return device;
+}
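Downstream code is expected to reach the service through the framework's service registry, as the small helper in the next file does. A sketch of the typical pattern (the chooseGpuDevice() name is hypothetical; it only relies on the CUDAService interface exercised elsewhere in this PR):

    #include "FWCore/ServiceRegistry/interface/Service.h"
    #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

    bool chooseGpuDevice(int& device) {
      edm::Service<CUDAService> cs;
      // fall back to the CPU code path if the service is disabled or no device is available
      if (not cs->enabled() or cs->numberOfDevices() == 0) {
        return false;
      }
      // prefer the device with the largest amount of free memory
      device = cs->deviceWithMostFreeMemory();
      return device >= 0;
    }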
diff --git a/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc b/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc
new file mode 100644
index 0000000000000..a0bbc503c3451
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/src/numberOfCUDADevices.cc
@@ -0,0 +1,8 @@
+#include "HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+
+int numberOfCUDADevices() {
+  edm::Service<CUDAService> cs;
+ return cs->enabled() ? cs->numberOfDevices() : 0;
+}
diff --git a/HeterogeneousCore/CUDAServices/test/BuildFile.xml b/HeterogeneousCore/CUDAServices/test/BuildFile.xml
new file mode 100644
index 0000000000000..020b3736617c5
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/BuildFile.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
new file mode 100644
index 0000000000000..265703ccf5903
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -0,0 +1,119 @@
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "catch.hpp"
+
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/Utilities/interface/Exception.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+namespace {
+ CUDAService makeCUDAService(edm::ParameterSet ps) {
+ auto desc = edm::ConfigurationDescriptions("Service", "CUDAService");
+ CUDAService::fillDescriptions(desc);
+ desc.validate(ps, "CUDAService");
+ return CUDAService(ps);
+ }
+} // namespace
+
+TEST_CASE("Tests of CUDAService", "[CUDAService]") {
+ // Test setup: check if a simple CUDA runtime API call fails:
+ // if so, skip the test with the CUDAService enabled
+ int deviceCount = 0;
+ auto ret = cudaGetDeviceCount(&deviceCount);
+
+ if (ret != cudaSuccess) {
+ WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
+ << ret << ") " << cudaGetErrorString(ret) << ". Running only tests not requiring devices.");
+ }
+
+ SECTION("CUDAService enabled") {
+ edm::ParameterSet ps;
+    ps.addUntrackedParameter<bool>("enabled", true);
+ SECTION("Enabled only if there are CUDA capable GPUs") {
+ auto cs = makeCUDAService(ps);
+ if (deviceCount <= 0) {
+ REQUIRE(cs.enabled() == false);
+ WARN("CUDAService is disabled as there are no CUDA GPU devices");
+ } else {
+ REQUIRE(cs.enabled() == true);
+ INFO("CUDAService is enabled");
+ }
+ }
+
+ if (deviceCount <= 0) {
+ return;
+ }
+
+ auto cs = makeCUDAService(ps);
+
+ SECTION("CUDA Queries") {
+ int driverVersion = 0, runtimeVersion = 0;
+ ret = cudaDriverGetVersion(&driverVersion);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA driver version from the CUDA runtime API: (" << ret << ") "
+ << cudaGetErrorString(ret));
+ }
+ ret = cudaRuntimeGetVersion(&runtimeVersion);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA runtime API version: (" << ret << ") " << cudaGetErrorString(ret));
+ }
+
+ WARN("CUDA Driver Version / Runtime Version: " << driverVersion / 1000 << "." << (driverVersion % 100) / 10
+ << " / " << runtimeVersion / 1000 << "."
+ << (runtimeVersion % 100) / 10);
+
+ // Test that the number of devices found by the service
+ // is the same as detected by the CUDA runtime API
+ REQUIRE(cs.numberOfDevices() == deviceCount);
+ WARN("Detected " << cs.numberOfDevices() << " CUDA Capable device(s)");
+
+ // Test that the compute capabilities of each device
+ // are the same as detected by the CUDA runtime API
+ for (int i = 0; i < deviceCount; ++i) {
+ cudaDeviceProp deviceProp;
+ ret = cudaGetDeviceProperties(&deviceProp, i);
+ if (ret != cudaSuccess) {
+ FAIL("Unable to query the CUDA properties for device " << i << " from the CUDA runtime API: (" << ret << ") "
+ << cudaGetErrorString(ret));
+ }
+
+ REQUIRE(deviceProp.major == cs.computeCapability(i).first);
+ REQUIRE(deviceProp.minor == cs.computeCapability(i).second);
+ INFO("Device " << i << ": " << deviceProp.name << "\n CUDA Capability Major/Minor version number: "
+ << deviceProp.major << "." << deviceProp.minor);
+ }
+ }
+
+ SECTION("CUDAService device free memory") {
+ size_t mem = 0;
+ int dev = -1;
+ for (int i = 0; i < deviceCount; ++i) {
+ size_t free, tot;
+ cudaSetDevice(i);
+ cudaMemGetInfo(&free, &tot);
+ WARN("Device " << i << " memory total " << tot << " free " << free);
+ if (free > mem) {
+ mem = free;
+ dev = i;
+ }
+ }
+ WARN("Device with most free memory " << dev << "\n"
+ << " as given by CUDAService " << cs.deviceWithMostFreeMemory());
+ }
+ }
+
+ SECTION("Force to be disabled") {
+ edm::ParameterSet ps;
+    ps.addUntrackedParameter<bool>("enabled", false);
+ auto cs = makeCUDAService(ps);
+ REQUIRE(cs.enabled() == false);
+ REQUIRE(cs.numberOfDevices() == 0);
+ }
+}
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.py b/HeterogeneousCore/CUDAServices/test/testCUDAService.py
new file mode 100644
index 0000000000000..06edefcac2b66
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.py
@@ -0,0 +1,13 @@
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process( "TEST" )
+
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.load('HeterogeneousCore.CUDAServices.CUDAService_cfi')
+process.MessageLogger.categories.append("CUDAService")
+
+process.source = cms.Source("EmptySource")
+
+process.maxEvents = cms.untracked.PSet(
+ input = cms.untracked.int32( 0 )
+)
diff --git a/HeterogeneousCore/CUDAServices/test/test_main.cpp b/HeterogeneousCore/CUDAServices/test/test_main.cpp
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDAServices/test/test_main.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml
new file mode 100644
index 0000000000000..84905be5e2db8
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/BuildFile.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDATest/interface/Thing.h b/HeterogeneousCore/CUDATest/interface/Thing.h
new file mode 100644
index 0000000000000..27dc58e1443f4
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/interface/Thing.h
@@ -0,0 +1,21 @@
+#ifndef HeterogeneousCore_CUDATest_Thing_H
+#define HeterogeneousCore_CUDATest_Thing_H
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+namespace cms {
+ namespace cudatest {
+ class Thing {
+ public:
+ Thing() = default;
+      explicit Thing(cms::cuda::device::unique_ptr<float[]> ptr) : ptr_(std::move(ptr)) {}
+
+ const float *get() const { return ptr_.get(); }
+
+ private:
+      cms::cuda::device::unique_ptr<float[]> ptr_;
+ };
+ } // namespace cudatest
+} // namespace cms
+
+#endif
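As a rough illustration of how a producer could fill this wrapper (the real test producers appear later in this PR; the buffer size and the makeThing() helper are made up for the example):

    #include <cuda_runtime.h>

    #include "HeterogeneousCore/CUDATest/interface/Thing.h"
    #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

    cms::cudatest::Thing makeThing(cudaStream_t stream) {
      constexpr int numValues = 4000;  // made-up size
      // allocate a device buffer associated with the given stream
      auto buffer = cms::cuda::make_device_unique<float[]>(numValues, stream);
      // ... launch a kernel on `stream` to fill the buffer ...
      // hand ownership of the buffer to the data format object
      return cms::cudatest::Thing(std::move(buffer));
    }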
diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
new file mode 100644
index 0000000000000..13f9ef6c06cd2
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
new file mode 100644
index 0000000000000..2778ed02f3ac6
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
@@ -0,0 +1,81 @@
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDATest/interface/Thing.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
+
+#include "TestCUDAAnalyzerGPUKernel.h"
+
+class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> {
+public:
+ explicit TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig);
+ ~TestCUDAAnalyzerGPU() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const override;
+ void endJob() override;
+
+private:
+ std::string const label_;
+  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
+ double const minValue_;
+ double const maxValue_;
+ // the public interface is thread safe
+  CMS_THREAD_SAFE mutable std::unique_ptr<TestCUDAAnalyzerGPUKernel> gpuAlgo_;
+};
+
+TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig)
+    : label_(iConfig.getParameter<std::string>("@module_label")),
+      srcToken_(consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))),
+      minValue_(iConfig.getParameter<double>("minValue")),
+      maxValue_(iConfig.getParameter<double>("maxValue")) {
+  edm::Service<CUDAService> cs;
+ if (cs->enabled()) {
+ auto streamPtr = cms::cuda::getStreamCache().get();
+    gpuAlgo_ = std::make_unique<TestCUDAAnalyzerGPUKernel>(streamPtr.get());
+ }
+}
+
+void TestCUDAAnalyzerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source of cms::cuda::Product<cms::cudatest::Thing>.");
+  desc.add<double>("minValue", -1e308);
+  desc.add<double>("maxValue", 1e308);
+ descriptions.addWithDefaultLabel(desc);
+ descriptions.setComment("This EDAnalyzer is part of the TestCUDAProducer* family. It models a GPU analyzer.");
+}
+
+void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const {
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::analyze begin event "
+ << iEvent.id().event() << " stream " << iEvent.streamID();
+
+ auto const& in = iEvent.get(srcToken_);
+ cms::cuda::ScopedContextAnalyze ctx{in};
+ cms::cudatest::Thing const& input = ctx.get(in);
+ gpuAlgo_->analyzeAsync(input.get(), ctx.stream());
+
+ edm::LogVerbatim("TestCUDAAnalyzerGPU")
+ << label_ << " TestCUDAAnalyzerGPU::analyze end event " << iEvent.id().event() << " stream " << iEvent.streamID();
+}
+
+void TestCUDAAnalyzerGPU::endJob() {
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob begin";
+
+ auto streamPtr = cms::cuda::getStreamCache().get();
+ auto value = gpuAlgo_->value(streamPtr.get());
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " accumulated value " << value;
+ assert(minValue_ <= value && value <= maxValue_);
+
+ edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob end";
+}
+
+DEFINE_FWK_MODULE(TestCUDAAnalyzerGPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
new file mode 100644
index 0000000000000..2b3951a2b5cfe
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
@@ -0,0 +1,46 @@
+#include "TestCUDAAnalyzerGPUKernel.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+namespace {
+ __global__ void analyze(const float *input, float *sum, int numElements) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < numElements) {
+ atomicAdd(sum + i, input[i]);
+ }
+ }
+
+ __global__ void sum(const float *input, float *output, int numElements) {
+ float val = 0.f;
+ for (int i = 0; i < numElements; ++i) {
+ val += input[i];
+ }
+ *output = val;
+ }
+} // namespace
+
+TestCUDAAnalyzerGPUKernel::TestCUDAAnalyzerGPUKernel(cudaStream_t stream) {
+  sum_ = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
+ cms::cuda::memsetAsync(sum_, 0, NUM_VALUES, stream);
+ // better to synchronize since there is no guarantee that the stream
+ // of analyzeAsync() would be otherwise synchronized with this one
+ cudaCheck(cudaStreamSynchronize(stream));
+}
+
+void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t stream) {
+ analyze<<>>(d_input, sum_.get(), NUM_VALUES);
+}
+
+float TestCUDAAnalyzerGPUKernel::value(cudaStream_t stream) const {
+  auto accumulator = cms::cuda::make_device_unique<float>(stream);
+  auto h_accumulator = cms::cuda::make_host_unique<float>(stream);
+ sum<<<1, 1, 0, stream>>>(sum_.get(), accumulator.get(), NUM_VALUES);
+ cms::cuda::copyAsync(h_accumulator, accumulator, stream);
+ // need to synchronize
+ cudaCheck(cudaStreamSynchronize(stream));
+ return *h_accumulator;
+}
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
new file mode 100644
index 0000000000000..a9a6b962c2cc4
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
@@ -0,0 +1,23 @@
+#ifndef HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
+#define HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include
+
+class TestCUDAAnalyzerGPUKernel {
+public:
+ static constexpr int NUM_VALUES = 4000;
+
+ TestCUDAAnalyzerGPUKernel(cudaStream_t stream);
+ ~TestCUDAAnalyzerGPUKernel() = default;
+
+ // thread safe
+ void analyzeAsync(const float* d_input, cudaStream_t stream);
+ float value(cudaStream_t stream) const;
+
+private:
+  cms::cuda::device::unique_ptr<float[]> sum_;  // all writes are atomic in CUDA
+};
+
+#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
new file mode 100644
index 0000000000000..c25a44023ebc0
--- /dev/null
+++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
@@ -0,0 +1,67 @@
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+
+#include
+#include
+#include
+
+class TestCUDAProducerCPU : public edm::global::EDProducer<> {
+public:
+ explicit TestCUDAProducerCPU(edm::ParameterSet const& iConfig);
+ ~TestCUDAProducerCPU() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+ void produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
+
+private:
+ std::string const label_;
+  edm::EDGetTokenT<int> srcToken_;
+  edm::EDPutTokenT<int> const dstToken_;
+};
+
+TestCUDAProducerCPU::TestCUDAProducerCPU(edm::ParameterSet const& iConfig)
+    : label_{iConfig.getParameter<std::string>("@module_label")}, dstToken_{produces<int>()} {
+  auto srcTag = iConfig.getParameter<edm::InputTag>("src");
+ if (!srcTag.label().empty()) {
+ srcToken_ = consumes