diff --git a/src/cudadev/CUDACore/AcquireContext.cc b/src/cudadev/CUDACore/AcquireContext.cc
new file mode 100644
index 000000000..dd6e8d1de
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.cc
@@ -0,0 +1,5 @@
+#include "CUDACore/AcquireContext.h"
+
+namespace cms::cuda {
+  void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
+}
diff --git a/src/cudadev/CUDACore/AcquireContext.h b/src/cudadev/CUDACore/AcquireContext.h
new file mode 100644
index 000000000..409320abe
--- /dev/null
+++ b/src/cudadev/CUDACore/AcquireContext.h
@@ -0,0 +1,45 @@
+#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
+#define HeterogeneousCore_CUDACore_AcquireContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/TaskContext.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
+   * - setting the current device
+   * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class AcquireContext : public impl::EDGetterContextBase {
+  public:
+    explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+        : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
+    ~AcquireContext() = default;
+
+    template <typename F>
+    void pushNextTask(F&& f) {
+      holderHelper_.pushNextTask(std::forward<F>(f));
+    }
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    impl::FwkContextHolderHelper holderHelper_;
+  };
+
+  template <typename F>
+  void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
+    AcquireContext context(streamID, std::move(holder));
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
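For orientation, a minimal sketch (illustration only, not part of the patch) of what an acquire() body looks like against this API, modelled on the PixelTrackSoAFromCUDA migration further down in this diff; the module name, token and data types are placeholders:

    // Hypothetical ExternalWork module: the framework-facing acquire() calls
    // runAcquire(), which builds the AcquireContext, runs this overload, and then
    // commit()s the context, enqueueing the "work done" callback on the CUDA stream.
    void MyProducer::acquire(edm::Event const& iEvent,
                             edm::EventSetup const& iSetup,
                             cms::cuda::AcquireContext& ctx) {
      // get() re-uses, or synchronizes with, the CUDA stream the product was made on
      auto const& input = ctx.get(iEvent, inputToken_);
      // queue asynchronous work on the context's stream; no blocking here
      hostCopy_ = input.toHostAsync(ctx.stream());
    }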
diff --git a/src/cudadev/CUDACore/AnalyzeContext.h b/src/cudadev/CUDACore/AnalyzeContext.h
new file mode 100644
index 000000000..cbe9a66dd
--- /dev/null
+++ b/src/cudadev/CUDACore/AnalyzeContext.h
@@ -0,0 +1,20 @@
+#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
+#define HeterogeneousCore_CUDACore_AnalyzeContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in analyze()
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * and enforce that those get done in a proper way in RAII fashion.
+   */
+  class AnalyzeContext : public impl::EDGetterContextBase {
+  public:
+    /// Constructor to (possibly) re-use a CUDA stream
+    explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/ContextState.h b/src/cudadev/CUDACore/ContextState.h
index 3c73054ab..b0109e33d 100644
--- a/src/cudadev/CUDACore/ContextState.h
+++ b/src/cudadev/CUDACore/ContextState.h
@@ -26,6 +26,9 @@ namespace cms {
       friend class ScopedContextAcquire;
       friend class ScopedContextProduce;
       friend class ScopedContextTask;
+      friend class AcquireContext;
+      friend class ProduceContext;
+      friend class TaskContext;
 
       void set(int device, SharedStreamPtr stream) {
         throwIfStream();
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.cc b/src/cudadev/CUDACore/EDGetterContextBase.cc
new file mode 100644
index 000000000..cfeccb178
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.cc
@@ -0,0 +1,27 @@
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda::impl {
+  void EDGetterContextBase::synchronizeStreams(int dataDevice,
+                                               cudaStream_t dataStream,
+                                               bool available,
+                                               cudaEvent_t dataEvent) {
+    if (dataDevice != device()) {
+      // Eventually replace with prefetch to current device (assuming unified memory works)
+      // If we won't go to unified memory, need to figure out something else...
+      throw std::runtime_error("Handling data from multiple devices is not yet supported");
+    }
+
+    if (dataStream != stream()) {
+      // Different streams, need to synchronize
+      if (not available) {
+        // Event not yet occurred, so need to add synchronization
+        // here. Sychronization is done by making the CUDA stream to
+        // wait for an event, so all subsequent work in the stream
+        // will run only after the event has "occurred" (i.e. data
+        // product became available).
+        cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
+      }
+    }
+  }
+}
diff --git a/src/cudadev/CUDACore/EDGetterContextBase.h b/src/cudadev/CUDACore/EDGetterContextBase.h
new file mode 100644
index 000000000..d17814d69
--- /dev/null
+++ b/src/cudadev/CUDACore/EDGetterContextBase.h
@@ -0,0 +1,38 @@
+#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
+#define HeterogeneousCore_CUDACore_EDGetterContextBase_h
+
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/Product.h"
+#include "Framework/EDGetToken.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for Context classes that should be
+   * able to read Event Data products
+   */
+  class EDGetterContextBase : public FwkContextBase {
+  public:
+    template <typename T>
+    const T& get(const Product<T>& data) {
+      if (not isInitialized()) {
+        initialize(data);
+      }
+      synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
+      return data.data_;
+    }
+
+    template <typename T>
+    const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
+      return get(iEvent.get(token));
+    }
+
+  protected:
+    template <typename... Args>
+    EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}
+
+  private:
+    void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/EDProducer.cc b/src/cudadev/CUDACore/EDProducer.cc
new file mode 100644
index 000000000..2e08ea2d2
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.cc
@@ -0,0 +1,17 @@
+#include "CUDACore/EDProducer.h"
+
+namespace cms::cuda {
+  void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::acquire(edm::Event const& event,
+                                        edm::EventSetup const& eventSetup,
+                                        edm::WaitingTaskWithArenaHolder holder) {
+    runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
+  }
+
+  void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
+    runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
+  }
+}  // namespace cms::cuda
diff --git a/src/cudadev/CUDACore/EDProducer.h b/src/cudadev/CUDACore/EDProducer.h
new file mode 100644
index 000000000..92529c502
--- /dev/null
+++ b/src/cudadev/CUDACore/EDProducer.h
@@ -0,0 +1,27 @@
+#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
+#define HeterogeneousCore_CUDACore_stream_EDProducer_h
+
+#include "Framework/EDProducer.h"
+#include "CUDACore/AcquireContext.h"
+#include "CUDACore/ProduceContext.h"
+
+namespace cms::cuda {
+  class EDProducer : public edm::EDProducer {
+  public:
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+
+  class SynchronizingEDProducer : public edm::EDProducerExternalWork {
+  public:
+    void acquire(edm::Event const& event,
+                 edm::EventSetup const& eventSetup,
+                 edm::WaitingTaskWithArenaHolder holder) override;
+    void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
+
+    virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
+    virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
+  };
+}  // namespace cms::cuda
+
+#endif
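As a usage sketch (illustration only, not part of the patch), a non-ExternalWork producer derived from the new base class would look roughly like this; InputCUDA, OutputCUDA and makeAsync() are placeholders, and the pattern follows the CAHitNtupletCUDA and SiPixelRecHitCUDA migrations below:

    class MyCUDAProducer : public cms::cuda::EDProducer {
    public:
      explicit MyCUDAProducer(edm::ProductRegistry& reg)
          : inputToken_{reg.consumes<cms::cuda::Product<InputCUDA>>()},
            outputToken_{reg.produces<cms::cuda::Product<OutputCUDA>>()} {}

      void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext& ctx) override {
        auto const& input = ctx.get(iEvent, inputToken_);  // synchronizes streams if needed
        // emplace() wraps the result into cms::cuda::Product<OutputCUDA>, registering the
        // device, the shared stream and the event that commit() records
        ctx.emplace(iEvent, outputToken_, makeAsync(input, ctx.stream()));
      }

    private:
      edm::EDGetTokenT<cms::cuda::Product<InputCUDA>> inputToken_;
      edm::EDPutTokenT<cms::cuda::Product<OutputCUDA>> outputToken_;
    };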
diff --git a/src/cudadev/CUDACore/FwkContextBase.cc b/src/cudadev/CUDACore/FwkContextBase.cc
new file mode 100644
index 000000000..47e2d8199
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.cc
@@ -0,0 +1,28 @@
+#include "CUDACore/FwkContextBase.h"
+#include "CUDACore/StreamCache.h"
+#include "CUDACore/cudaCheck.h"
+
+#include "chooseDevice.h"
+
+namespace cms::cuda::impl {
+  FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {}
+
+  FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); }
+
+  FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream)
+      : currentDevice_(device), stream_(std::make_shared<StreamSharingHelper>(std::move(stream))) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+  }
+
+  void FwkContextBase::initialize() { stream_ = std::make_shared<StreamSharingHelper>(getStreamCache().get()); }
+
+  void FwkContextBase::initialize(const ProductBase& data) {
+    SharedStreamPtr stream;
+    if (data.mayReuseStream()) {
+      stream = data.streamPtr();
+    } else {
+      stream = getStreamCache().get();
+    }
+    stream_ = std::make_shared<StreamSharingHelper>(std::move(stream));
+  }
+}
diff --git a/src/cudadev/CUDACore/FwkContextBase.h b/src/cudadev/CUDACore/FwkContextBase.h
new file mode 100644
index 000000000..1b329cb03
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextBase.h
@@ -0,0 +1,66 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextBase_h
+#define HeterogeneousCore_CUDACore_FwkContextBase_h
+
+#include "CUDACore/ProductBase.h"
+#include "CUDACore/SharedStreamPtr.h"
+#include "Framework/Event.h"
+
+namespace cms::cuda::impl {
+  /**
+   * This class is a base class for other Context classes for interacting with the framework
+   */
+  class FwkContextBase {
+  public:
+    FwkContextBase(FwkContextBase const&) = delete;
+    FwkContextBase& operator=(FwkContextBase const&) = delete;
+    FwkContextBase(FwkContextBase&&) = delete;
+    FwkContextBase& operator=(FwkContextBase&&) = delete;
+
+    int device() const { return currentDevice_; }
+
+    cudaStream_t stream() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr().get();
+    }
+    const SharedStreamPtr& streamPtr() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_->streamPtr();
+    }
+
+  protected:
+    // The constructors set the current device, but the device
+    // is not set back to the previous value at the destructor. This
+    // should be sufficient (and tiny bit faster) as all CUDA API
+    // functions relying on the current device should be called from
+    // the scope where this context is. The current device doesn't
+    // really matter between modules (or across TBB tasks).
+    explicit FwkContextBase(edm::StreamID streamID);
+
+    explicit FwkContextBase(int device);
+
+    // meant only for testing
+    explicit FwkContextBase(int device, SharedStreamPtr stream);
+
+    bool isInitialized() const { return bool(stream_); }
+
+    void initialize();
+    void initialize(const ProductBase& data);
+
+    const std::shared_ptr<StreamSharingHelper>& streamSharingHelper() {
+      if (not isInitialized()) {
+        initialize();
+      }
+      return stream_;
+    }
+
+  private:
+    int currentDevice_ = -1;
+    std::shared_ptr<StreamSharingHelper> stream_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/FwkContextHolderHelper.h b/src/cudadev/CUDACore/FwkContextHolderHelper.h
new file mode 100644
index 000000000..18e465142
--- /dev/null
+++ b/src/cudadev/CUDACore/FwkContextHolderHelper.h
@@ -0,0 +1,25 @@
+#ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+#define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h
+
+namespace cms::cuda::impl {
+  class FwkContextHolderHelper {
+  public:
+    FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device)
+        : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {}
+
+    template <typename F>
+    void pushNextTask(F&& f);
+
+    void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+      waitingTaskHolder_ = std::move(waitingTaskHolder);
+    }
+
+    void enqueueCallback(cudaStream_t stream);
+
+  private:
+    edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
+    int device_;
+  };
+}
+
+#endif
diff --git a/src/cudadev/CUDACore/ProduceContext.cc b/src/cudadev/CUDACore/ProduceContext.cc
new file mode 100644
index 000000000..0af59c3a7
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.cc
@@ -0,0 +1,6 @@
+#include "CUDACore/ProduceContext.h"
+#include "CUDACore/cudaCheck.h"
+
+namespace cms::cuda {
+  void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); }
+}
diff --git a/src/cudadev/CUDACore/ProduceContext.h b/src/cudadev/CUDACore/ProduceContext.h
new file mode 100644
index 000000000..980e53377
--- /dev/null
+++ b/src/cudadev/CUDACore/ProduceContext.h
@@ -0,0 +1,52 @@
+#ifndef HeterogeneousCore_CUDACore_ProduceContext_h
+#define HeterogeneousCore_CUDACore_ProduceContext_h
+
+#include "CUDACore/EDGetterContextBase.h"
+#include "CUDACore/EventCache.h"
+#include "Framework/EDPutToken.h"
+
+namespace cms::cuda {
+  /**
+   * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
+   * - setting the current device
+   * - synchronizing between CUDA streams if necessary
+   * Users should not, however, construct it explicitly.
+   */
+  class ProduceContext : public impl::EDGetterContextBase {
+  public:
+    explicit ProduceContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
+
+    ~ProduceContext() = default;
+
+    template <typename T>
+    std::unique_ptr<Product<T>> wrap(T data) {
+      // make_unique doesn't work because of private constructor
+      return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data)));
+    }
+
+    template <typename T, typename... Args>
+    auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
+      return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...);
+    }
+
+    // internal API
+    void commit();
+
+  private:
+    // This construcor is only meant for testing
+    explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event)
+        : EDGetterContextBase(device, std::move(stream)), event_{std::move(event)} {}
+
+    // create the CUDA Event upfront to catch possible errors from its creation
+    SharedEventPtr event_ = getEventCache().get();
+  };
+
+  template <typename F>
+  void runProduce(edm::StreamID streamID, F&& func) {
+    ProduceContext context(streamID);
+    func(context);
+    context.commit();
+  }
+}
+
+#endif
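Modules that keep deriving from edm::EDProducer can still use the per-event context through the runProduce() helper; a sketch of that pattern (illustration only, mirroring PixelVertexProducerCUDA::produceOnGPU() and CountValidator below; names are placeholders):

    void MyLegacyProducer::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
      cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) {
        auto const& input = ctx.get(iEvent, inputToken_);
        ctx.emplace(iEvent, outputToken_, algo_.makeAsync(ctx.stream(), input));
      });  // commit() records the CUDA event once the lambda returns
    }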
diff --git a/src/cudadev/CUDACore/Product.h b/src/cudadev/CUDACore/Product.h
index c60e994f0..5d9aaf36e 100644
--- a/src/cudadev/CUDACore/Product.h
+++ b/src/cudadev/CUDACore/Product.h
@@ -13,14 +13,14 @@ namespace edm {
 namespace cms {
   namespace cuda {
     namespace impl {
-      class ScopedContextGetterBase;
-    }
+      class EDGetterContextBase;
+    }  // namespace impl
 
     /**
      * The purpose of this class is to wrap CUDA data to edm::Event in a
      * way which forces correct use of various utilities.
      *
-     * The non-default construction has to be done with cms::cuda::ScopedContext
+     * The non-default construction has to be done with cms::cuda::Context
      * (in order to properly register the CUDA event).
      *
      * The default constructor is needed only for the ROOT dictionary generation.
@@ -41,15 +41,18 @@ namespace cms {
       Product& operator=(Product&&) = default;
 
     private:
-      friend class impl::ScopedContextGetterBase;
-      friend class ScopedContextProduce;
+      friend class impl::EDGetterContextBase;
+      friend class ProduceContext;
       friend class edm::Wrapper<Product<T>>;
 
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
+      explicit Product(int device, std::shared_ptr<impl::StreamSharingHelper> stream, SharedEventPtr event, T data)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
 
       template <typename... Args>
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
+      explicit Product(int device,
+                       std::shared_ptr<impl::StreamSharingHelper> stream,
+                       SharedEventPtr event,
+                       Args&&... args)
           : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
 
       T data_;  //!
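A producer may emplace several products from the same ProduceContext; they then share one StreamSharingHelper (introduced in ProductBase.h below), so they carry the same CUDA stream and event, and only the first consumer that asks is allowed to re-use the stream. A sketch of that pattern, following SiPixelRawToClusterCUDA::produce() further down; getResults() and the tokens are placeholders:

    void MyProducer::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext& ctx) {
      auto results = gpuAlgo_.getResults();  // e.g. a pair of device-side collections
      ctx.emplace(iEvent, digiPutToken_, std::move(results.first));
      ctx.emplace(iEvent, clusterPutToken_, std::move(results.second));
    }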
diff --git a/src/cudadev/CUDACore/ProductBase.h b/src/cudadev/CUDACore/ProductBase.h index cb3fd4db9..e1f04ec12 100644 --- a/src/cudadev/CUDACore/ProductBase.h +++ b/src/cudadev/CUDACore/ProductBase.h @@ -10,8 +10,42 @@ namespace cms { namespace cuda { namespace impl { - class ScopedContextBase; - } + class FwkContextBase; + + /** + * The CUDA stream is shared between all the Event products of + * the EDProducer. If the stream gets re-used, only one consumer + * of all the products should be allowed to use the stream. An + * objects of this class is shared between such Event products + * and takes care of letting only those consumers get the stream. + */ + class StreamSharingHelper { + public: + explicit StreamSharingHelper(SharedStreamPtr stream) : stream_(std::move(stream)) {} + StreamSharingHelper(const StreamSharingHelper&) = delete; + StreamSharingHelper& operator=(const StreamSharingHelper&) = delete; + StreamSharingHelper(StreamSharingHelper&&) = delete; + StreamSharingHelper& operator=(StreamSharingHelper) = delete; + + const SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + private: + SharedStreamPtr stream_; + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; + }; + } // namespace impl /** * Base class for all instantiations of CUDA to hold the @@ -24,66 +58,45 @@ namespace cms { ProductBase(const ProductBase&) = delete; ProductBase& operator=(const ProductBase&) = delete; - ProductBase(ProductBase&& other) - : stream_{std::move(other.stream_)}, - event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} - ProductBase& operator=(ProductBase&& other) { - stream_ = std::move(other.stream_); - event_ = std::move(other.event_); - mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; - return *this; - } - - bool isValid() const { return stream_.get() != nullptr; } + ProductBase(ProductBase&& other) = default; + ProductBase& operator=(ProductBase&& other) = default; + bool isAvailable() const; int device() const { return device_; } // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } + cudaStream_t stream() const { return stream_->streamPtr().get(); } // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself + // mutable access is needed even if the ProductBase itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. 
cudaEvent_t event() const { return event_.get(); } protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) + explicit ProductBase(int device, std::shared_ptr stream, SharedEventPtr event) : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} private: - friend class impl::ScopedContextBase; - friend class ScopedContextProduce; - - // The following function is intended to be used only from ScopedContext - const SharedStreamPtr& streamPtr() const { return stream_; } - - bool mayReuseStream() const { - bool expected = true; - bool changed = mayReuseStream_.compare_exchange_strong(expected, false); - // If the current thread is the one flipping the flag, it may - // reuse the stream. - return changed; - } - - // The cudaStream_t is really shared among edm::Event products, so - // using shared_ptr also here - SharedStreamPtr stream_; //! + friend class impl::FwkContextBase; + friend class ProduceContext; + + // The following function is intended to be used only from Context + const SharedStreamPtr& streamPtr() const { return stream_->streamPtr(); } + + bool mayReuseStream() const { return stream_->mayReuseStream(); } + + // Helper shared between all cms::cuda::Product event + // products of an EDProducer + std::shared_ptr stream_; //! + // shared_ptr because of caching in cms::cuda::EventCache SharedEventPtr event_; //! - // This flag tells whether the CUDA stream may be reused by a - // consumer or not. The goal is to have a "chain" of modules to - // queue their work to the same stream. - mutable std::atomic mayReuseStream_ = true; //! - // The CUDA device associated with this product int device_ = -1; //! }; diff --git a/src/cudadev/CUDACore/ScopedContext.cc b/src/cudadev/CUDACore/ScopedContext.cc deleted file mode 100644 index 14bff04eb..000000000 --- a/src/cudadev/CUDACore/ScopedContext.cc +++ /dev/null @@ -1,116 +0,0 @@ -#include "CUDACore/ScopedContext.h" - -#include "CUDACore/StreamCache.h" -#include "CUDACore/cudaCheck.h" - -#include "chooseDevice.h" - -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; - - void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device = guard->device; - if (status == cudaSuccess) { - //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " - // << streamId << std::endl; - waitingTaskHolder.doneWaiting(nullptr); - } else { - // wrap the exception in a try-catch block to let GDB "catch throw" break on it - try { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - throw std::runtime_error("Callback of CUDA stream " + - std::to_string(reinterpret_cast(streamId)) + " in device " + - std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); - } catch (std::exception&) { - waitingTaskHolder.doneWaiting(std::current_exception()); - } - } - } -} // namespace - -namespace cms::cuda { - namespace impl { - ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) { - cudaCheck(cudaSetDevice(currentDevice_)); - stream_ = getStreamCache().get(); - } - - ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { - cudaCheck(cudaSetDevice(currentDevice_)); - if (data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { 
- stream_ = getStreamCache().get(); - } - } - - ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { - cudaCheck(cudaSetDevice(currentDevice_)); - } - - //////////////////// - - void ScopedContextGetterBase::synchronizeStreams(int dataDevice, - cudaStream_t dataStream, - bool available, - cudaEvent_t dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - - if (dataStream != stream()) { - // Different streams, need to synchronize - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); - } - } - } - - void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { - cudaCheck( - cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); - } - } // namespace impl - - //////////////////// - - ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); - if (contextState_) { - contextState_->set(device(), streamPtr()); - } - } - - void ScopedContextAcquire::throwNoState() { - throw std::runtime_error( - "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " - "ContextState, but that was not the case"); - } - - //////////////////// - - ScopedContextProduce::~ScopedContextProduce() { - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - cudaEventRecord(event_.get(), stream()); - } - - //////////////////// - - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } -} // namespace cms::cuda diff --git a/src/cudadev/CUDACore/ScopedContext.h b/src/cudadev/CUDACore/ScopedContext.h deleted file mode 100644 index 4f6669883..000000000 --- a/src/cudadev/CUDACore/ScopedContext.h +++ /dev/null @@ -1,241 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_ScopedContext_h -#define HeterogeneousCore_CUDACore_ScopedContext_h - -#include - -#include "CUDACore/Product.h" -#include "Framework/WaitingTaskWithArenaHolder.h" -#include "Framework/Event.h" -#include "Framework/EDGetToken.h" -#include "Framework/EDPutToken.h" -#include "CUDACore/ContextState.h" -#include "CUDACore/EventCache.h" -#include "CUDACore/SharedEventPtr.h" -#include "CUDACore/SharedStreamPtr.h" - -namespace cms { - namespace cudatest { - class TestScopedContext; - } - - namespace cuda { - - namespace impl { - // This class is intended to be derived by other ScopedContext*, not for general use - class ScopedContextBase { - public: - int device() const { return currentDevice_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the ScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. 
- cudaStream_t stream() const { return stream_.get(); } - const SharedStreamPtr& streamPtr() const { return stream_; } - - protected: - // The constructors set the current device, but the device - // is not set back to the previous value at the destructor. This - // should be sufficient (and tiny bit faster) as all CUDA API - // functions relying on the current device should be called from - // the scope where this context is. The current device doesn't - // really matter between modules (or across TBB tasks). - explicit ScopedContextBase(edm::StreamID streamID); - - explicit ScopedContextBase(const ProductBase& data); - - explicit ScopedContextBase(int device, SharedStreamPtr stream); - - private: - int currentDevice_; - SharedStreamPtr stream_; - }; - - class ScopedContextGetterBase : public ScopedContextBase { - public: - template - const T& get(const Product& data) { - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); - return data.data_; - } - - template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { - return get(iEvent.get(token)); - } - - protected: - template - ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - - void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); - }; - - class ScopedContextHolderHelper { - public: - ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : waitingTaskHolder_{std::move(waitingTaskHolder)} {} - - template - void pushNextTask(F&& f, ContextState const* state); - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - waitingTaskHolder_ = std::move(waitingTaskHolder); - } - - void enqueueCallback(int device, cudaStream_t stream); - - private: - edm::WaitingTaskWithArenaHolder waitingTaskHolder_; - }; - } // namespace impl - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ - class ScopedContextAcquire : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to create a new CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(edm::StreamID streamID, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} - - /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(const ProductBase& data, - edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) - : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - - ~ScopedContextAcquire(); - - template - void pushNextTask(F&& f) { - if (contextState_ == nullptr) - throwNoState(); - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - void throwNoState(); - - impl::ScopedContextHolderHelper holderHelper_; - ContextState* contextState_ = nullptr; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextProduce : public impl::ScopedContextGetterBase { - public: - /// Constructor to create a new CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} - - /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) - explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} - - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) - : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} - - /// Record the CUDA event, all asynchronous work must have been queued before the destructor - ~ScopedContextProduce(); - - template - std::unique_ptr> wrap(T data) { - // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data))); - } - - template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) { - return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...); - } - - private: - friend class cudatest::TestScopedContext; - - // This construcor is only meant for testing - explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event) - : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {} - - // create the CUDA Event upfront to catch possible errors from its creation - SharedEventPtr event_ = getEventCache().get(); - }; - - /** - * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): - * - setting the current device - * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary - * and enforce that those get done in a proper way in RAII fashion. - */ - class ScopedContextTask : public impl::ScopedContextBase { - public: - /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards - holderHelper_{std::move(waitingTaskHolder)}, - contextState_{state} {} - - ~ScopedContextTask(); - - template - void pushNextTask(F&& f) { - holderHelper_.pushNextTask(std::forward(f), contextState_); - } - - void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); - } - - private: - impl::ScopedContextHolderHelper holderHelper_; - ContextState const* contextState_; - }; - - /** - * The aim of this class is to do necessary per-event "initialization" in analyze() - * - setting the current device - * - synchronizing between CUDA streams if necessary - * and enforce that those get done in a proper way in RAII fashion. 
- */ - class ScopedContextAnalyze : public impl::ScopedContextGetterBase { - public: - /// Constructor to (possibly) re-use a CUDA stream - explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} - }; - - namespace impl { - template - void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); - } - } // namespace impl - } // namespace cuda -} // namespace cms - -#endif diff --git a/src/cudadev/CUDACore/TaskContext.cc b/src/cudadev/CUDACore/TaskContext.cc new file mode 100644 index 000000000..a2c5b0194 --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.cc @@ -0,0 +1,41 @@ +#include "CUDACore/TaskContext.h" +#include "CUDACore/cudaCheck.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + //std::cout << " GPU kernel finished (in callback) device " << device << " CUDA stream " + // << streamId << std::endl; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw std::runtime_error("Callback of CUDA stream " + + std::to_string(reinterpret_cast(streamId)) + " in device " + + std::to_string(device) + " error " + std::string(error) + ": " + std::string(message)); + } catch (std::exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace cms::cuda { + namespace impl { + void FwkContextHolderHelper::enqueueCallback(cudaStream_t stream) { + cudaCheck(cudaStreamAddCallback(stream, cudaContextCallback, new CallbackData{waitingTaskHolder_, device_}, 0)); + } + } + + void TaskContext::commit() { holderHelper_.enqueueCallback(stream()); } +} diff --git a/src/cudadev/CUDACore/TaskContext.h b/src/cudadev/CUDACore/TaskContext.h new file mode 100644 index 000000000..121c9165d --- /dev/null +++ b/src/cudadev/CUDACore/TaskContext.h @@ -0,0 +1,71 @@ +#ifndef HeterogeneousCore_CUDACore_TaskContext_h +#define HeterogeneousCore_CUDACore_TaskContext_h + +#include "CUDACore/FwkContextBase.h" +#include "Framework/WaitingTaskWithArenaHolder.h" + +namespace cms::cuda { + namespace impl { + class FwkContextHolderHelper { + public: + FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) + : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} + + template + void pushNextTask(F&& f); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + int device_; + }; + } + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary 
+ */ + class TaskContext : public impl::FwkContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit TaskContext(int device, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : FwkContextBase(device), holderHelper_{std::move(waitingTaskHolder), device} {} + + ~TaskContext() = default; + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f)); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + // Internal API + void commit(); + + private: + impl::FwkContextHolderHelper holderHelper_; + }; + + namespace impl { + template + void FwkContextHolderHelper::pushNextTask(F&& f) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [device = device_, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(TaskContext{device, std::move(h)}); + })}); + } + } // namespace impl +} + +#endif diff --git a/src/cudadev/Framework/Event.h b/src/cudadev/Framework/Event.h index 9f952c492..68a04e729 100644 --- a/src/cudadev/Framework/Event.h +++ b/src/cudadev/Framework/Event.h @@ -9,7 +9,32 @@ // type erasure namespace edm { - using StreamID = int; + class Event; + + class StreamID { + public: + ~StreamID() = default; + StreamID() = delete; + StreamID(const StreamID&) = default; + StreamID& operator=(const StreamID&) = default; + + bool operator==(const StreamID& iID) const { return iID.value_ == value_; } + + operator unsigned int() const { return value_; } + + /** \return value ranging from 0 to one less than max number of streams. + */ + unsigned int value() const { return value_; } + + static StreamID invalidStreamID() { return StreamID(0xFFFFFFFFU); } + + private: + ///Only a Event is allowed to create one of these + friend class Event; + explicit StreamID(unsigned int iValue) : value_(iValue) {} + + unsigned int value_; + }; class WrapperBase { public: diff --git a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc index 48badcabf..b9633a856 100644 --- a/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc +++ b/src/cudadev/plugin-BeamSpotProducer/BeamSpotToCUDA.cc @@ -2,23 +2,22 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/copyAsync.h" #include "CUDACore/host_noncached_unique_ptr.h" #include "CUDADataFormats/BeamSpotCUDA.h" #include "DataFormats/BeamSpotPOD.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -class BeamSpotToCUDA : public edm::EDProducer { +class BeamSpotToCUDA : public cms::cuda::EDProducer { public: explicit BeamSpotToCUDA(edm::ProductRegistry& reg); ~BeamSpotToCUDA() override = default; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; private: const edm::EDPutTokenT> bsPutToken_; @@ -30,11 +29,9 @@ BeamSpotToCUDA::BeamSpotToCUDA(edm::ProductRegistry& reg) : bsPutToken_{reg.produces>()}, bsHost{cms::cuda::make_host_noncached_unique(cudaHostAllocWriteCombined)} {} -void BeamSpotToCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void BeamSpotToCUDA::produce(edm::Event& iEvent, const 
edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) { *bsHost = iSetup.get(); - cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - BeamSpotCUDA bsDevice(ctx.stream()); cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); diff --git a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc index 408450ea8..c9e0cc5d9 100644 --- a/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc @@ -1,27 +1,24 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" // Switch on to enable checks and printout for found tracks #undef PIXEL_DEBUG_PRODUCE -class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelTrackSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg); ~PixelTrackSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -35,15 +32,13 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg) void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); soa_ = inputData.toHostAsync(ctx.stream()); } -void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { #ifdef PIXEL_DEBUG_PRODUCE auto const& tsoa = *soa_; auto maxTracks = tsoa.stride(); diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc index 94085d784..aca6bd319 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletCUDA.cc @@ -1,24 +1,23 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/EDProducer.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" #include "CAHitNtupletGeneratorOnGPU.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" -class CAHitNtupletCUDA : public edm::EDProducer { +class CAHitNtupletCUDA : public cms::cuda::EDProducer { public: explicit CAHitNtupletCUDA(edm::ProductRegistry& reg); ~CAHitNtupletCUDA() override = 
default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT> tokenHitGPU_; edm::EDPutTokenT> tokenTrackGPU_; @@ -31,12 +30,10 @@ CAHitNtupletCUDA::CAHitNtupletCUDA(edm::ProductRegistry& reg) tokenTrackGPU_{reg.produces>()}, gpuAlgo_(reg) {} -void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void CAHitNtupletCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { auto bf = 0.0114256972711507; // 1/fieldInGeV - auto const& phits = iEvent.get(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{phits}; - auto const& hits = ctx.get(phits); + auto const& hits = ctx.get(iEvent, tokenHitGPU_); ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc index 723f7eb4c..d4ca2196b 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexProducerCUDA.cc @@ -1,12 +1,12 @@ #include #include "CUDACore/Product.h" +#include "CUDACore/ProduceContext.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" #include "gpuVertexFinder.h" @@ -58,14 +58,13 @@ PixelVertexProducerCUDA::PixelVertexProducerCUDA(edm::ProductRegistry& reg) } void PixelVertexProducerCUDA::produceOnGPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto const& ptracks = iEvent.get(tokenGPUTrack_); + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const* tracks = ctx.get(iEvent, tokenGPUTrack_).get(); - cms::cuda::ScopedContextProduce ctx{ptracks}; - auto const* tracks = ctx.get(ptracks).get(); + assert(tracks); - assert(tracks); - - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); + }); } void PixelVertexProducerCUDA::produceOnCPU(edm::Event& iEvent, const edm::EventSetup& iSetup) { diff --git a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc index d709f0c5e..4f38c0ad8 100644 --- a/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc +++ b/src/cudadev/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc @@ -1,25 +1,22 @@ #include +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDACore/HostProduct.h" #include "CUDADataFormats/ZVertexHeterogeneous.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "CUDACore/ScopedContext.h" -class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork { +class PixelVertexSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg); ~PixelVertexSoAFromCUDA() override = default; private: - void acquire(edm::Event const& iEvent, - edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; 
+ void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; @@ -33,15 +30,13 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg) void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); + cms::cuda::AcquireContext& ctx) { + auto const& inputData = ctx.get(iEvent, tokenCUDA_); m_soa = inputData.toHostAsync(ctx.stream()); } -void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) { // No copies.... iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); } diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index a5229b295..4f06bf540 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -4,8 +4,8 @@ #include // CMSSW includes +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -16,7 +16,6 @@ #include "DataFormats/FEDRawData.h" #include "DataFormats/FEDRawDataCollection.h" #include "DataFormats/SiPixelErrorCompact.h" -#include "Framework/EDProducer.h" #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" @@ -26,18 +25,14 @@ #include "SiPixelClusterThresholds.h" #include "SiPixelRawToClusterGPUKernel.h" -class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork { +class SiPixelRawToClusterCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg); ~SiPixelRawToClusterCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - - cms::cuda::ContextState ctxState_; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT> digiPutToken_; @@ -72,9 +67,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg) void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - + cms::cuda::AcquireContext& ctx) { auto const& hgpuMap = iSetup.get(); if (hgpuMap.hasQuality() != useQuality_) { throw std::runtime_error( @@ -170,9 +163,9 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, ctx.stream()); } -void 
SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - cms::cuda::ScopedContextProduce ctx{ctxState_}; - +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, + const edm::EventSetup& iSetup, + cms::cuda::ProduceContext& ctx) { auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 448f4b797..8d1f8b3c4 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -1,23 +1,20 @@ +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigisSoA.h" #include "Framework/EventSetup.h" #include "Framework/Event.h" -#include "Framework/EDProducer.h" #include "Framework/PluginFactory.h" -#include "CUDACore/ScopedContext.h" #include "CUDACore/host_unique_ptr.h" -class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork { +class SiPixelDigisSoAFromCUDA : public cms::cuda::SynchronizingEDProducer { public: explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg); ~SiPixelDigisSoAFromCUDA() override = default; private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; @@ -36,10 +33,7 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg) void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Do the transfer in a CUDA stream parallel to the computation CUDA stream - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - + cms::cuda::AcquireContext& ctx) { const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); @@ -49,7 +43,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, clus_ = gpuDigis.clusToHostAsync(ctx.stream()); } -void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { // The following line copies the data from the pinned host memory to // regular host memory. In principle that feels unnecessary (why not // just use the pinned host memory?). 
There are a few arguments for diff --git a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index 413982fc3..133fa8eaf 100644 --- a/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudadev/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -1,6 +1,7 @@ #include #include "CUDADataFormats/BeamSpotCUDA.h" +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,19 +9,17 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" -#include "CUDACore/ScopedContext.h" #include "CondFormats/PixelCPEFast.h" #include "PixelRecHitGPUKernel.h" -class SiPixelRecHitCUDA : public edm::EDProducer { +class SiPixelRecHitCUDA : public cms::cuda::EDProducer { public: explicit SiPixelRecHitCUDA(edm::ProductRegistry& reg); ~SiPixelRecHitCUDA() override = default; private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext& ctx) override; // The mess with inputs will be cleaned up when migrating to the new framework const edm::EDGetTokenT> tBeamSpot; @@ -36,13 +35,10 @@ SiPixelRecHitCUDA::SiPixelRecHitCUDA(edm::ProductRegistry& reg) tokenDigi_(reg.consumes>()), tokenHit_(reg.produces>()) {} -void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { +void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es, cms::cuda::ProduceContext& ctx) { PixelCPEFast const& fcpe = es.get(); - auto const& pclusters = iEvent.get(token_); - cms::cuda::ScopedContextProduce ctx{pclusters}; - - auto const& clusters = ctx.get(pclusters); + auto const& clusters = ctx.get(iEvent, token_); auto const& digis = ctx.get(iEvent, tokenDigi_); auto const& bs = ctx.get(iEvent, tBeamSpot); diff --git a/src/cudadev/plugin-Validation/CountValidator.cc b/src/cudadev/plugin-Validation/CountValidator.cc index 23352f5ba..bd9f40989 100644 --- a/src/cudadev/plugin-Validation/CountValidator.cc +++ b/src/cudadev/plugin-Validation/CountValidator.cc @@ -1,5 +1,5 @@ +#include "CUDACore/ProduceContext.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -63,24 +63,24 @@ void CountValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) ss << "Event " << iEvent.eventID() << " "; { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextProduce ctx{pdigis}; - auto const& count = iEvent.get(digiClusterCountToken_); - auto const& digis = ctx.get(iEvent, digiToken_); - auto const& clusters = ctx.get(iEvent, clusterToken_); - - if (digis.nModules() != count.nModules()) { - ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); - ok = false; - } - if (digis.nDigis() != count.nDigis()) { - ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); - ok = false; - } - if (clusters.nClusters() != count.nClusters()) { - ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); - ok = false; - } + cms::cuda::runProduce(iEvent.streamID(), [&](cms::cuda::ProduceContext& ctx) { + auto const& count = iEvent.get(digiClusterCountToken_); + auto const& digis = ctx.get(iEvent, digiToken_); + 
auto const& clusters = ctx.get(iEvent, clusterToken_); + + if (digis.nModules() != count.nModules()) { + ss << "\n N(modules) is " << digis.nModules() << " expected " << count.nModules(); + ok = false; + } + if (digis.nDigis() != count.nDigis()) { + ss << "\n N(digis) is " << digis.nDigis() << " expected " << count.nDigis(); + ok = false; + } + if (clusters.nClusters() != count.nClusters()) { + ss << "\n N(clusters) is " << clusters.nClusters() << " expected " << count.nClusters(); + ok = false; + } + }); } { diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index 8a888666b..f8af1773d 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -1,5 +1,5 @@ +#include "CUDACore/EDProducer.h" #include "CUDACore/Product.h" -#include "CUDACore/ScopedContext.h" #include "CUDADataFormats/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigisCUDA.h" @@ -8,22 +8,19 @@ #include "Framework/EventSetup.h" #include "Framework/Event.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "SimpleAtomicHisto.h" #include #include -class HistoValidator : public edm::EDProducerExternalWork { +class HistoValidator : public cms::cuda::SynchronizingEDProducer { public: explicit HistoValidator(edm::ProductRegistry& reg); private: - void acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) override; void endJob() override; edm::EDGetTokenT> digiToken_; @@ -88,11 +85,7 @@ HistoValidator::HistoValidator(edm::ProductRegistry& reg) trackToken_(reg.consumes()), vertexToken_(reg.consumes()) {} -void HistoValidator::acquire(const edm::Event& iEvent, - const edm::EventSetup& iSetup, - edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& pdigis = iEvent.get(digiToken_); - cms::cuda::ScopedContextAcquire ctx{pdigis, std::move(waitingTaskHolder)}; +void HistoValidator::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::AcquireContext& ctx) { auto const& digis = ctx.get(iEvent, digiToken_); auto const& clusters = ctx.get(iEvent, clusterToken_); auto const& hits = ctx.get(iEvent, hitToken_); @@ -113,7 +106,7 @@ void HistoValidator::acquire(const edm::Event& iEvent, h_size = hits.sizeToHostAsync(ctx.stream()); } -void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, cms::cuda::ProduceContext&) { histos["digi_n"].fill(nDigis); for (uint32_t i = 0; i < nDigis; ++i) { histos["digi_adc"].fill(h_adc[i]);
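Putting the pieces together, a device-to-host transfer module migrates to the new interface roughly as follows (a sketch modelled on the PixelVertexSoAFromCUDA and SiPixelDigisSoAFromCUDA changes above; DeviceSoA and HostSoA are placeholder types):

    class MySoAFromCUDA : public cms::cuda::SynchronizingEDProducer {
    public:
      explicit MySoAFromCUDA(edm::ProductRegistry& reg)
          : tokenCUDA_{reg.consumes<cms::cuda::Product<DeviceSoA>>()}, tokenSOA_{reg.produces<HostSoA>()} {}

      void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, cms::cuda::AcquireContext& ctx) override {
        auto const& deviceData = ctx.get(iEvent, tokenCUDA_);
        hostData_ = deviceData.toHostAsync(ctx.stream());  // asynchronous copy on the context's stream
      }

      void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, cms::cuda::ProduceContext&) override {
        // by the time produce() runs, the callback enqueued by AcquireContext::commit()
        // has fired, so the host-side copy is complete and can be put without wrapping
        iEvent.emplace(tokenSOA_, std::move(hostData_));
      }

    private:
      edm::EDGetTokenT<cms::cuda::Product<DeviceSoA>> tokenCUDA_;
      edm::EDPutTokenT<HostSoA> tokenSOA_;
      HostSoA hostData_;
    };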