-
Notifications
You must be signed in to change notification settings - Fork 47
[cudadev][RFC] Next version of CUDA EDModule API #224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d00f46b
f7a46ab
3afeb58
aa518d6
9f04f75
bdc498e
7e68862
fdf393f
5e0d422
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| #include "CUDACore/AcquireContext.h" | ||
|
|
||
| namespace cms::cuda { | ||
| void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_AcquireContext_h | ||
| #define HeterogeneousCore_CUDACore_AcquireContext_h | ||
|
|
||
| #include "CUDACore/EDGetterContextBase.h" | ||
| #include "CUDACore/TaskContext.h" | ||
|
|
||
| namespace cms::cuda { | ||
| /** | ||
| * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): | ||
| * - setting the current device | ||
| * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @makortel how is Is it
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Correct. (similar to how now |
||
| * - synchronizing between CUDA streams if necessary | ||
| * Users should not, however, construct it explicitly. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given this comment, should the constructor be
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I would attempt making constructors |
||
| */ | ||
| class AcquireContext : public impl::EDGetterContextBase { | ||
| public: | ||
| explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) | ||
| : EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {} | ||
| ~AcquireContext() = default; | ||
|
|
||
| template <typename F> | ||
| void pushNextTask(F&& f) { | ||
| holderHelper_.pushNextTask(std::forward<F>(f)); | ||
| } | ||
|
|
||
| void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { | ||
| holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); | ||
| } | ||
|
|
||
| // internal API | ||
| void commit(); | ||
|
|
||
| private: | ||
| impl::FwkContextHolderHelper holderHelper_; | ||
| }; | ||
|
|
||
| template <typename F> | ||
| void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) { | ||
| AcquireContext context(streamID, std::move(holder)); | ||
| func(context); | ||
| context.commit(); | ||
|
Comment on lines
+39
to
+41
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Browsing the code, this seems the only place where an Similarly for
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The separate
(I'm also thinking to rename the function to something more descriptive, maybe |
||
| } | ||
| } | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h | ||
| #define HeterogeneousCore_CUDACore_AnalyzeContext_h | ||
|
|
||
| #include "CUDACore/EDGetterContextBase.h" | ||
|
|
||
| namespace cms::cuda { | ||
| /** | ||
| * The aim of this class is to do necessary per-event "initialization" in analyze() | ||
| * - setting the current device | ||
| * - synchronizing between CUDA streams if necessary | ||
| * and enforce that those get done in a proper way in RAII fashion. | ||
| */ | ||
| class AnalyzeContext : public impl::EDGetterContextBase { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. It's mainly a copy-over from CMSSW (where there are tests using it), so I consider it more of a sketch here. |
||
| public: | ||
| /// Constructor to (possibly) re-use a CUDA stream | ||
| explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {} | ||
| }; | ||
| } | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| #include "CUDACore/EDGetterContextBase.h" | ||
| #include "CUDACore/cudaCheck.h" | ||
|
|
||
| namespace cms::cuda::impl { | ||
| void EDGetterContextBase::synchronizeStreams(int dataDevice, | ||
| cudaStream_t dataStream, | ||
| bool available, | ||
| cudaEvent_t dataEvent) { | ||
| if (dataDevice != device()) { | ||
| // Eventually replace with prefetch to current device (assuming unified memory works) | ||
| // If we won't go to unified memory, need to figure out something else... | ||
| throw std::runtime_error("Handling data from multiple devices is not yet supported"); | ||
| } | ||
|
|
||
| if (dataStream != stream()) { | ||
| // Different streams, need to synchronize | ||
| if (not available) { | ||
| // Event not yet occurred, so need to add synchronization | ||
| // here. Sychronization is done by making the CUDA stream to | ||
| // wait for an event, so all subsequent work in the stream | ||
| // will run only after the event has "occurred" (i.e. data | ||
| // product became available). | ||
| cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); | ||
| } | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h | ||
| #define HeterogeneousCore_CUDACore_EDGetterContextBase_h | ||
|
|
||
| #include "CUDACore/FwkContextBase.h" | ||
| #include "CUDACore/Product.h" | ||
| #include "Framework/EDGetToken.h" | ||
|
|
||
| namespace cms::cuda::impl { | ||
| /** | ||
| * This class is a base class for Context classes that should be | ||
| * able to read Event Data products | ||
| */ | ||
| class EDGetterContextBase : public FwkContextBase { | ||
| public: | ||
| template <typename T> | ||
| const T& get(const Product<T>& data) { | ||
| if (not isInitialized()) { | ||
| initialize(data); | ||
| } | ||
| synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); | ||
| return data.data_; | ||
| } | ||
|
|
||
| template <typename T> | ||
| const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) { | ||
| return get(iEvent.get(token)); | ||
| } | ||
|
|
||
| protected: | ||
| template <typename... Args> | ||
| EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {} | ||
|
|
||
| private: | ||
| void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); | ||
| }; | ||
| } | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,17 @@ | ||||||||||
| #include "CUDACore/EDProducer.h" | ||||||||||
|
|
||||||||||
| namespace cms::cuda { | ||||||||||
| void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { | ||||||||||
| runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| void SynchronizingEDProducer::acquire(edm::Event const& event, | ||||||||||
| edm::EventSetup const& eventSetup, | ||||||||||
| edm::WaitingTaskWithArenaHolder holder) { | ||||||||||
| runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); }); | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand the encapsulation, though personally I would find it simpler to avoid the lambda:
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, so this is intended to support the first point from cms-sw/cmssw#30266 (comment) .
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. |
||||||||||
| } | ||||||||||
|
|
||||||||||
| void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) { | ||||||||||
| runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); }); | ||||||||||
| } | ||||||||||
| } // namespace cms::cuda | ||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h | ||
| #define HeterogeneousCore_CUDACore_stream_EDProducer_h | ||
|
|
||
| #include "Framework/EDProducer.h" | ||
| #include "CUDACore/AcquireContext.h" | ||
| #include "CUDACore/ProduceContext.h" | ||
|
|
||
| namespace cms::cuda { | ||
| class EDProducer : public edm::EDProducer { | ||
| public: | ||
| void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; | ||
| virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; | ||
| }; | ||
|
|
||
| class SynchronizingEDProducer : public edm::EDProducerExternalWork { | ||
| public: | ||
| void acquire(edm::Event const& event, | ||
| edm::EventSetup const& eventSetup, | ||
| edm::WaitingTaskWithArenaHolder holder) override; | ||
| void produce(edm::Event& event, edm::EventSetup const& eventSetup) override; | ||
|
|
||
| virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0; | ||
| virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0; | ||
| }; | ||
| } // namespace cms::cuda | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| #include "CUDACore/FwkContextBase.h" | ||
| #include "CUDACore/StreamCache.h" | ||
| #include "CUDACore/cudaCheck.h" | ||
|
|
||
| #include "chooseDevice.h" | ||
|
|
||
| namespace cms::cuda::impl { | ||
| FwkContextBase::FwkContextBase(edm::StreamID streamID) : FwkContextBase(chooseDevice(streamID)) {} | ||
|
|
||
| FwkContextBase::FwkContextBase(int device) : currentDevice_(device) { cudaCheck(cudaSetDevice(currentDevice_)); } | ||
|
|
||
| FwkContextBase::FwkContextBase(int device, SharedStreamPtr stream) | ||
| : currentDevice_(device), stream_(std::make_shared<impl::StreamSharingHelper>(std::move(stream))) { | ||
| cudaCheck(cudaSetDevice(currentDevice_)); | ||
| } | ||
|
|
||
| void FwkContextBase::initialize() { stream_ = std::make_shared<impl::StreamSharingHelper>(getStreamCache().get()); } | ||
|
|
||
| void FwkContextBase::initialize(const ProductBase& data) { | ||
| SharedStreamPtr stream; | ||
| if (data.mayReuseStream()) { | ||
| stream = data.streamPtr(); | ||
| } else { | ||
| stream = getStreamCache().get(); | ||
| } | ||
| stream_ = std::make_shared<impl::StreamSharingHelper>(std::move(stream)); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_FwkContextBase_h | ||
| #define HeterogeneousCore_CUDACore_FwkContextBase_h | ||
|
|
||
| #include "CUDACore/ProductBase.h" | ||
| #include "CUDACore/SharedStreamPtr.h" | ||
| #include "Framework/Event.h" | ||
|
|
||
| namespace cms::cuda::impl { | ||
| /** | ||
| * This class is a base class for other Context classes for interacting with the framework | ||
| */ | ||
| class FwkContextBase { | ||
| public: | ||
| FwkContextBase(FwkContextBase const&) = delete; | ||
| FwkContextBase& operator=(FwkContextBase const&) = delete; | ||
| FwkContextBase(FwkContextBase&&) = delete; | ||
| FwkContextBase& operator=(FwkContextBase&&) = delete; | ||
|
|
||
| int device() const { return currentDevice_; } | ||
|
|
||
| cudaStream_t stream() { | ||
| if (not isInitialized()) { | ||
| initialize(); | ||
| } | ||
| return stream_->streamPtr().get(); | ||
| } | ||
| const SharedStreamPtr& streamPtr() { | ||
| if (not isInitialized()) { | ||
| initialize(); | ||
| } | ||
| return stream_->streamPtr(); | ||
| } | ||
|
|
||
| protected: | ||
| // The constructors set the current device, but the device | ||
| // is not set back to the previous value at the destructor. This | ||
| // should be sufficient (and tiny bit faster) as all CUDA API | ||
| // functions relying on the current device should be called from | ||
| // the scope where this context is. The current device doesn't | ||
| // really matter between modules (or across TBB tasks). | ||
| explicit FwkContextBase(edm::StreamID streamID); | ||
|
|
||
| explicit FwkContextBase(int device); | ||
|
|
||
| // meant only for testing | ||
| explicit FwkContextBase(int device, SharedStreamPtr stream); | ||
|
|
||
| bool isInitialized() const { return bool(stream_); } | ||
|
|
||
| void initialize(); | ||
| void initialize(const ProductBase& data); | ||
|
|
||
| const std::shared_ptr<impl::StreamSharingHelper>& streamSharingHelper() { | ||
| if (not isInitialized()) { | ||
| initialize(); | ||
| } | ||
| return stream_; | ||
| } | ||
|
|
||
| private: | ||
| int currentDevice_ = -1; | ||
| std::shared_ptr<impl::StreamSharingHelper> stream_; | ||
| }; | ||
| } | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_FwkContextHolderHelper_h | ||
| #define HeterogeneousCore_CUDACore_FwkContextHolderHelper_h | ||
|
|
||
| namespace cms::cuda::impl { | ||
| class FwkContextHolderHelper { | ||
| public: | ||
| FwkContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder, int device) | ||
| : waitingTaskHolder_{std::move(waitingTaskHolder)}, device_{device} {} | ||
|
|
||
| template <typename F> | ||
| void pushNextTask(F&& f); | ||
|
|
||
| void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { | ||
| waitingTaskHolder_ = std::move(waitingTaskHolder); | ||
| } | ||
|
|
||
| void enqueueCallback(cudaStream_t stream); | ||
|
|
||
| private: | ||
| edm::WaitingTaskWithArenaHolder waitingTaskHolder_; | ||
| int device_; | ||
| }; | ||
| } | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| #include "CUDACore/ProduceContext.h" | ||
| #include "CUDACore/cudaCheck.h" | ||
|
|
||
| namespace cms::cuda { | ||
| void ProduceContext::commit() { cudaCheck(cudaEventRecord(event_.get(), stream())); } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| #ifndef HeterogeneousCore_CUDACore_ProduceContext_h | ||
| #define HeterogeneousCore_CUDACore_ProduceContext_h | ||
|
|
||
| #include "CUDACore/EDGetterContextBase.h" | ||
| #include "CUDACore/EventCache.h" | ||
| #include "Framework/EDPutToken.h" | ||
|
|
||
| namespace cms::cuda { | ||
| /** | ||
| * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): | ||
| * - setting the current device | ||
| * - synchronizing between CUDA streams if necessary | ||
| * Users should not, however, construct it explicitly. | ||
| */ | ||
| class ProduceContext : public impl::EDGetterContextBase { | ||
| public: | ||
| explicit ProduceContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {} | ||
|
|
||
| ~ProduceContext() = default; | ||
|
|
||
| template <typename T> | ||
| std::unique_ptr<Product<T>> wrap(T data) { | ||
| // make_unique doesn't work because of private constructor | ||
| return std::unique_ptr<Product<T>>(new Product<T>(device(), streamSharingHelper(), event_, std::move(data))); | ||
| } | ||
|
|
||
| template <typename T, typename... Args> | ||
| auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) { | ||
| return iEvent.emplace(token, device(), streamSharingHelper(), event_, std::forward<Args>(args)...); | ||
| } | ||
|
|
||
| // internal API | ||
| void commit(); | ||
|
|
||
| private: | ||
| // This construcor is only meant for testing | ||
| explicit ProduceContext(int device, SharedStreamPtr stream, SharedEventPtr event) | ||
| : EDGetterContextBase(device, std::move(stream)), event_{std::move(event)} {} | ||
|
|
||
| // create the CUDA Event upfront to catch possible errors from its creation | ||
| SharedEventPtr event_ = getEventCache().get(); | ||
| }; | ||
|
|
||
| template <typename F> | ||
| void runProduce(edm::StreamID streamID, F&& func) { | ||
| ProduceContext context(streamID); | ||
| func(context); | ||
| context.commit(); | ||
| } | ||
| } | ||
|
|
||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this defined in the .cc file, instead of the .h file like the other methods ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't think of any good reason now, so the definition could be moved to the header.