diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h index 1263f4f0e..a1efa8913 100644 --- a/src/alpaka/AlpakaCore/ContextState.h +++ b/src/alpaka/AlpakaCore/ContextState.h @@ -4,7 +4,6 @@ #include #include "AlpakaCore/alpakaConfig.h" -#include "AlpakaCore/SharedStreamPtr.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@ -15,6 +14,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ContextState { public: + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Device = alpaka::Dev; + ContextState() = default; ~ContextState() = default; @@ -28,24 +30,23 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class ScopedContextTask; - void set(int device, SharedStreamPtr stream) { + void set(std::shared_ptr stream) { throwIfStream(); - device_ = device; stream_ = std::move(stream); } - int device() const { return device_; } + Device device() const { return alpaka::getDev(*stream_); } - const SharedStreamPtr& streamPtr() const { + const std::shared_ptr& streamPtr() const { throwIfNoStream(); return stream_; } - SharedStreamPtr releaseStreamPtr() { + std::shared_ptr releaseStreamPtr() { throwIfNoStream(); // This function needs to effectively reset stream_ (i.e. stream_ // must be empty after this function). This behavior ensures that - // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the std::shared_ptr is not hold for inadvertedly long (i.e. to // the next event), and is checked at run time. 
return std::move(stream_); } @@ -53,8 +54,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { void throwIfStream() const; void throwIfNoStream() const; - SharedStreamPtr stream_; - int device_; + std::shared_ptr stream_; }; } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/ESProduct.h b/src/alpaka/AlpakaCore/ESProduct.h index f6a134e11..3dfb1a682 100644 --- a/src/alpaka/AlpakaCore/ESProduct.h +++ b/src/alpaka/AlpakaCore/ESProduct.h @@ -8,7 +8,6 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/EventCache.h" -#include "AlpakaCore/SharedEventPtr.h" #include "AlpakaCore/currentDevice.h" #include "AlpakaCore/deviceCount.h" #include "AlpakaCore/eventWorkHasCompleted.h" @@ -18,11 +17,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template class ESProduct { public: + using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event; + template ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = - ::cms::alpakatools::getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(acc); + gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache().get(acc); } } @@ -92,7 +92,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: struct Item { mutable std::mutex m_mutex; - mutable SharedEventPtr m_event; // guarded by m_mutex + mutable std::shared_ptr m_event; // guarded by m_mutex // non-null if some thread is already filling (cudaStream_t is just a pointer) mutable ::ALPAKA_ACCELERATOR_NAMESPACE::Queue* m_fillingStream = nullptr; // guarded by m_mutex mutable std::atomic m_filled = false; // easy check if data has been filled already or not diff --git a/src/alpaka/AlpakaCore/Product.h b/src/alpaka/AlpakaCore/Product.h index 6a5e0f5f6..a14a2fad6 100644 --- a/src/alpaka/AlpakaCore/Product.h +++ b/src/alpaka/AlpakaCore/Product.h @@ -46,12 
+46,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class edm::Wrapper>; - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} + explicit Product(std::shared_ptr stream, std::shared_ptr event, T data) + : ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} + explicit Product(std::shared_ptr stream, std::shared_ptr event, Args&&... args) + : ProductBase(std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! }; diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h index 3d1f09cf0..63b071a3a 100644 --- a/src/alpaka/AlpakaCore/ProductBase.h +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -4,8 +4,9 @@ #include #include -#include "AlpakaCore/SharedEventPtr.h" -#include "AlpakaCore/SharedStreamPtr.h" +#include + +#include "AlpakaCore/alpakaConfigAcc.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@ -19,6 +20,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ProductBase { public: + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Event = alpaka::Event; + ProductBase() = default; // Needed only for ROOT dictionary generation ~ProductBase(); @@ -27,43 +31,41 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ProductBase(ProductBase&& other) : stream_{std::move(other.stream_)}, event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} + mayReuseStream_{other.mayReuseStream_.load()} {} ProductBase& operator=(ProductBase&& other) { stream_ = std::move(other.stream_); event_ = std::move(other.event_); mayReuseStream_ = 
other.mayReuseStream_.load(); - device_ = other.device_; return *this; } bool isValid() const { return stream_.get() != nullptr; } bool isAvailable() const; - int device() const { return device_; } + alpaka::Dev device() const { return alpaka::getDev(stream()); } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } + Queue& stream() const { return *(stream_.get()); } // cudaEvent_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>& event() const { return *(event_.get()); } + Event& event() const { return *(event_.get()); } protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) - : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} + explicit ProductBase(std::shared_ptr stream, std::shared_ptr event) + : stream_{std::move(stream)}, event_{std::move(event)} {} private: friend class impl::ScopedContextBase; friend class ScopedContextProduce; // The following function is intended to be used only from ScopedContext - const SharedStreamPtr& streamPtr() const { return stream_; } + const std::shared_ptr& streamPtr() const { return stream_; } bool mayReuseStream() const { bool expected = true; @@ -75,17 +77,14 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // The cudaStream_t is really shared among edm::Event products, so // using shared_ptr also here - SharedStreamPtr stream_; //! + std::shared_ptr stream_; //! 
// shared_ptr because of caching in ::cms::alpakatools::EventCache - SharedEventPtr event_; //! + std::shared_ptr event_; //! // This flag tells whether the CUDA stream may be reused by a // consumer or not. The goal is to have a "chain" of modules to // queue their work to the same stream. mutable std::atomic mayReuseStream_ = true; //! - - // The CUDA device associated with this product - int device_ = -1; //! }; } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index c20e8dbed..a146422b2 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -7,8 +7,6 @@ #include "AlpakaCore/ContextState.h" #include "AlpakaCore/EventCache.h" #include "AlpakaCore/Product.h" -#include "AlpakaCore/SharedEventPtr.h" -#include "AlpakaCore/SharedStreamPtr.h" #include "Framework/EDGetToken.h" #include "Framework/EDPutToken.h" #include "Framework/Event.h" @@ -26,14 +24,17 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // This class is intended to be derived by other ScopedContext*, not for general use class ScopedContextBase { public: - int device() const { return currentDevice_; } + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Device = alpaka::Dev; + + Device device() const { return alpaka::getDev(*stream_); } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. 
- ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } - const SharedStreamPtr& streamPtr() const { return stream_; } + Queue& stream() const { return *(stream_.get()); } + const std::shared_ptr& streamPtr() const { return stream_; } protected: // The constructors set the current device, but the device @@ -43,44 +44,24 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // the scope where this context is. The current device doesn't // really matter between modules (or across TBB tasks). - template - ScopedContextBase(T_Acc acc, const ProductBase& data) : currentDevice_(data.device()) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif - if (data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(acc); - } - } + ScopedContextBase(const ProductBase& data) + : stream_{data.mayReuseStream() ? data.streamPtr() : getStreamCache().get(data.device())} {} - explicit ScopedContextBase(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif - } + explicit ScopedContextBase(std::shared_ptr stream) : stream_(std::move(stream)) {} - template - explicit ScopedContextBase(T_Acc acc, edm::StreamID streamID) - : currentDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID)) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(acc); - } + explicit ScopedContextBase(edm::StreamID streamID) + : stream_{getStreamCache().get( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID))} {} private: - int currentDevice_; - SharedStreamPtr stream_; + std::shared_ptr stream_; }; class ScopedContextGetterBase : public ScopedContextBase { public: template const T& get(const Product& data) { - 
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + synchronizeStreams(data.stream(), data.isAvailable(), data.event()); return data.data_; } @@ -93,10 +74,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - void synchronizeStreams(int dataDevice, - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, - bool available, - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent); + void synchronizeStreams(Queue& dataStream, bool available, alpaka::Event dataEvent); }; class ScopedContextHolderHelper { @@ -111,7 +89,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { waitingTaskHolder_ = std::move(waitingTaskHolder); } - void enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream); + void enqueueCallback(ScopedContextBase::Queue& stream); private: edm::WaitingTaskWithArenaHolder waitingTaskHolder_; @@ -128,30 +106,24 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextAcquire : public impl::ScopedContextGetterBase { public: /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - template - explicit ScopedContextAcquire(T_Acc acc, edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(acc, streamID), holderHelper_{std::move(waitingTaskHolder)} {} + explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} // /// Constructor to create a new CUDA stream, and the context is needed after acquire() - template - explicit ScopedContextAcquire(T_Acc acc, - edm::StreamID streamID, + explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder, ContextState& state) - : ScopedContextGetterBase(acc, streamID), 
holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} // /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - template - explicit ScopedContextAcquire(T_Acc acc, const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)} {} + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} // /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - template - explicit ScopedContextAcquire(T_Acc acc, - const ProductBase& data, + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder, ContextState& state) - : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} ~ScopedContextAcquire(); @@ -182,27 +154,24 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextProduce : public impl::ScopedContextGetterBase { public: /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) - : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + explicit ScopedContextProduce(ContextState& state) : ScopedContextGetterBase(state.releaseStreamPtr()) {} - template - explicit ScopedContextProduce(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} - template - explicit ScopedContextProduce(T_Acc acc, edm::StreamID streamID) : 
ScopedContextGetterBase(acc, streamID) {} + explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} /// Record the CUDA event, all asynchronous work must have been queued before the destructor ~ScopedContextProduce(); - template - std::unique_ptr> wrap(T_Acc acc, T data) { + template + std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), getEvent(acc), std::move(data))); + return std::unique_ptr>(new Product(streamPtr(), getEvent(), std::move(data))); } - template - auto emplace(T_Acc acc, edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - // return iEvent.emplace(token, device(), streamPtr(), getEvent(acc), std::forward(args)...); + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + // return iEvent.emplace(token, streamPtr(), getEvent(), std::forward(args)...); return iEvent.emplace(token, std::forward(args)...); // TODO } @@ -210,13 +179,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: friend class ::ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest::TestScopedContext; - explicit ScopedContextProduce(int device, SharedStreamPtr stream) - : ScopedContextGetterBase(device, std::move(stream)) {} + explicit ScopedContextProduce(std::shared_ptr stream) : ScopedContextGetterBase(std::move(stream)) {} - template - auto getEvent(T_Acc acc) { - return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(acc); - } + auto getEvent() { return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(device()); } // create the CUDA Event upfront to catch possible errors from its creation }; @@ -231,7 +196,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { public: /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - :
ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + : ScopedContextBase(state->streamPtr()), // don't move, state is re-used afterwards holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {} @@ -260,8 +225,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextAnalyze : public impl::ScopedContextGetterBase { public: /// Constructor to (possibly) re-use a CUDA stream - template - explicit ScopedContextAnalyze(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} }; namespace impl { diff --git a/src/alpaka/AlpakaCore/SharedEventPtr.h b/src/alpaka/AlpakaCore/SharedEventPtr.h deleted file mode 100644 index 3582a928c..000000000 --- a/src/alpaka/AlpakaCore/SharedEventPtr.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h -#define HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h - -#include -#include - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - using SharedEventPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Event>; - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif diff --git a/src/alpaka/AlpakaCore/SharedStreamPtr.h b/src/alpaka/AlpakaCore/SharedStreamPtr.h deleted file mode 100644 index 38909af5a..000000000 --- a/src/alpaka/AlpakaCore/SharedStreamPtr.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h -#define HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h - -#include -#include - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - using SharedStreamPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>; - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif diff --git a/src/alpaka/AlpakaCore/StreamCache.h 
b/src/alpaka/AlpakaCore/StreamCache.h index e24f7dfb9..22b93a312 100644 --- a/src/alpaka/AlpakaCore/StreamCache.h +++ b/src/alpaka/AlpakaCore/StreamCache.h @@ -22,7 +22,7 @@ namespace cms::alpakatools { // Gets a (cached) CUDA stream for the current device. The stream // will be returned to the cache by the shared_ptr destructor. // This function is thread safe - ALPAKA_FN_HOST std::shared_ptr get(Device dev) { + ALPAKA_FN_HOST std::shared_ptr get(Device const& dev) { return cache_[cms::alpakatools::getDevIndex(dev)].makeOrGet([dev]() { return std::make_unique(dev); }); } diff --git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc index 52967f959..2944d9638 100644 --- a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc +++ b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc @@ -1,28 +1,21 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/ScopedContext.h" -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; -} // namespace - namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { namespace impl { - void ScopedContextGetterBase::synchronizeStreams(int dataDevice, - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, + void ScopedContextGetterBase::synchronizeStreams(Queue& dataStream, bool available, - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... 
- throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - + alpaka::Event dataEvent) { if (dataStream != stream()) { - // Different streams, need to synchronize + // Different streams, check if the underlying device is the same + if (alpaka::getDev(dataStream) != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + // Synchronize the two streams if (not available) { // Event not yet occurred, so need to add synchronization // here. Sychronization is done by making the CUDA stream to @@ -34,13 +27,13 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } } - void ScopedContextHolderHelper::enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream) { - alpaka::enqueue(stream, [this, device]() { - auto data = new CallbackData{waitingTaskHolder_, device}; - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device2 = guard->device; - waitingTaskHolder.doneWaiting(nullptr); + void ScopedContextHolderHelper::enqueueCallback(ScopedContextBase::Queue& stream) { + alpaka::enqueue(stream, [holder = waitingTaskHolder_]() { + // TODO: The functor is required to be const, so can't use + // 'mutable', so I'm copying the object as a workaround. I + // wonder if there are any wider implications. 
+ auto h = holder; + h.doneWaiting(nullptr); }); } } // namespace impl @@ -48,9 +41,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { //////////////////// ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); + holderHelper_.enqueueCallback(stream()); if (contextState_) { - contextState_->set(device(), streamPtr()); + contextState_->set(streamPtr()); } } @@ -73,6 +66,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { //////////////////// - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } + ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(stream()); } } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc index d7296fd41..ab0364a73 100644 --- a/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc +++ b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc @@ -1,17 +1,18 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/chooseDevice.h" -#include "AlpakaCore/deviceCount.h" +#include "AlpakaCore/alpakaDevAcc.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - int chooseDevice(edm::StreamID id) { + ::ALPAKA_ACCELERATOR_NAMESPACE::Device const& chooseDevice(edm::StreamID id) { // For startes we "statically" assign the device based on // edm::Stream number. This is suboptimal if the number of // edm::Streams is not a multiple of the number of CUDA devices // (and even then there is no load balancing). 
// TODO: improve the "assignment" logic - return id % ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount(); + auto const& devices = ::ALPAKA_ACCELERATOR_NAMESPACE::devices; + return devices[id % devices.size()]; } } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/chooseDevice.h b/src/alpaka/AlpakaCore/chooseDevice.h index 9580e4439..fec1bdbf4 100644 --- a/src/alpaka/AlpakaCore/chooseDevice.h +++ b/src/alpaka/AlpakaCore/chooseDevice.h @@ -6,7 +6,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - int chooseDevice(edm::StreamID id); + ::ALPAKA_ACCELERATOR_NAMESPACE::Device const& chooseDevice(edm::StreamID id); } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc index 157aca30b..0423aea84 100644 --- a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc +++ b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc @@ -26,11 +26,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void BeamSpotToAlpaka::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { auto const& bsRaw = iSetup.get(); - // TO DO: Add inter-event parallelization. cms::alpaka::ScopedContextProduce? 
- ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; BeamSpotAlpaka bsDevice(&bsRaw, ctx.stream()); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, bsPutToken_, std::move(bsDevice)); + ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc index 51801c83f..004228751 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc @@ -36,12 +36,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& hits = iEvent.get(tokenHitGPU_); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenTrackGPU_, - gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc index 064648534..6e7c08a6a 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc @@ -76,12 +76,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - 
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // DO NOT make a copy (actually TWO....) - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, tokenSOA_, std::move(outputData)); + ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc index 175d22163..01e238f8b 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc @@ -51,12 +51,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& tracksBuf = iEvent.get(tokenTrack_); auto const tracks = alpaka::getPtrNative(tracksBuf); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenVertex_, - m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenVertex_, m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc index eb017e2eb..7fd14cc50 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc @@ -57,12 +57,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData 
= iEvent.get(tokenAlpaka_); auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // No copies.... - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, tokenSOA_, std::move(outputData)); + ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc index edd08fd14..60ca31bd0 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc @@ -140,8 +140,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // end of for loop - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; gpuAlgo_.makeClustersAsync(isRun2_, gpuMap, gpuModulesToUnpack, @@ -156,10 +155,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ctx.stream()); auto tmp = gpuAlgo_.getResults(); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, digiPutToken_, std::move(tmp.first)); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, clusterPutToken_, std::move(tmp.second)); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); if (includeErrors_) { - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); } } diff --git 
a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc index c8b9046b6..2098dd1a8 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc @@ -54,12 +54,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // TO DO: Async: Would need to add a queue as a parameter, not async for now! - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE