From e3fdd33ae06ca298d2d0a77da2edc220a327a7f5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 21 Oct 2021 17:31:53 -0700 Subject: [PATCH 1/3] [alpaka] Simplify Device handling in ScopedContext and Product --- src/alpaka/AlpakaCore/ContextState.h | 13 ++- src/alpaka/AlpakaCore/Product.h | 8 +- src/alpaka/AlpakaCore/ProductBase.h | 13 +-- src/alpaka/AlpakaCore/ScopedContext.h | 84 +++++++------------ src/alpaka/AlpakaCore/alpaka/ScopedContext.cc | 27 +++--- src/alpaka/AlpakaCore/alpaka/chooseDevice.cc | 7 +- src/alpaka/AlpakaCore/chooseDevice.h | 2 +- .../alpaka/BeamSpotToAlpaka.cc | 6 +- .../alpaka/CAHitNtupletAlpaka.cc | 8 +- .../alpaka/PixelTrackSoAFromAlpaka.cc | 5 +- .../alpaka/PixelVertexProducerAlpaka.cc | 8 +- .../alpaka/PixelVertexSoAFromAlpaka.cc | 5 +- .../alpaka/SiPixelRawToCluster.cc | 9 +- .../alpaka/SiPixelRecHitAlpaka.cc | 8 +- 14 files changed, 79 insertions(+), 124 deletions(-) diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h index 1263f4f0e..59d29d80c 100644 --- a/src/alpaka/AlpakaCore/ContextState.h +++ b/src/alpaka/AlpakaCore/ContextState.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_AlpakaCore_ContextState_h #include +#include #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/SharedStreamPtr.h" @@ -15,6 +16,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ContextState { public: + using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; + ContextState() = default; ~ContextState() = default; @@ -28,13 +31,13 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class ScopedContextTask; - void set(int device, SharedStreamPtr stream) { + void set(Device device, SharedStreamPtr stream) { throwIfStream(); - device_ = device; + device_ = std::move(device); stream_ = std::move(stream); } - int device() const { return device_; } + Device const& device() const { return *device_; } const SharedStreamPtr& streamPtr() const { 
throwIfNoStream(); @@ -54,7 +57,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { void throwIfNoStream() const; SharedStreamPtr stream_; - int device_; + // Work around no default constructor for Device + // On the other hand, we don't strictly need the Device here... + std::optional device_; }; } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/Product.h b/src/alpaka/AlpakaCore/Product.h index 6a5e0f5f6..662f8bb09 100644 --- a/src/alpaka/AlpakaCore/Product.h +++ b/src/alpaka/AlpakaCore/Product.h @@ -46,12 +46,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class edm::Wrapper>; - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} + explicit Product(SharedStreamPtr stream, SharedEventPtr event, T data) + : ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} + explicit Product(SharedStreamPtr stream, SharedEventPtr event, Args&&... args) + : ProductBase(std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! 
}; diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h index 3d1f09cf0..e6bc08ed5 100644 --- a/src/alpaka/AlpakaCore/ProductBase.h +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -27,20 +27,18 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ProductBase(ProductBase&& other) : stream_{std::move(other.stream_)}, event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} + mayReuseStream_{other.mayReuseStream_.load()} {} ProductBase& operator=(ProductBase&& other) { stream_ = std::move(other.stream_); event_ = std::move(other.event_); mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; return *this; } bool isValid() const { return stream_.get() != nullptr; } bool isAvailable() const; - int device() const { return device_; } + ::ALPAKA_ACCELERATOR_NAMESPACE::Device device() const { return alpaka::getDev(stream()); } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself @@ -55,8 +53,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>& event() const { return *(event_.get()); } protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) - : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} + explicit ProductBase(SharedStreamPtr stream, SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)} {} private: friend class impl::ScopedContextBase; @@ -83,9 +81,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // consumer or not. The goal is to have a "chain" of modules to // queue their work to the same stream. mutable std::atomic mayReuseStream_ = true; //! - - // The CUDA device associated with this product - int device_ = -1; //! 
}; } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index c20e8dbed..5ff26540a 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -26,7 +26,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // This class is intended to be derived by other ScopedContext*, not for general use class ScopedContextBase { public: - int device() const { return currentDevice_; } + using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; + + Device const& device() const { return currentDevice_; } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ScopedContext itself @@ -43,36 +45,24 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // the scope where this context is. The current device doesn't // really matter between modules (or across TBB tasks). - template - ScopedContextBase(T_Acc acc, const ProductBase& data) : currentDevice_(data.device()) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif + ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { if (data.mayReuseStream()) { stream_ = data.streamPtr(); } else { - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(acc); + stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(currentDevice_); } } - explicit ScopedContextBase(int device, SharedStreamPtr stream) - : currentDevice_(device), stream_(std::move(stream)) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif - } + explicit ScopedContextBase(Device device, SharedStreamPtr stream) + : currentDevice_(std::move(device)), stream_(std::move(stream)) {} - template - explicit ScopedContextBase(T_Acc acc, edm::StreamID streamID) + explicit ScopedContextBase(edm::StreamID streamID) : 
currentDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID)) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(currentDevice_); -#endif - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(acc); + stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(currentDevice_); } private: - int currentDevice_; + Device const currentDevice_; SharedStreamPtr stream_; }; @@ -93,7 +83,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - void synchronizeStreams(int dataDevice, + void synchronizeStreams(Device const& dataDevice, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, bool available, alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent); @@ -111,7 +101,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { waitingTaskHolder_ = std::move(waitingTaskHolder); } - void enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream); + void enqueueCallback(::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream); private: edm::WaitingTaskWithArenaHolder waitingTaskHolder_; @@ -128,30 +118,24 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextAcquire : public impl::ScopedContextGetterBase { public: /// Constructor to create a new CUDA stream (no need for context beyond acquire()) - template - explicit ScopedContextAcquire(T_Acc acc, edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(acc, streamID), holderHelper_{std::move(waitingTaskHolder)} {} + explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} // /// Constructor to create a new CUDA stream, and the context is needed after acquire() - template - explicit ScopedContextAcquire(T_Acc acc, - edm::StreamID streamID, + 
explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder, ContextState& state) - : ScopedContextGetterBase(acc, streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} // /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - template - explicit ScopedContextAcquire(T_Acc acc, const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)} {} + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} // /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - template - explicit ScopedContextAcquire(T_Acc acc, - const ProductBase& data, + explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder, ContextState& state) - : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} ~ScopedContextAcquire(); @@ -185,23 +169,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { explicit ScopedContextProduce(ContextState& state) : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} - template - explicit ScopedContextProduce(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} - template - explicit ScopedContextProduce(T_Acc acc, edm::StreamID streamID) : ScopedContextGetterBase(acc, streamID) {} + explicit ScopedContextProduce(edm::StreamID streamID) : 
ScopedContextGetterBase(streamID) {} /// Record the CUDA event, all asynchronous work must have been queued before the destructor ~ScopedContextProduce(); - template - std::unique_ptr> wrap(T_Acc acc, T data) { + template + std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(device(), streamPtr(), getEvent(acc), std::move(data))); + return std::unique_ptr>(new Product(streamPtr(), std::move(data))); } - template - auto emplace(T_Acc acc, edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { // return iEvent.emplace(token, device(), streamPtr(), getEvent(acc), std::forward(args)...); return iEvent.emplace(token, std::forward(args)...); // TODO @@ -210,13 +192,10 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: friend class ::ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest::TestScopedContext; - explicit ScopedContextProduce(int device, SharedStreamPtr stream) - : ScopedContextGetterBase(device, std::move(stream)) {} + explicit ScopedContextProduce(Device device, SharedStreamPtr stream) + : ScopedContextGetterBase(std::move(device), std::move(stream)) {} - template - auto getEvent(T_Acc acc) { - return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(acc); - } + auto getEvent() { return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(device()); } // create the CUDA Event upfront to catch possible errors from its creation }; @@ -260,8 +239,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextAnalyze : public impl::ScopedContextGetterBase { public: /// Constructor to (possibly) re-use a CUDA stream - template - explicit ScopedContextAnalyze(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} }; namespace impl { diff 
--git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc index 52967f959..a7490563c 100644 --- a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc +++ b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc @@ -1,17 +1,10 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/ScopedContext.h" -namespace { - struct CallbackData { - edm::WaitingTaskWithArenaHolder holder; - int device; - }; -} // namespace - namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { namespace impl { - void ScopedContextGetterBase::synchronizeStreams(int dataDevice, + void ScopedContextGetterBase::synchronizeStreams(Device const& dataDevice, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, bool available, alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent) { @@ -34,13 +27,13 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } } - void ScopedContextHolderHelper::enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream) { - alpaka::enqueue(stream, [this, device]() { - auto data = new CallbackData{waitingTaskHolder_, device}; - std::unique_ptr guard{reinterpret_cast(data)}; - edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; - int device2 = guard->device; - waitingTaskHolder.doneWaiting(nullptr); + void ScopedContextHolderHelper::enqueueCallback(::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream) { + alpaka::enqueue(stream, [holder = waitingTaskHolder_]() { + // TODO: The functor is required to be const, so can't use + // 'mutable', so I'm copying the object as a workaround. I + // wonder if there are any wider implications. 
+ auto h = holder; + h.doneWaiting(nullptr); }); } } // namespace impl @@ -48,7 +41,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { //////////////////// ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(device(), stream()); + holderHelper_.enqueueCallback(stream()); if (contextState_) { contextState_->set(device(), streamPtr()); } @@ -73,6 +66,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { //////////////////// - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } + ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(stream()); } } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc index d7296fd41..ab0364a73 100644 --- a/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc +++ b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc @@ -1,17 +1,18 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/chooseDevice.h" -#include "AlpakaCore/deviceCount.h" +#include "AlpakaCore/alpakaDevAcc.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - int chooseDevice(edm::StreamID id) { + ::ALPAKA_ACCELERATOR_NAMESPACE::Device const& chooseDevice(edm::StreamID id) { // For startes we "statically" assign the device based on // edm::Stream number. This is suboptimal if the number of // edm::Streams is not a multiple of the number of CUDA devices // (and even then there is no load balancing). 
// TODO: improve the "assignment" logic - return id % ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount(); + auto const& devices = ::ALPAKA_ACCELERATOR_NAMESPACE::devices; + return devices[id % devices.size()]; } } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/chooseDevice.h b/src/alpaka/AlpakaCore/chooseDevice.h index 9580e4439..fec1bdbf4 100644 --- a/src/alpaka/AlpakaCore/chooseDevice.h +++ b/src/alpaka/AlpakaCore/chooseDevice.h @@ -6,7 +6,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - int chooseDevice(edm::StreamID id); + ::ALPAKA_ACCELERATOR_NAMESPACE::Device const& chooseDevice(edm::StreamID id); } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc index 157aca30b..0423aea84 100644 --- a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc +++ b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc @@ -26,11 +26,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void BeamSpotToAlpaka::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { auto const& bsRaw = iSetup.get(); - // TO DO: Add inter-event parallelization. cms::alpaka::ScopedContextProduce? 
- ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; BeamSpotAlpaka bsDevice(&bsRaw, ctx.stream()); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, bsPutToken_, std::move(bsDevice)); + ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc index 51801c83f..004228751 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc @@ -36,12 +36,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& hits = iEvent.get(tokenHitGPU_); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenTrackGPU_, - gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc index 064648534..6e7c08a6a 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc @@ -76,12 +76,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - 
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // DO NOT make a copy (actually TWO....) - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, tokenSOA_, std::move(outputData)); + ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc index 175d22163..01e238f8b 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc @@ -51,12 +51,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& tracksBuf = iEvent.get(tokenTrack_); auto const tracks = alpaka::getPtrNative(tracksBuf); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenVertex_, - m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenVertex_, m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc index eb017e2eb..7fd14cc50 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc @@ -57,12 +57,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData 
= iEvent.get(tokenAlpaka_); auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // No copies.... - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, tokenSOA_, std::move(outputData)); + ctx.emplace(iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc index edd08fd14..60ca31bd0 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc @@ -140,8 +140,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // end of for loop - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; gpuAlgo_.makeClustersAsync(isRun2_, gpuMap, gpuModulesToUnpack, @@ -156,10 +155,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ctx.stream()); auto tmp = gpuAlgo_.getResults(); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, digiPutToken_, std::move(tmp.first)); - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, clusterPutToken_, std::move(tmp.second)); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); if (includeErrors_) { - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); } } diff --git 
a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc index c8b9046b6..2098dd1a8 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc @@ -54,12 +54,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // TO DO: Async: Would need to add a queue as a parameter, not async for now! - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent.streamID()}; - ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], - iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE From be55c61a2e9cc39f56a12adf6d48e47365b9423d Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Sun, 24 Oct 2021 14:33:24 +0200 Subject: [PATCH 2/3] [alpaka] Remove the SharedEventPtr and SharedStreamPtr typedefs --- src/alpaka/AlpakaCore/ContextState.h | 12 +++++----- src/alpaka/AlpakaCore/ESProduct.h | 8 +++---- src/alpaka/AlpakaCore/Product.h | 4 ++-- src/alpaka/AlpakaCore/ProductBase.h | 22 ++++++++++-------- src/alpaka/AlpakaCore/ScopedContext.h | 23 +++++++++---------- src/alpaka/AlpakaCore/SharedEventPtr.h | 15 ------------ src/alpaka/AlpakaCore/SharedStreamPtr.h | 15 ------------ src/alpaka/AlpakaCore/alpaka/ScopedContext.cc | 6 ++--- 8 files changed, 39 insertions(+), 66 deletions(-) delete mode 100644 src/alpaka/AlpakaCore/SharedEventPtr.h delete mode 100644 src/alpaka/AlpakaCore/SharedStreamPtr.h diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h index 59d29d80c..b85d76286 100644 --- a/src/alpaka/AlpakaCore/ContextState.h +++ 
b/src/alpaka/AlpakaCore/ContextState.h @@ -5,7 +5,6 @@ #include #include "AlpakaCore/alpakaConfig.h" -#include "AlpakaCore/SharedStreamPtr.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@ -17,6 +16,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ContextState { public: using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; ContextState() = default; ~ContextState() = default; @@ -31,7 +31,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class ScopedContextTask; - void set(Device device, SharedStreamPtr stream) { + void set(Device device, std::shared_ptr stream) { throwIfStream(); device_ = std::move(device); stream_ = std::move(stream); @@ -39,16 +39,16 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { Device const& device() const { return *device_; } - const SharedStreamPtr& streamPtr() const { + const std::shared_ptr& streamPtr() const { throwIfNoStream(); return stream_; } - SharedStreamPtr releaseStreamPtr() { + std::shared_ptr releaseStreamPtr() { throwIfNoStream(); // This function needs to effectively reset stream_ (i.e. stream_ // must be empty after this function). This behavior ensures that - // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the std::shared_ptr is not held for inadvertently long (i.e. to // the next event), and is checked at run time. return std::move(stream_); } @@ -56,7 +56,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { void throwIfStream() const; void throwIfNoStream() const; - SharedStreamPtr stream_; + std::shared_ptr stream_; // Work around no default constructor for Device // On the other hand, we don't strictly need the Device here...
std::optional device_; diff --git a/src/alpaka/AlpakaCore/ESProduct.h b/src/alpaka/AlpakaCore/ESProduct.h index f6a134e11..3dfb1a682 100644 --- a/src/alpaka/AlpakaCore/ESProduct.h +++ b/src/alpaka/AlpakaCore/ESProduct.h @@ -8,7 +8,6 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/EventCache.h" -#include "AlpakaCore/SharedEventPtr.h" #include "AlpakaCore/currentDevice.h" #include "AlpakaCore/deviceCount.h" #include "AlpakaCore/eventWorkHasCompleted.h" @@ -18,11 +17,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template class ESProduct { public: + using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event; + template ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = - ::cms::alpakatools::getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(acc); + gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache().get(acc); } } @@ -92,7 +92,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: struct Item { mutable std::mutex m_mutex; - mutable SharedEventPtr m_event; // guarded by m_mutex + mutable std::shared_ptr m_event; // guarded by m_mutex // non-null if some thread is already filling (cudaStream_t is just a pointer) mutable ::ALPAKA_ACCELERATOR_NAMESPACE::Queue* m_fillingStream = nullptr; // guarded by m_mutex mutable std::atomic m_filled = false; // easy check if data has been filled already or not diff --git a/src/alpaka/AlpakaCore/Product.h b/src/alpaka/AlpakaCore/Product.h index 662f8bb09..a14a2fad6 100644 --- a/src/alpaka/AlpakaCore/Product.h +++ b/src/alpaka/AlpakaCore/Product.h @@ -46,11 +46,11 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class edm::Wrapper>; - explicit Product(SharedStreamPtr stream, SharedEventPtr event, T data) + explicit Product(std::shared_ptr stream, std::shared_ptr event, T data) 
: ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {} template - explicit Product(SharedStreamPtr stream, SharedEventPtr event, Args&&... args) + explicit Product(std::shared_ptr stream, std::shared_ptr event, Args&&... args) : ProductBase(std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h index e6bc08ed5..9780ed37a 100644 --- a/src/alpaka/AlpakaCore/ProductBase.h +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -4,8 +4,9 @@ #include #include -#include "AlpakaCore/SharedEventPtr.h" -#include "AlpakaCore/SharedStreamPtr.h" +#include + +#include "AlpakaCore/alpakaConfigAcc.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@ -19,6 +20,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ProductBase { public: + using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event; + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + ProductBase() = default; // Needed only for ROOT dictionary generation ~ProductBase(); @@ -38,22 +42,22 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { bool isValid() const { return stream_.get() != nullptr; } bool isAvailable() const; - ::ALPAKA_ACCELERATOR_NAMESPACE::Device device() const { return alpaka::getDev(stream()); } + alpaka::Dev device() const { return alpaka::getDev(stream()); } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } + Queue& stream() const { return *(stream_.get()); } // cudaEvent_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself // would be const. 
Therefore it is ok to return a non-const // pointer from a const method here. - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>& event() const { return *(event_.get()); } + Event& event() const { return *(event_.get()); } protected: - explicit ProductBase(SharedStreamPtr stream, SharedEventPtr event) + explicit ProductBase(std::shared_ptr stream, std::shared_ptr event) : stream_{std::move(stream)}, event_{std::move(event)} {} private: @@ -61,7 +65,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; // The following function is intended to be used only from ScopedContext - const SharedStreamPtr& streamPtr() const { return stream_; } + const std::shared_ptr& streamPtr() const { return stream_; } bool mayReuseStream() const { bool expected = true; @@ -73,9 +77,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // The cudaStream_t is really shared among edm::Event products, so // using shared_ptr also here - SharedStreamPtr stream_; //! + std::shared_ptr stream_; //! // shared_ptr because of caching in ::cms::alpakatools::EventCache - SharedEventPtr event_; //! + std::shared_ptr event_; //! // This flag tells whether the CUDA stream may be reused by a // consumer or not. 
The goal is to have a "chain" of modules to diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index 5ff26540a..51f45e5d4 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -7,8 +7,6 @@ #include "AlpakaCore/ContextState.h" #include "AlpakaCore/EventCache.h" #include "AlpakaCore/Product.h" -#include "AlpakaCore/SharedEventPtr.h" -#include "AlpakaCore/SharedStreamPtr.h" #include "Framework/EDGetToken.h" #include "Framework/EDPutToken.h" #include "Framework/Event.h" @@ -27,6 +25,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextBase { public: using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; Device const& device() const { return currentDevice_; } @@ -34,8 +33,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // mutable access is needed even if the ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. 
- ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } - const SharedStreamPtr& streamPtr() const { return stream_; } + Queue& stream() const { return *(stream_.get()); } + const std::shared_ptr& streamPtr() const { return stream_; } protected: // The constructors set the current device, but the device @@ -49,21 +48,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { if (data.mayReuseStream()) { stream_ = data.streamPtr(); } else { - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(currentDevice_); + stream_ = getStreamCache().get(currentDevice_); } } - explicit ScopedContextBase(Device device, SharedStreamPtr stream) + explicit ScopedContextBase(Device device, std::shared_ptr stream) : currentDevice_(std::move(device)), stream_(std::move(stream)) {} explicit ScopedContextBase(edm::StreamID streamID) : currentDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID)) { - stream_ = getStreamCache<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>().get(currentDevice_); + stream_ = getStreamCache().get(currentDevice_); } private: Device const currentDevice_; - SharedStreamPtr stream_; + std::shared_ptr stream_; }; class ScopedContextGetterBase : public ScopedContextBase { @@ -84,9 +83,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) 
{} void synchronizeStreams(Device const& dataDevice, - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, + Queue& dataStream, bool available, - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent); + alpaka::Event dataEvent); }; class ScopedContextHolderHelper { @@ -101,7 +100,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { waitingTaskHolder_ = std::move(waitingTaskHolder); } - void enqueueCallback(::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream); + void enqueueCallback(ScopedContextBase::Queue& stream); private: edm::WaitingTaskWithArenaHolder waitingTaskHolder_; @@ -192,7 +191,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: friend class ::ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest::TestScopedContext; - explicit ScopedContextProduce(Device device, SharedStreamPtr stream) + explicit ScopedContextProduce(Device device, std::shared_ptr stream) : ScopedContextGetterBase(std::move(device), std::move(stream)) {} auto getEvent() { return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(device()); } diff --git a/src/alpaka/AlpakaCore/SharedEventPtr.h b/src/alpaka/AlpakaCore/SharedEventPtr.h deleted file mode 100644 index 3582a928c..000000000 --- a/src/alpaka/AlpakaCore/SharedEventPtr.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h -#define HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h - -#include -#include - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - using SharedEventPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Event>; - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif diff --git a/src/alpaka/AlpakaCore/SharedStreamPtr.h b/src/alpaka/AlpakaCore/SharedStreamPtr.h deleted file mode 100644 index 38909af5a..000000000 --- a/src/alpaka/AlpakaCore/SharedStreamPtr.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h -#define 
HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h - -#include -#include - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - using SharedStreamPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>; - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif diff --git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc index a7490563c..816ce50df 100644 --- a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc +++ b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc @@ -5,9 +5,9 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { namespace impl { void ScopedContextGetterBase::synchronizeStreams(Device const& dataDevice, - ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, + Queue& dataStream, bool available, - alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent) { + alpaka::Event dataEvent) { if (dataDevice != device()) { // Eventually replace with prefetch to current device (assuming unified memory works) // If we won't go to unified memory, need to figure out something else... @@ -27,7 +27,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } } - void ScopedContextHolderHelper::enqueueCallback(::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream) { + void ScopedContextHolderHelper::enqueueCallback(ScopedContextBase::Queue& stream) { alpaka::enqueue(stream, [holder = waitingTaskHolder_]() { // TODO: The functor is required to be const, so can't use // 'mutable', so I'm copying the object as a workaround. 
I From 6e8e215f04cb67b1ab96b0aa7de08cd54d082016 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 26 Oct 2021 10:42:44 +0200 Subject: [PATCH 3/3] [alpaka] Read the device from the queue instead of storing it explicitly --- src/alpaka/AlpakaCore/ContextState.h | 11 ++---- src/alpaka/AlpakaCore/ProductBase.h | 2 +- src/alpaka/AlpakaCore/ScopedContext.h | 39 +++++++------------ src/alpaka/AlpakaCore/StreamCache.h | 2 +- src/alpaka/AlpakaCore/alpaka/ScopedContext.cc | 20 +++++----- 5 files changed, 28 insertions(+), 46 deletions(-) diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h index b85d76286..a1efa8913 100644 --- a/src/alpaka/AlpakaCore/ContextState.h +++ b/src/alpaka/AlpakaCore/ContextState.h @@ -2,7 +2,6 @@ #define HeterogeneousCore_AlpakaCore_ContextState_h #include -#include #include "AlpakaCore/alpakaConfig.h" @@ -15,8 +14,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ContextState { public: - using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Device = alpaka::Dev; ContextState() = default; ~ContextState() = default; @@ -31,13 +30,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { friend class ScopedContextProduce; friend class ScopedContextTask; - void set(Device device, std::shared_ptr stream) { + void set(std::shared_ptr stream) { throwIfStream(); - device_ = std::move(device); stream_ = std::move(stream); } - Device const& device() const { return *device_; } + Device device() const { return alpaka::getDev(*stream_); } const std::shared_ptr& streamPtr() const { throwIfNoStream(); @@ -57,9 +55,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { void throwIfNoStream() const; std::shared_ptr stream_; - // Work around no default constructor for Device - // On the other hand, we don't strictly need the Device here... 
- std::optional device_; }; } // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h index 9780ed37a..63b071a3a 100644 --- a/src/alpaka/AlpakaCore/ProductBase.h +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -20,8 +20,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { */ class ProductBase { public: - using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event; using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Event = alpaka::Event; ProductBase() = default; // Needed only for ROOT dictionary generation ~ProductBase(); diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index 51f45e5d4..a146422b2 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -24,10 +24,10 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // This class is intended to be derived by other ScopedContext*, not for general use class ScopedContextBase { public: - using Device = ::ALPAKA_ACCELERATOR_NAMESPACE::Device; using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Device = alpaka::Dev; - Device const& device() const { return currentDevice_; } + Device device() const { return alpaka::getDev(*stream_); } // cudaStream_t is a pointer to a thread-safe object, for which a // mutable access is needed even if the ScopedContext itself @@ -44,24 +44,16 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // the scope where this context is. The current device doesn't // really matter between modules (or across TBB tasks). - ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) { - if (data.mayReuseStream()) { - stream_ = data.streamPtr(); - } else { - stream_ = getStreamCache().get(currentDevice_); - } - } + ScopedContextBase(const ProductBase& data) + : stream_{data.mayReuseStream() ? 
data.streamPtr() : getStreamCache().get(data.device())} {} - explicit ScopedContextBase(Device device, std::shared_ptr stream) - : currentDevice_(std::move(device)), stream_(std::move(stream)) {} + explicit ScopedContextBase(std::shared_ptr stream) : stream_(std::move(stream)) {} explicit ScopedContextBase(edm::StreamID streamID) - : currentDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID)) { - stream_ = getStreamCache().get(currentDevice_); - } + : stream_{getStreamCache().get( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID))} {} private: - Device const currentDevice_; std::shared_ptr stream_; }; @@ -69,7 +61,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { public: template const T& get(const Product& data) { - synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + synchronizeStreams(data.stream(), data.isAvailable(), data.event()); return data.data_; } @@ -82,10 +74,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) 
{} - void synchronizeStreams(Device const& dataDevice, - Queue& dataStream, - bool available, - alpaka::Event dataEvent); + void synchronizeStreams(Queue& dataStream, bool available, alpaka::Event dataEvent); }; class ScopedContextHolderHelper { @@ -165,8 +154,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { class ScopedContextProduce : public impl::ScopedContextGetterBase { public: /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) - : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + explicit ScopedContextProduce(ContextState& state) : ScopedContextGetterBase(state.releaseStreamPtr()) {} explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} @@ -183,7 +171,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - // return iEvent.emplace(token, device(), streamPtr(), getEvent(acc), std::forward(args)...); + // return iEvent.emplace(token, streamPtr(), getEvent(acc), std::forward(args)...); return iEvent.emplace(token, std::forward(args)...); // TODO } @@ -191,8 +179,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: friend class ::ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest::TestScopedContext; - explicit ScopedContextProduce(Device device, std::shared_ptr stream) - : ScopedContextGetterBase(std::move(device), std::move(stream)) {} + explicit ScopedContextProduce(std::shared_ptr stream) : ScopedContextGetterBase(std::move(stream)) {} auto getEvent() { return getEventCache<::ALPAKA_ACCELERATOR_NAMESPACE::Event>().get(device()); } @@ -209,7 +196,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { public: /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) - : 
ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + : ScopedContextBase(state->streamPtr()), // don't move, state is re-used afterwards holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {} diff --git a/src/alpaka/AlpakaCore/StreamCache.h b/src/alpaka/AlpakaCore/StreamCache.h index e24f7dfb9..22b93a312 100644 --- a/src/alpaka/AlpakaCore/StreamCache.h +++ b/src/alpaka/AlpakaCore/StreamCache.h @@ -22,7 +22,7 @@ namespace cms::alpakatools { // Gets a (cached) CUDA stream for the current device. The stream // will be returned to the cache by the shared_ptr destructor. // This function is thread safe - ALPAKA_FN_HOST std::shared_ptr get(Device dev) { + ALPAKA_FN_HOST std::shared_ptr get(Device const& dev) { return cache_[cms::alpakatools::getDevIndex(dev)].makeOrGet([dev]() { return std::make_unique(dev); }); } diff --git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc index 816ce50df..2944d9638 100644 --- a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc +++ b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc @@ -4,18 +4,18 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { namespace impl { - void ScopedContextGetterBase::synchronizeStreams(Device const& dataDevice, - Queue& dataStream, + void ScopedContextGetterBase::synchronizeStreams(Queue& dataStream, bool available, alpaka::Event dataEvent) { - if (dataDevice != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... 
- throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - if (dataStream != stream()) { - // Different streams, need to synchronize + // Different streams, check if the underlying device is the same + if (alpaka::getDev(dataStream) != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + // Synchronize the two streams if (not available) { // Event not yet occurred, so need to add synchronization // here. Sychronization is done by making the CUDA stream to @@ -43,7 +43,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ScopedContextAcquire::~ScopedContextAcquire() { holderHelper_.enqueueCallback(stream()); if (contextState_) { - contextState_->set(device(), streamPtr()); + contextState_->set(streamPtr()); } }