diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h index a1efa8913..771ce6ad9 100644 --- a/src/alpaka/AlpakaCore/ContextState.h +++ b/src/alpaka/AlpakaCore/ContextState.h @@ -3,18 +3,28 @@ #include -#include "AlpakaCore/alpakaConfig.h" +#include -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { +namespace cms::alpakatools { + + template + class ScopedContextAcquire; + + template + class ScopedContextProduce; + + template + class ScopedContextTask; /** * The purpose of this class is to deliver the device and CUDA stream * information from ExternalWork's acquire() to producer() via a * member/StreamCache variable. */ + template class ContextState { public: - using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Queue = TQueue; using Device = alpaka::Dev; ContextState() = default; @@ -26,18 +36,26 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ContextState& operator=(ContextState&& other) = delete; private: - friend class ScopedContextAcquire; - friend class ScopedContextProduce; - friend class ScopedContextTask; + friend class ScopedContextAcquire; + friend class ScopedContextProduce; + friend class ScopedContextTask; void set(std::shared_ptr stream) { throwIfStream(); stream_ = std::move(stream); } - Device device() const { return alpaka::getDev(*stream_); } + Device device() const { + throwIfNoStream(); + return alpaka::getDev(*stream_); + } - const std::shared_ptr& streamPtr() const { + Queue stream() const { + throwIfNoStream(); + return *stream_; + } + + std::shared_ptr const& streamPtr() const { throwIfNoStream(); return stream_; } @@ -51,12 +69,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { return std::move(stream_); } - void throwIfStream() const; - void throwIfNoStream() const; + void throwIfStream() const { + if (stream_) { + throw std::runtime_error("Trying to set ContextState, but it already had a valid state"); + } + } + + void throwIfNoStream() const { + if (not 
stream_) { + throw std::runtime_error("Trying to get ContextState, but it did not have a valid state"); + } + } std::shared_ptr stream_; }; -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE +} // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaCore_ContextState_h diff --git a/src/alpaka/AlpakaCore/ESProduct.h b/src/alpaka/AlpakaCore/ESProduct.h index 3dfb1a682..c478cabbb 100644 --- a/src/alpaka/AlpakaCore/ESProduct.h +++ b/src/alpaka/AlpakaCore/ESProduct.h @@ -8,8 +8,6 @@ #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/EventCache.h" -#include "AlpakaCore/currentDevice.h" -#include "AlpakaCore/deviceCount.h" #include "AlpakaCore/eventWorkHasCompleted.h" namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { @@ -17,12 +15,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { template class ESProduct { public: + using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event; - template - ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) { + ESProduct() : gpuDataPerDevice_(::ALPAKA_ACCELERATOR_NAMESPACE::devices.size()) { for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { - gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache().get(acc); + gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache().get(::ALPAKA_ACCELERATOR_NAMESPACE::devices[i]); } } @@ -32,8 +30,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // which enqueues asynchronous transfers (possibly kernels as well) // to the CUDA stream template - const T& dataForCurrentDeviceAsync(::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue, F transferAsync) const { - auto device = currentDevice(); + const T& dataForDeviceAsync(Queue queue, F transferAsync) const { + auto device = cms::alpakatools::getDevIndex(alpaka::getDev(queue)); auto& data = gpuDataPerDevice_[device]; // If GPU data has already been filled, we can return it diff 
--git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 3cb2918f9..314352e75 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -56,8 +56,8 @@ namespace cms { ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero( Histo *__restrict__ h, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { uint32_t *poff = (uint32_t *)(char *)(&(h->off)); - auto histoOffView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( - poff, Histo::totbins()); + auto histoOffView = + cms::alpakatools::createDeviceView(alpaka::getDev(queue), poff, Histo::totbins()); alpaka::memset(queue, histoOffView, 0, Histo::totbins()); alpaka::wait(queue); @@ -77,16 +77,21 @@ namespace cms { const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::enqueue(queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( - workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep(), poff, poff, num_items)); - - const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( - Vec1D::all(1), threadsPerBlockOrElementsPerThread); alpaka::enqueue( queue, alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( - workDivWith1Block, ::cms::alpakatools::multiBlockPrefixScanSecondStep(), poff, poff, num_items, nblocks)); + workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep(), poff, poff, num_items)); + + const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(1), threadsPerBlockOrElementsPerThread); + alpaka::enqueue(queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDivWith1Block, + ::cms::alpakatools::multiBlockPrefixScanSecondStep(), + poff, + poff, + num_items, + nblocks)); } template @@ -106,14 +111,14 @@ namespace cms { const WorkDiv1D &workDiv 
= ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::enqueue( - queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, countFromVector(), h, nh, v, offsets)); + alpaka::enqueue(queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, countFromVector(), h, nh, v, offsets)); launchFinalize(h, queue); - alpaka::enqueue( - queue, - alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, fillFromVector(), h, nh, v, offsets)); + alpaka::enqueue(queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, fillFromVector(), h, nh, v, offsets)); } struct finalizeBulk { diff --git a/src/alpaka/AlpakaCore/Product.h b/src/alpaka/AlpakaCore/Product.h index a14a2fad6..7fbd6d19d 100644 --- a/src/alpaka/AlpakaCore/Product.h +++ b/src/alpaka/AlpakaCore/Product.h @@ -11,9 +11,10 @@ namespace edm { class Wrapper; } -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { +namespace cms::alpakatools { namespace impl { + template class ScopedContextGetterBase; } @@ -21,7 +22,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * The purpose of this class is to wrap CUDA data to edm::Event in a * way which forces correct use of various utilities. * - * The non-default construction has to be done with ::cms::alpakatools::ScopedContext + * The non-default construction has to be done with ScopedContext * (in order to properly register the CUDA event). * * The default constructor is needed only for the ROOT dictionary generation. @@ -31,9 +32,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * it. Here is a somewhat natural place. If overhead is too much, we * can use them only where synchronization between streams is needed. 
*/ - template - class Product : public ProductBase { + template + class Product : public ProductBase { public: + using Queue = TQueue; + using Event = alpaka::Event; + Product() = default; // Needed only for ROOT dictionary generation Product(const Product&) = delete; @@ -42,20 +46,20 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { Product& operator=(Product&&) = default; private: - friend class impl::ScopedContextGetterBase; - friend class ScopedContextProduce; - friend class edm::Wrapper>; + friend class impl::ScopedContextGetterBase; + friend class ScopedContextProduce; + friend class edm::Wrapper>; explicit Product(std::shared_ptr stream, std::shared_ptr event, T data) - : ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {} + : ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {} template explicit Product(std::shared_ptr stream, std::shared_ptr event, Args&&... args) - : ProductBase(std::move(stream), std::move(event)), data_(std::forward(args)...) {} + : ProductBase(std::move(stream), std::move(event)), data_(std::forward(args)...) {} T data_; //! }; -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE +} // namespace cms::alpakatools #endif // AlpakaDataFormats_Common_Product_h diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h index 63b071a3a..bdd7a5eb6 100644 --- a/src/alpaka/AlpakaCore/ProductBase.h +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -6,25 +6,43 @@ #include -#include "AlpakaCore/alpakaConfigAcc.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { +namespace cms::alpakatools { namespace impl { + template class ScopedContextBase; } + template + class ScopedContextProduce; + /** - * Base class for all instantiations of CUDA to hold the + * Base class for all instantiations of Product to hold the * non-T-dependent members. 
*/ + template class ProductBase { public: - using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Queue = TQueue; using Event = alpaka::Event; + using Device = alpaka::Dev; ProductBase() = default; // Needed only for ROOT dictionary generation - ~ProductBase(); + + ~ProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + + // FIXME: this may throw an exception if the underlying call fails. + if (event_) { + alpaka::wait(*event_); + } + } ProductBase(const ProductBase&) = delete; ProductBase& operator=(const ProductBase&) = delete; @@ -40,18 +58,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } bool isValid() const { return stream_.get() != nullptr; } - bool isAvailable() const; - alpaka::Dev device() const { return alpaka::getDev(stream()); } + bool isAvailable() const { + // if default-constructed, the product is not available + if (not event_) { + return false; + } + return eventWorkHasCompleted(*(event_.get())); + } + + Device device() const { return alpaka::getDev(stream()); } // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself + // mutable access is needed even if the ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. Queue& stream() const { return *(stream_.get()); } // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself + // mutable access is needed even if the ScopedContext itself // would be const. 
Therefore it is ok to return a non-const // pointer from a const method here. Event& event() const { return *(event_.get()); } @@ -61,8 +86,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { : stream_{std::move(stream)}, event_{std::move(event)} {} private: - friend class impl::ScopedContextBase; - friend class ScopedContextProduce; + friend class impl::ScopedContextBase; + friend class ScopedContextProduce; // The following function is intended to be used only from ScopedContext const std::shared_ptr& streamPtr() const { return stream_; } @@ -78,7 +103,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // The cudaStream_t is really shared among edm::Event products, so // using shared_ptr also here std::shared_ptr stream_; //! - // shared_ptr because of caching in ::cms::alpakatools::EventCache + // shared_ptr because of caching in EventCache std::shared_ptr event_; //! // This flag tells whether the CUDA stream may be reused by a @@ -87,6 +112,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { mutable std::atomic mayReuseStream_ = true; //! 
}; -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE +} // namespace cms::alpakatools #endif // AlpakaDataFormats_Common_ProductBase_h diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index a146422b2..639dcb972 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -18,13 +18,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest { class TestScopedContext; } -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { +namespace cms::alpakatools { namespace impl { // This class is intended to be derived by other ScopedContext*, not for general use + template class ScopedContextBase { public: - using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue; + using Queue = TQueue; using Device = alpaka::Dev; Device device() const { return alpaka::getDev(*stream_); } @@ -33,7 +34,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // mutable access is needed even if the ScopedContext itself // would be const. Therefore it is ok to return a non-const // pointer from a const method here. - Queue& stream() const { return *(stream_.get()); } + Queue& stream() const { return *stream_; } const std::shared_ptr& streamPtr() const { return stream_; } protected: @@ -44,7 +45,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // the scope where this context is. The current device doesn't // really matter between modules (or across TBB tasks). - ScopedContextBase(const ProductBase& data) + ScopedContextBase(ProductBase const& data) : stream_{data.mayReuseStream() ? 
data.streamPtr() : getStreamCache().get(data.device())} {} explicit ScopedContextBase(std::shared_ptr stream) : stream_(std::move(stream)) {} @@ -57,24 +58,46 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { std::shared_ptr stream_; }; - class ScopedContextGetterBase : public ScopedContextBase { + template + class ScopedContextGetterBase : public ScopedContextBase { public: + using Queue = TQueue; + template - const T& get(const Product& data) { - synchronizeStreams(data.stream(), data.isAvailable(), data.event()); + const T& get(Product const& data) { + synchronizeStreams(data); return data.data_; } template - const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { return get(iEvent.get(token)); } protected: template - ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} - - void synchronizeStreams(Queue& dataStream, bool available, alpaka::Event dataEvent); + ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(ProductBase const& data) { + // If the product has been enqueued to a different queue, make sure that it is available before accessing it + if (data.stream() != this->stream()) { + // Different streams, check if the underlying device is the same + if (data.device() != this->device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + // If the data product is not yet available, synchronize the two streams + if (not data.isAvailable()) { + // Event not yet occurred, so need to add synchronization + // here. 
Synchronization is done by making the current queue + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + alpaka::wait(this->stream(), data.event()); + } + } + } }; class ScopedContextHolderHelper { @@ -82,14 +105,30 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) : waitingTaskHolder_{std::move(waitingTaskHolder)} {} - template - void pushNextTask(F&& f, ContextState const* state); + template + void pushNextTask(F&& f, ContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); + } void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { waitingTaskHolder_ = std::move(waitingTaskHolder); } - void enqueueCallback(ScopedContextBase::Queue& stream); + template + void enqueueCallback(TQueue& stream) { + alpaka::enqueue(stream, [holder = waitingTaskHolder_]() { + // TODO: The functor is required to be const, so can't use + // 'mutable', so I'm copying the object as a workaround. I + // wonder if there are any wider implications. + auto h = holder; + h.doneWaiting(nullptr); + }); + } private: edm::WaitingTaskWithArenaHolder waitingTaskHolder_; @@ -103,8 +142,14 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * - synchronizing between CUDA streams if necessary * and enforce that those get done in a proper way in RAII fashion. 
*/ - class ScopedContextAcquire : public impl::ScopedContextGetterBase { + template + class ScopedContextAcquire : public impl::ScopedContextGetterBase { public: + using Queue = TQueue; + using ScopedContextGetterBase = impl::ScopedContextGetterBase; + using ScopedContextGetterBase::stream; + using ScopedContextGetterBase::streamPtr; + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} @@ -112,20 +157,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { // /// Constructor to create a new CUDA stream, and the context is needed after acquire() explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) + ContextState& state) : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} // /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) - explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + explicit ScopedContextAcquire(ProductBase const& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} // /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() - explicit ScopedContextAcquire(const ProductBase& data, + explicit ScopedContextAcquire(ProductBase const& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder, - ContextState& state) + ContextState& state) : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} - ~ScopedContextAcquire(); + ~ScopedContextAcquire() { + holderHelper_.enqueueCallback(stream()); + if (contextState_) { + contextState_->set(streamPtr()); + } 
+ } template void pushNextTask(F&& f) { @@ -139,10 +189,14 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { } private: - void throwNoState(); + void throwNoState() { + throw std::runtime_error( + "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " + "ContextState, but that was not the case"); + } impl::ScopedContextHolderHelper holderHelper_; - ContextState* contextState_ = nullptr; + ContextState* contextState_ = nullptr; }; /** @@ -151,22 +205,35 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * - synchronizing between CUDA streams if necessary * and enforce that those get done in a proper way in RAII fashion. */ - class ScopedContextProduce : public impl::ScopedContextGetterBase { + template + class ScopedContextProduce : public impl::ScopedContextGetterBase { public: + using Queue = TQueue; + using ScopedContextGetterBase = impl::ScopedContextGetterBase; + using ScopedContextGetterBase::device; + using ScopedContextGetterBase::streamPtr; + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextProduce(ContextState& state) : ScopedContextGetterBase(state.releaseStreamPtr()) {} + explicit ScopedContextProduce(ContextState& state) : ScopedContextGetterBase(state.releaseStreamPtr()) {} - explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {} + explicit ScopedContextProduce(ProductBase const& data) : ScopedContextGetterBase(data) {} explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {} /// Record the CUDA event, all asynchronous work must have been queued before the destructor - ~ScopedContextProduce(); + ~ScopedContextProduce() { + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. 
+ //TODO + //cudaEventRecord(event_.get(), stream()); + //alpaka::enqueue(stream(), getEvent(::ALPAKA_ACCELERATOR_NAMESPACE::Device).get()); + } template - std::unique_ptr> wrap(T data) { + std::unique_ptr> wrap(T data) { // make_unique doesn't work because of private constructor - return std::unique_ptr>(new Product(streamPtr(), std::move(data))); + return std::unique_ptr>(new Product(streamPtr(), std::move(data))); } template @@ -192,15 +259,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary * and enforce that those get done in a proper way in RAII fashion. */ - class ScopedContextTask : public impl::ScopedContextBase { + template + class ScopedContextTask : public impl::ScopedContextBase { public: + using Queue = TQueue; + using ScopedContextBase = impl::ScopedContextBase; + using ScopedContextBase::stream; + using ScopedContextBase::streamPtr; + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) - explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) : ScopedContextBase(state->streamPtr()), // don't move, state is re-used afterwards holderHelper_{std::move(waitingTaskHolder)}, contextState_{state} {} - ~ScopedContextTask(); + ~ScopedContextTask() { holderHelper_.enqueueCallback(stream()); } template void pushNextTask(F&& f) { @@ -213,7 +286,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { private: impl::ScopedContextHolderHelper holderHelper_; - ContextState const* contextState_; + ContextState const* contextState_; }; /** @@ -222,24 +295,18 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { * - synchronizing between CUDA streams if necessary * and enforce that those get done in a proper way in RAII fashion. 
*/ - class ScopedContextAnalyze : public impl::ScopedContextGetterBase { + template + class ScopedContextAnalyze : public impl::ScopedContextGetterBase { public: + using Queue = TQueue; + using ScopedContextGetterBase = impl::ScopedContextGetterBase; + using ScopedContextGetterBase::stream; + using ScopedContextGetterBase::streamPtr; + /// Constructor to (possibly) re-use a CUDA stream - explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {} + explicit ScopedContextAnalyze(ProductBase const& data) : ScopedContextGetterBase(data) {} }; - namespace impl { - template - void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(tbb::task::allocate_root(), - std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); - } - } // namespace impl - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE +} // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaCore_ScopedContext_h diff --git a/src/alpaka/AlpakaCore/ScopedSetDevice.h b/src/alpaka/AlpakaCore/ScopedSetDevice.h deleted file mode 100644 index f2560ddd3..000000000 --- a/src/alpaka/AlpakaCore/ScopedSetDevice.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef HeterogeneousCore_AlpakaUtilities_ScopedSetDevice_h -#define HeterogeneousCore_AlpakaUtilities_ScopedSetDevice_h - -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -#include -#endif - -namespace cms::alpakatools { - - class ScopedSetDevice { - public: - explicit ScopedSetDevice(int newDevice) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaGetDevice(&prevDevice_); - cudaSetDevice(newDevice); -#endif - } - - ~ScopedSetDevice() { - // Intentionally don't check the return value to avoid - // exceptions to be thrown. If this call fails, the process is - // doomed anyway. 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaSetDevice(prevDevice_); -#endif - } - - private: - int prevDevice_; - }; - -} // namespace cms::alpakatools - -#endif diff --git a/src/alpaka/AlpakaCore/alpaka/ContextState.cc b/src/alpaka/AlpakaCore/alpaka/ContextState.cc deleted file mode 100644 index 553289692..000000000 --- a/src/alpaka/AlpakaCore/alpaka/ContextState.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include - -#include "AlpakaCore/ContextState.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - void ContextState::throwIfStream() const { - if (stream_) { - throw std::runtime_error("Trying to set ContextState, but it already had a valid state"); - } - } - - void ContextState::throwIfNoStream() const { - if (not stream_) { - throw std::runtime_error("Trying to get ContextState, but it did not have a valid state"); - } - } - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/ProductBase.cc b/src/alpaka/AlpakaCore/alpaka/ProductBase.cc deleted file mode 100644 index 9470cb732..000000000 --- a/src/alpaka/AlpakaCore/alpaka/ProductBase.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "AlpakaCore/ProductBase.h" -#include "AlpakaCore/eventWorkHasCompleted.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - bool ProductBase::isAvailable() const { - // if default-constructed, the product is not available - if (not event_) { - return false; - } - return eventWorkHasCompleted(*(event_.get())); - } - - ProductBase::~ProductBase() { - // Make sure that the production of the product in the GPU is - // complete before destructing the product. This is to make sure - // that the EDM stream does not move to the next event before all - // asynchronous processing of the current is complete. - - // TODO: a callback notifying a WaitingTaskHolder (or similar) - // would avoid blocking the CPU, but would also require more work. 
- // - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - if (event_) { - alpaka::wait(*(event_.get())); - } - } - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc deleted file mode 100644 index 2944d9638..000000000 --- a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include "AlpakaCore/alpakaConfig.h" -#include "AlpakaCore/ScopedContext.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - namespace impl { - void ScopedContextGetterBase::synchronizeStreams(Queue& dataStream, - bool available, - alpaka::Event dataEvent) { - if (dataStream != stream()) { - // Different streams, check if the underlying device is the same - if (alpaka::getDev(dataStream) != device()) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to figure out something else... - throw std::runtime_error("Handling data from multiple devices is not yet supported"); - } - - // Synchronize the two streams - if (not available) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - alpaka::wait(stream(), dataEvent); - } - } - } - - void ScopedContextHolderHelper::enqueueCallback(ScopedContextBase::Queue& stream) { - alpaka::enqueue(stream, [holder = waitingTaskHolder_]() { - // TODO: The functor is required to be const, so can't use - // 'mutable', so I'm copying the object as a workaround. I - // wonder if there are any wider implications. 
- auto h = holder; - h.doneWaiting(nullptr); - }); - } - } // namespace impl - - //////////////////// - - ScopedContextAcquire::~ScopedContextAcquire() { - holderHelper_.enqueueCallback(stream()); - if (contextState_) { - contextState_->set(streamPtr()); - } - } - - void ScopedContextAcquire::throwNoState() { - throw std::runtime_error( - "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " - "ContextState, but that was not the case"); - } - - //////////////////// - - ScopedContextProduce::~ScopedContextProduce() { - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - //cudaEventRecord(event_.get(), stream()); - //alpaka::enqueue(stream(), getEvent(::ALPAKA_ACCELERATOR_NAMESPACE::Device).get()); - //TODO - } - - //////////////////// - - ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(stream()); } - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/deviceCount.cc b/src/alpaka/AlpakaCore/alpaka/deviceCount.cc deleted file mode 100644 index 001b1e817..000000000 --- a/src/alpaka/AlpakaCore/alpaka/deviceCount.cc +++ /dev/null @@ -1,14 +0,0 @@ -#include "AlpakaCore/alpakaConfig.h" -#include "AlpakaCore/deviceCount.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - int deviceCount() { - int ndevices = 1; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - ndevices = alpaka::getDevCount<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(); -#endif - return ndevices; - } - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h index f0c874327..9219b11a8 100644 --- a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h +++ b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h @@ -4,37 +4,36 @@ #include "AlpakaCore/alpakaConfig.h" #include 
"AlpakaCore/alpakaDevices.h" -using namespace alpaka_common; +namespace cms::alpakatools { -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + // for Extent, Dim1D, Idx + using namespace alpaka_common; template - auto allocHostBuf(const Extent& extent) { + auto allocHostBuf(Extent extent) { return alpaka::allocBuf(host, extent); } template - auto createHostView(TData* data, const Extent& extent) { + auto createHostView(TData* data, Extent extent) { return alpaka::ViewPlainPtr(data, host, extent); } - template - auto allocDeviceBuf(const Extent& extent) { - return alpaka::allocBuf(::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], extent); + template + auto allocDeviceBuf(TDevice const& device, Extent extent) { + return alpaka::allocBuf(device, extent); } - template - auto createDeviceView(const TData* data, const Extent& extent) { - return alpaka::ViewPlainPtr<::ALPAKA_ACCELERATOR_NAMESPACE::Device, const TData, Dim1D, Idx>( - data, ::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], extent); + template + auto createDeviceView(TDevice const& device, TData const* data, Extent extent) { + return alpaka::ViewPlainPtr(data, device, extent); } - template - auto createDeviceView(TData* data, const Extent& extent) { - return alpaka::ViewPlainPtr<::ALPAKA_ACCELERATOR_NAMESPACE::Device, TData, Dim1D, Idx>( - data, ::ALPAKA_ACCELERATOR_NAMESPACE::devices[0], extent); + template + auto createDeviceView(TDevice const& device, TData* data, Extent extent) { + return alpaka::ViewPlainPtr(data, device, extent); } -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE +} // namespace cms::alpakatools #endif // AlpakaCore_alpakaMemoryHelper_h diff --git a/src/alpaka/AlpakaCore/currentDevice.h b/src/alpaka/AlpakaCore/currentDevice.h deleted file mode 100644 index 8141214a9..000000000 --- a/src/alpaka/AlpakaCore/currentDevice.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef HeterogenousCore_AlpakaUtilities_currentDevice_h -#define HeterogenousCore_AlpakaUtilities_currentDevice_h - -#ifdef 
ALPAKA_ACC_GPU_CUDA_ENABLED -#include -#endif - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - inline int currentDevice() { - int dev = 0; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - cudaGetDevice(&dev); -#endif - return dev; - } - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif diff --git a/src/alpaka/AlpakaCore/deviceCount.h b/src/alpaka/AlpakaCore/deviceCount.h deleted file mode 100644 index 69da217ef..000000000 --- a/src/alpaka/AlpakaCore/deviceCount.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef HeterogenousCore_AlpakaUtilities_deviceCount_h -#define HeterogenousCore_AlpakaUtilities_deviceCount_h - -#include "AlpakaCore/alpakaConfig.h" - -namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - - int deviceCount(); - -} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE - -#endif // HeterogenousCore_AlpakaUtilities_deviceCount_h diff --git a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h index 5ae141f67..3012f4399 100644 --- a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h @@ -13,8 +13,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { BeamSpotAlpaka() = default; BeamSpotAlpaka(BeamSpotPOD const* data, Queue& queue) - : data_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)} { - auto data_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(data, 1u)}; + : data_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)} { + auto data_h{cms::alpakatools::createHostView(data, 1u)}; alpaka::memcpy(queue, data_d, data_h, 1u); // alpaka::wait(queue); diff --git a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h index 0b7261f27..9c13818ca 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h @@ -8,12 +8,11 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelClustersAlpaka { public: SiPixelClustersAlpaka() = default; - explicit SiPixelClustersAlpaka(size_t maxClusters) - : moduleStart_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters + 1)}, - clusInModule_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters)}, - moduleId_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters)}, - clusModuleStart_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters + 1)} {} + explicit SiPixelClustersAlpaka(Device const &device, size_t maxClusters) + : moduleStart_d{cms::alpakatools::allocDeviceBuf(device, maxClusters + 1)}, + clusInModule_d{cms::alpakatools::allocDeviceBuf(device, maxClusters)}, + moduleId_d{cms::alpakatools::allocDeviceBuf(device, maxClusters)}, + clusModuleStart_d{cms::alpakatools::allocDeviceBuf(device, maxClusters + 1)} {} ~SiPixelClustersAlpaka() = default; SiPixelClustersAlpaka(const SiPixelClustersAlpaka &) = delete; diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h index f42b3eff2..bba2cd2a5 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h @@ -11,12 +11,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigiErrorsAlpaka { public: SiPixelDigiErrorsAlpaka() = default; - explicit SiPixelDigiErrorsAlpaka(size_t maxFedWords, PixelFormatterErrors errors, Queue& queue) - : data_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - error_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf< - ::cms::alpakatools::SimpleVector>(1u)}, - error_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf< - ::cms::alpakatools::SimpleVector>(1u)}, + explicit SiPixelDigiErrorsAlpaka(Device const& device, size_t maxFedWords, PixelFormatterErrors errors, Queue& queue) + : 
data_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + error_d{cms::alpakatools::allocDeviceBuf<::cms::alpakatools::SimpleVector>(device, 1u)}, + error_h{::cms::alpakatools::allocHostBuf<::cms::alpakatools::SimpleVector>(1u)}, formatterErrors_h{std::move(errors)} { auto perror_h = alpaka::getPtrNative(error_h); perror_h->construct(maxFedWords, alpaka::getPtrNative(data_d)); diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h index 84c11de0d..bc8c2192c 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h @@ -8,14 +8,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigisAlpaka { public: SiPixelDigisAlpaka() = default; - explicit SiPixelDigisAlpaka(size_t maxFedWords) - : xx_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - yy_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - adc_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - moduleInd_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - clus_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - pdigi_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, - rawIdArr_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)} {} + explicit SiPixelDigisAlpaka(Device const &device, size_t maxFedWords) + : xx_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + yy_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + adc_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + moduleInd_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + clus_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + pdigi_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)}, + rawIdArr_d{cms::alpakatools::allocDeviceBuf(device, maxFedWords)} {} 
~SiPixelDigisAlpaka() = default; SiPixelDigisAlpaka(const SiPixelDigisAlpaka &) = delete; @@ -57,7 +57,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // TO DO: nothing async in here for now... Pass the queue as argument instead, and don't wait anymore! auto adcToHostAsync(Queue &queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nDigis()); + auto ret = ::cms::alpakatools::allocHostBuf(nDigis()); alpaka::memcpy(queue, ret, adc_d, nDigis()); return ret; } diff --git a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h index 99689088a..c9b372c96 100644 --- a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h @@ -20,28 +20,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // NON-OWNING DEVICE POINTERS: m_hitsModuleStart(hitsModuleStart), // OWNING DEVICE POINTERS: - m_xl{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_yl{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_xerr{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_yerr{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_xg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_yg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_zg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_rg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_iphi{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_charge{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_xsize{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_ysize{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - 
m_detInd{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_xl{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_yl{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_xerr{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_yerr{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_xg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_yg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_zg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_rg{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_iphi{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_charge{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_xsize{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_ysize{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_detInd{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, m_averageGeometry{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - 1u)}, - m_hitsLayerStart{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, - m_hist{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + m_hitsLayerStart{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), nHits)}, + m_hist{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, // SoA view: - m_view{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, - m_view_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u)} - { + m_view{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + m_view_h{::cms::alpakatools::allocHostBuf(1u)} { // the hits are actually accessed in order only in building // if ordering is relevant they may have to be stored phi-ordered by layer or so // this will break 1to1 
correspondence with cluster and module locality @@ -96,57 +94,57 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const* c_iphi() const { return alpaka::getPtrNative(m_iphi); } auto xlToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xl, nHits()); return ret; } auto ylToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yl, nHits()); return ret; } auto xerrToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xerr, nHits()); return ret; } auto yerrToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yerr, nHits()); return ret; } auto xgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xg, nHits()); return ret; } auto ygToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yg, nHits()); return ret; } auto zgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_zg, nHits()); return ret; } auto rgToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto 
ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_rg, nHits()); return ret; } auto chargeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_charge, nHits()); return ret; } auto xsizeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xsize, nHits()); return ret; } auto ysizeToHostAsync(Queue& queue) const { - auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_ysize, nHits()); return ret; } diff --git a/src/alpaka/CondFormats/PixelCPEFast.h b/src/alpaka/CondFormats/PixelCPEFast.h index b5a93cba3..4b910f1b1 100644 --- a/src/alpaka/CondFormats/PixelCPEFast.h +++ b/src/alpaka/CondFormats/PixelCPEFast.h @@ -28,39 +28,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { pixelCPEforGPU::ParamsOnGPU const *params() const { return alpaka::getPtrNative(m_params); } - template - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct getGPUData(T_Acc acc) { - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData_(acc); - return gpuData_; +#ifdef TODO + template + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct getGPUData() const { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData; + return gpuData; } // The return value can only be used safely in kernels launched on // the same cudaStream, or after cudaStreamSynchronize. 
+ const pixelCPEforGPU::ParamsOnGPU getGPUProductAsync(Queue queue) const { + auto gpuData = getGPUData(); - template - const pixelCPEforGPU::ParamsOnGPU getGPUProductAsync(T_Acc acc, Queue queue) const { - auto gpuData_ = getGPUData(acc); - - const auto &data = gpuData_.dataForCurrentDeviceAsync(queue, [this](GPUData &data, Queue queue) { + auto const& data = gpuData_.dataForDeviceAsync(queue, [this](GPUData &data, Queue queue) { + using namespace ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE; // and now copy to device... - auto cParams = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + auto cParams = allocDeviceBuf(1u); data.h_paramsOnGPU.m_commonParams = alpaka::getPtrNative(cParams); uint32_t size_detParams = alpaka::extent::getExtentVec(this->m_detParams)[0u]; - auto detParams = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(size_detParams); + auto detParams = allocDeviceBuf(size_detParams); data.h_paramsOnGPU.m_detParams = alpaka::getPtrNative(detParams); - auto avgGeom = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + auto avgGeom = allocDeviceBuf(1u); data.h_paramsOnGPU.m_averageGeometry = alpaka::getPtrNative(avgGeom); - auto layerGeom = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + auto layerGeom = allocDeviceBuf(1u); data.h_paramsOnGPU.m_layerGeometry = alpaka::getPtrNative(layerGeom); - auto parGPU = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + auto parGPU = allocDeviceBuf(1u); data.d_paramsOnGPU = alpaka::getPtrNative(parGPU); alpaka::prepareForAsyncCopy(cParams); @@ -69,7 +65,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::prepareForAsyncCopy(layerGeom); alpaka::prepareForAsyncCopy(parGPU); - alpaka::memcpy(queue, data.d_paramsOnGPU, data.h_paramsOnGPU, 1u); + alpaka::memcpy(queue, parGPU, data.h_paramsOnGPU, 1u); alpaka::memcpy(queue, data.h_paramsOnGPU.m_commonParams, this->m_commonParams, 1u); alpaka::memcpy(queue, 
data.h_paramsOnGPU.m_averageGeometry, this->m_averageGeometry, 1u); alpaka::memcpy(queue, data.h_paramsOnGPU.m_layerGeometry, this->m_layerGeometry, 1u); @@ -77,9 +73,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED return *data.d_paramsOnGPU; -#endif +#else return data.h_paramsOnGPU; +#endif } +#endif // TODO private: AlpakaDeviceBuf m_commonParams; @@ -88,6 +86,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { AlpakaDeviceBuf m_averageGeometry; AlpakaDeviceBuf m_params; +#ifdef TODO struct GPUData { // not needed if not used on CPU... pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; @@ -99,7 +98,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } }; - //::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData_; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData_; +#endif // TODO }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc index 0423aea84..e6bbfac04 100644 --- a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc +++ b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc @@ -26,7 +26,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void BeamSpotToAlpaka::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { auto const& bsRaw = iSetup.get(); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; BeamSpotAlpaka bsDevice(&bsRaw, ctx.stream()); ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); } diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc index 32d9d2629..d3fada0b0 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc @@ -18,14 +18,14 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE { Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hitsGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc index 004228751..a4282a93d 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc @@ -36,7 +36,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& hits = iEvent.get(tokenHitGPU_); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h index 
2d12a32eb..b23dac1c5 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h @@ -160,43 +160,37 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - counters_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + counters_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - device_hitToTuple_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, - device_tupleMultiplicity_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + device_hitToTuple_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, + device_tupleMultiplicity_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, device_theCells_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(params.maxNumberOfDoublets_)}, + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), params.maxNumberOfDoublets_)}, // in principle we can use "nhits" to heuristically dimension the workspace... 
device_isOuterHitOfCell_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - std::max(1U, nhits))}, + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), std::max(1U, nhits))}, device_theCellNeighbors_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, device_theCellTracks_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}, - //cellStorage_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, - device_theCellNeighborsContainer_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - CAConstants::maxNumOfActiveDoublets())}, - device_theCellTracksContainer_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - CAConstants::maxNumOfActiveDoublets())}, + //cellStorage_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, + device_theCellNeighborsContainer_{cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets())}, + device_theCellTracksContainer_{cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), CAConstants::maxNumOfActiveDoublets())}, - //device_storage_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::ALPAKA_ACCELERATOR_NAMESPACE::cmscuda::AtomicPairCounter::c_type>(3u)}, + //device_storage_{cms::alpakatools::allocDeviceBuf<::ALPAKA_ACCELERATOR_NAMESPACE::cmscuda::AtomicPairCounter::c_type>(alpaka::getDev(queue), 3u)}, //device_hitTuple_apc_ = (::cms::alpakatools::AtomicPairCounter*)device_storage_.get()}, //device_hitToTuple_apc_ = 
(::cms::alpakatools::AtomicPairCounter*)device_storage_.get() + 1; //device_nCells_ = (uint32_t*)(device_storage_.get() + 2)}, device_hitTuple_apc_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>( - 1u)}, + cms::alpakatools::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>(alpaka::getDev(queue), 1u)}, device_hitToTuple_apc_{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>( - 1u)}, - device_nCells_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)} { + cms::alpakatools::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>(alpaka::getDev(queue), 1u)}, + device_nCells_{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)} { alpaka::memset(queue, counters_, 0, 1u); alpaka::memset(queue, device_nCells_, 0, 1u); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc index 0c128ed04..fdf2f4c8d 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc @@ -91,7 +91,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { PixelTrackAlpaka CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DAlpaka const& hits_d, float bfield, Queue& queue) const { - PixelTrackAlpaka tracks{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + PixelTrackAlpaka tracks{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; auto* soa = alpaka::getPtrNative(tracks); CAHitNtupletGeneratorKernels kernels(m_params, hits_d.nHits(), queue); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc index 6e7c08a6a..dc5208a0b 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc +++ 
b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc @@ -75,8 +75,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + auto outputData = ::cms::alpakatools::allocHostBuf(1u); + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // DO NOT make a copy (actually TWO....) diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc index fdce8ef07..34b39b2b8 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc @@ -18,21 +18,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); + auto hitsGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = cms::alpakatools::allocDeviceBuf( + alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / 
sizeof(double)); //auto circle_fit_resultsGPU_holder = //cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); //Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - //auto circle_fit_resultsGPU_holder = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + //auto circle_fit_resultsGPU_holder = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); auto circle_fit_resultsGPU_ = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxNumberOfConcurrentFits_); + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), maxNumberOfConcurrentFits_); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc index 01e238f8b..0f77fbc6f 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc @@ -51,7 +51,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& tracksBuf = iEvent.get(tokenTrack_); auto const tracks = alpaka::getPtrNative(tracksBuf); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; ctx.emplace(iEvent, tokenVertex_, m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); } diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc index 7fd14cc50..fe6cd2b17 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc +++ 
b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc @@ -1,11 +1,12 @@ +#include "AlpakaCore/ScopedContext.h" #include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/alpakaMemoryHelper.h" #include "AlpakaDataFormats/ZVertexAlpaka.h" -#include "Framework/EventSetup.h" +#include "Framework/EDProducer.h" #include "Framework/Event.h" +#include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" -#include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" -#include "AlpakaCore/ScopedContext.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -56,8 +57,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void PixelVertexSoAFromAlpaka::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + auto outputData = ::cms::alpakatools::allocHostBuf(1u); + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // No copies.... 
diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc index 124829f75..600756040 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc @@ -107,21 +107,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // std::cout << "producing Vertices on GPU" << std::endl; ALPAKA_ASSERT_OFFLOAD(tksoa); - ZVertexAlpaka vertices{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + ZVertexAlpaka vertices{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; auto* soa = alpaka::getPtrNative(vertices); ALPAKA_ASSERT_OFFLOAD(soa); - auto ws_dBuf{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto ws_dBuf{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; auto ws_d = alpaka::getPtrNative(ws_dBuf); - auto nvFinalVerticesView = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&soa->nvFinal, 1u); + auto nvFinalVerticesView = cms::alpakatools::createDeviceView(alpaka::getDev(queue), &soa->nvFinal, 1u); alpaka::memset(queue, nvFinalVerticesView, 0, 1u); - auto ntrksWorkspaceView = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&ws_d->ntrks, 1u); + auto ntrksWorkspaceView = cms::alpakatools::createDeviceView(alpaka::getDev(queue), &ws_d->ntrks, 1u); alpaka::memset(queue, ntrksWorkspaceView, 0, 1u); auto nvIntermediateWorkspaceView = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&ws_d->nvIntermediate, 1u); + cms::alpakatools::createDeviceView(alpaka::getDev(queue), &ws_d->nvIntermediate, 1u); alpaka::memset(queue, nvIntermediateWorkspaceView, 0, 1u); const uint32_t blockSize = 128; diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc index ac1b64b4f..4c1bb9c6e 100644 --- 
a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc @@ -29,18 +29,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector modToUnpDefault(modToUnpDefSize); in.read(reinterpret_cast(modToUnpDefault.data()), modToUnpDefSize); + // TODO FIXME use the correct device Queue queue(devices[0]); - auto cablingMap_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&obj, 1u)}; - auto cablingMap_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto cablingMap_h{cms::alpakatools::createHostView(&obj, 1u)}; + auto cablingMap_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(cablingMap_d); alpaka::memcpy(queue, cablingMap_d, cablingMap_h, 1u); eventSetup.put(std::make_unique(std::move(cablingMap_d), true)); - auto modToUnp_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - modToUnpDefault.data(), modToUnpDefSize)}; - auto modToUnp_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(modToUnpDefSize)}; + auto modToUnp_h{cms::alpakatools::createHostView(modToUnpDefault.data(), modToUnpDefSize)}; + auto modToUnp_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), modToUnpDefSize)}; alpaka::prepareForAsyncCopy(modToUnp_d); alpaka::memcpy(queue, modToUnp_d, modToUnp_h, modToUnpDefSize); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc index 4cf2a229e..dd12bca26 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc @@ -42,29 +42,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::vector gainData(nbytes); in.read(gainData.data(), nbytes); + // TODO FIXME use the correct device 
Queue queue(devices[0]); const uint32_t numDecodingStructures = gainData.size() / sizeof(SiPixelGainForHLTonGPU_DecodingStructure); - auto ped_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - reinterpret_cast(gainData.data()), numDecodingStructures)}; - auto ped_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( - numDecodingStructures)}; + auto ped_h{cms::alpakatools::createHostView( + reinterpret_cast(gainData.data()), numDecodingStructures)}; + auto ped_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), + numDecodingStructures)}; alpaka::prepareForAsyncCopy(ped_d); alpaka::memcpy(queue, ped_d, ped_h, numDecodingStructures); auto rangeAndCols_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - gain.rangeAndCols, 2000u)}; + cms::alpakatools::createHostView(gain.rangeAndCols, 2000u)}; auto rangeAndCols_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(2000u)}; + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 2000u)}; alpaka::prepareForAsyncCopy(rangeAndCols_d); alpaka::memcpy(queue, rangeAndCols_d, rangeAndCols_h, 2000u); - auto fields_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - &gain.fields_, 1u)}; - auto fields_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto fields_h{cms::alpakatools::createHostView(&gain.fields_, 1u)}; + auto fields_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(fields_d); alpaka::memcpy(queue, fields_d, fields_h, 1u); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc index 60ca31bd0..b6fb2c74f 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc @@ -33,7 +33,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { private: void produce(edm::Event& iEvent, 
const edm::EventSetup& iSetup) override; - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ContextState ctxState_; + ::cms::alpakatools::ContextState ctxState_; edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT digiPutToken_; @@ -140,7 +140,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // end of for loop - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; gpuAlgo_.makeClustersAsync(isRun2_, gpuMap, gpuModulesToUnpack, diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc index 2f10f0a61..36b6b5884 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc @@ -32,8 +32,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelgpudetails { SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() - : word_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(MAX_FED_WORDS)}, - fedId_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(MAX_FED_WORDS)} {} + : word_{::cms::alpakatools::allocHostBuf(MAX_FED_WORDS)}, + fedId_{::cms::alpakatools::allocHostBuf(MAX_FED_WORDS)} {} void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, @@ -563,11 +563,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::cout << "decoding " << wordCounter << " digis. 
Max is " << pixelgpudetails::MAX_FED_WORDS << std::endl; #endif - digis_d = SiPixelDigisAlpaka(pixelgpudetails::MAX_FED_WORDS); + digis_d = SiPixelDigisAlpaka(alpaka::getDev(queue), pixelgpudetails::MAX_FED_WORDS); if (includeErrors) { - digiErrors_d = SiPixelDigiErrorsAlpaka(pixelgpudetails::MAX_FED_WORDS, std::move(errors), queue); + digiErrors_d = + SiPixelDigiErrorsAlpaka(alpaka::getDev(queue), pixelgpudetails::MAX_FED_WORDS, std::move(errors), queue); } - clusters_d = SiPixelClustersAlpaka(gpuClustering::MaxNumModules); + clusters_d = SiPixelClustersAlpaka(alpaka::getDev(queue), gpuClustering::MaxNumModules); if (wordCounter) // protect in case of empty event.... { @@ -584,11 +585,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(0 == wordCounter % 2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(wordCounter); + auto word_d = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), wordCounter); // NB: IMPORTANT: fedId_d: In legacy, wordCounter elements are allocated. // However, only the first half of elements end up eventually used: // hence, here, only wordCounter/2 elements are allocated. 
- auto fedId_d = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(wordCounter / 2); + auto fedId_d = cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), wordCounter / 2); alpaka::memcpy(queue, word_d, wordFed.word(), wordCounter); alpaka::memcpy(queue, fedId_d, wordFed.fedId(), wordCounter / 2); @@ -602,12 +603,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { wordCounter, alpaka::getPtrNative(word_d), alpaka::getPtrNative(fedId_d), - digis_d.xx(), - digis_d.yy(), - digis_d.adc(), - digis_d.pdigi(), - digis_d.rawIdArr(), - digis_d.moduleInd(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), + digis_d->pdigi(), + digis_d->rawIdArr(), + digis_d->moduleInd(), digiErrors_d->error(), useQualityInfo, includeErrors, @@ -645,31 +646,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel(workDiv, gpuCalibPixel::calibDigis(), isRun2, - digis_d.moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - digis_d.adc(), + digis_d->moduleInd(), + digis_d->c_xx(), + digis_d->c_yy(), + digis_d->adc(), //gains, gains->getVpedestals(), gains->getRangeAndCols(), gains->getFields(), wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart())); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart())); #ifdef GPU_DEBUG alpaka::wait(queue); std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlockOrElementsPerThread << " threadsPerBlockOrElementsPerThread\n"; #endif - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDiv, countModules(), digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter)); + alpaka::enqueue(queue, + alpaka::createTaskKernel(workDiv, + countModules(), + digis_d->c_moduleInd(), + clusters_d->moduleStart(), + digis_d->clus(), + wordCounter)); auto moduleStartFirstElement = - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(clusters_d.moduleStart(), 1u); + 
cms::alpakatools::createDeviceView(alpaka::getDev(queue), clusters_d->moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement, 1u); @@ -687,13 +691,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMaxNumModules, findClus(), - digis_d.c_moduleInd(), - digis_d.c_xx(), - digis_d.c_yy(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.clus(), + digis_d->c_moduleInd(), + digis_d->c_xx(), + digis_d->c_yy(), + clusters_d->c_moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter)); #ifdef GPU_DEBUG @@ -704,12 +708,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMaxNumModules, clusterChargeCut(), - digis_d.moduleInd(), - digis_d.c_adc(), - clusters_d.c_moduleStart(), - clusters_d.clusInModule(), - clusters_d.c_moduleId(), - digis_d.clus(), + digis_d->moduleInd(), + digis_d->c_adc(), + clusters_d->c_moduleStart(), + clusters_d->clusInModule(), + clusters_d->c_moduleId(), + digis_d->clus(), wordCounter)); // count the module start indices already here (instead of @@ -724,16 +728,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::enqueue(queue, alpaka::createTaskKernel(workDivOneBlock, ::pixelgpudetails::fillHitsModuleStart(), - clusters_d.c_clusInModule(), - clusters_d.clusModuleStart())); + clusters_d->c_clusInModule(), + clusters_d->clusModuleStart())); // last element holds the number of all clusters - auto clusModuleStartView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( - clusters_d.clusModuleStart(), gpuClustering::MaxNumModules + 1); + auto clusModuleStartView = cms::alpakatools::createDeviceView( + alpaka::getDev(queue), clusters_d->clusModuleStart(), gpuClustering::MaxNumModules + 1); const auto clusModuleStartLastElement = AlpakaDeviceSubView(clusModuleStartView, 1u, gpuClustering::MaxNumModules); // slice on 
host - auto nModules_Clusters_1_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u)}; + auto nModules_Clusters_1_h{::cms::alpakatools::allocHostBuf(1u)}; auto p_nModules_Clusters_1_h = alpaka::getPtrNative(nModules_Clusters_1_h); alpaka::memcpy(queue, nModules_Clusters_1_h, clusModuleStartLastElement, 1u); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h index 27415c1bb..04130fe91 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h @@ -4,16 +4,13 @@ #include #include "AlpakaCore/alpakaCommon.h" - +#include "AlpakaDataFormats/SiPixelClustersAlpaka.h" +#include "AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h" +#include "AlpakaDataFormats/SiPixelDigisAlpaka.h" #include "AlpakaDataFormats/gpuClusteringConstants.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" #include "CondFormats/SiPixelGainForHLTonGPU.h" - -#include "AlpakaDataFormats/SiPixelDigisAlpaka.h" -#include "AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h" -#include "AlpakaDataFormats/SiPixelClustersAlpaka.h" #include "DataFormats/PixelErrors.h" -#include "AlpakaDataFormats/gpuClusteringConstants.h" struct SiPixelFedCablingMapGPU; class SiPixelGainForHLTonGPU; @@ -169,10 +166,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { AlpakaHostBuf fedId_; }; - SiPixelRawToClusterGPUKernel() - : nModules_Clusters_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(2u)}, - digis_d{SiPixelDigisAlpaka(0u)}, - clusters_d{SiPixelClustersAlpaka(0u)} {}; + SiPixelRawToClusterGPUKernel() : nModules_Clusters_h{::cms::alpakatools::allocHostBuf(2u)} {} ~SiPixelRawToClusterGPUKernel() = default; @@ -196,9 +190,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::pair getResults() { auto pnModules_Clusters_h = alpaka::getPtrNative(nModules_Clusters_h); - 
digis_d.setNModulesDigis(pnModules_Clusters_h[0], nDigis); - clusters_d.setNClusters(pnModules_Clusters_h[1]); - return std::make_pair(std::move(digis_d), std::move(clusters_d)); + digis_d->setNModulesDigis(pnModules_Clusters_h[0], nDigis); + clusters_d->setNClusters(pnModules_Clusters_h[1]); + return std::make_pair(std::move(*digis_d), std::move(*clusters_d)); } SiPixelDigiErrorsAlpaka&& getErrors() { return std::move(*digiErrors_d); } @@ -208,8 +202,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Data to be put in the event AlpakaHostBuf nModules_Clusters_h; - SiPixelDigisAlpaka digis_d; - SiPixelClustersAlpaka clusters_d; + std::optional digis_d; + std::optional clusters_d; std::optional digiErrors_d; }; diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc index 90781c18f..67d407eae 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc @@ -23,14 +23,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::ifstream in((data_ + "/cpefast.bin").c_str(), std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); + // TODO FIXME use the correct device Queue queue(devices[0]); pixelCPEforGPU::CommonParams commonParams; in.read(reinterpret_cast(&commonParams), sizeof(pixelCPEforGPU::CommonParams)); - auto commonParams_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - &commonParams, 1u)}; - auto commonParams_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto commonParams_h{cms::alpakatools::createHostView(&commonParams, 1u)}; + auto commonParams_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(commonParams_d); alpaka::memcpy(queue, commonParams_d, commonParams_h, 1u); @@ -40,30 +39,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 
//detParams.resize(ndetParams); std::vector detParams(ndetParams); in.read(reinterpret_cast(detParams.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); - auto detParams_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - detParams.data(), ndetParams)}; - auto detParams_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(ndetParams)}; + auto detParams_h{cms::alpakatools::createHostView(detParams.data(), ndetParams)}; + auto detParams_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), ndetParams)}; alpaka::prepareForAsyncCopy(detParams_d); alpaka::memcpy(queue, detParams_d, detParams_h, ndetParams); pixelCPEforGPU::AverageGeometry averageGeometry; in.read(reinterpret_cast(&averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); - auto averageGeometry_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( - &averageGeometry, 1u)}; + auto averageGeometry_h{cms::alpakatools::createHostView(&averageGeometry, 1u)}; auto averageGeometry_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(averageGeometry_d); alpaka::memcpy(queue, averageGeometry_d, averageGeometry_h, 1u); pixelCPEforGPU::LayerGeometry layerGeometry; in.read(reinterpret_cast(&layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); - auto layerGeometry_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&layerGeometry, - 1u)}; - auto layerGeometry_d{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto layerGeometry_h{cms::alpakatools::createHostView(&layerGeometry, 1u)}; + auto layerGeometry_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(layerGeometry_d); alpaka::memcpy(queue, layerGeometry_d, layerGeometry_h, 1u); @@ -72,9 +64,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { params.m_detParams = alpaka::getPtrNative(detParams_d); 
params.m_layerGeometry = alpaka::getPtrNative(layerGeometry_d); params.m_averageGeometry = alpaka::getPtrNative(averageGeometry_d); - auto params_h{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&params, 1u)}; - auto params_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + auto params_h{cms::alpakatools::createHostView(&params, 1u)}; + auto params_d{cms::alpakatools::allocDeviceBuf(alpaka::getDev(queue), 1u)}; alpaka::prepareForAsyncCopy(params_d); alpaka::memcpy(queue, params_d, params_h, 1u); diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc index 2098dd1a8..a61131066 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc @@ -54,7 +54,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // TO DO: Async: Would need to add a queue as a parameter, not async for now! - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{iEvent.streamID()}; + ::cms::alpakatools::ScopedContextProduce ctx{iEvent.streamID()}; ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); } diff --git a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc index a97b70784..29ef61b38 100644 --- a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc +++ b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc @@ -120,14 +120,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const nHits = hits.nHits(); #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + // TODO FIXME use the correct device Queue queue(devices[0]); auto const h_adcBuf = digis.adcToHostAsync(queue); auto const h_adc = alpaka::getPtrNative(h_adcBuf); - auto const d_clusInModuleView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( - clusters.clusInModule(), gpuClustering::MaxNumModules); - 
auto h_clusInModuleBuf{ - ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(gpuClustering::MaxNumModules)}; + auto const d_clusInModuleView = cms::alpakatools::createDeviceView( + alpaka::getDev(queue), clusters.clusInModule(), gpuClustering::MaxNumModules); + auto h_clusInModuleBuf{::cms::alpakatools::allocHostBuf(gpuClustering::MaxNumModules)}; alpaka::memcpy(queue, h_clusInModuleBuf, d_clusInModuleView, gpuClustering::MaxNumModules); auto h_clusInModule = alpaka::getPtrNative(h_clusInModuleBuf);