diff --git a/src/alpaka/AlpakaCore/AtomicPairCounter.h b/src/alpaka/AlpakaCore/AtomicPairCounter.h index c2c5a6f80..b35891431 100644 --- a/src/alpaka/AlpakaCore/AtomicPairCounter.h +++ b/src/alpaka/AlpakaCore/AtomicPairCounter.h @@ -3,51 +3,49 @@ #include -namespace cms { - namespace alpakatools { +namespace cms::alpakatools { - class AtomicPairCounter { - public: - using c_type = unsigned long long int; + class AtomicPairCounter { + public: + using c_type = unsigned long long int; - ALPAKA_FN_HOST_ACC AtomicPairCounter() {} - ALPAKA_FN_HOST_ACC AtomicPairCounter(c_type i) { counter.ac = i; } + ALPAKA_FN_HOST_ACC AtomicPairCounter() {} + ALPAKA_FN_HOST_ACC AtomicPairCounter(c_type i) { counter.ac = i; } - ALPAKA_FN_HOST_ACC AtomicPairCounter& operator=(c_type i) { - counter.ac = i; - return *this; - } + ALPAKA_FN_HOST_ACC AtomicPairCounter& operator=(c_type i) { + counter.ac = i; + return *this; + } - struct Counters { - uint32_t n; // in a "One to Many" association is the number of "One" - uint32_t m; // in a "One to Many" association is the total number of associations - }; + struct Counters { + uint32_t n; // in a "One to Many" association is the number of "One" + uint32_t m; // in a "One to Many" association is the total number of associations + }; - union Atomic2 { - Counters counters; - c_type ac; - }; + union Atomic2 { + Counters counters; + c_type ac; + }; - static constexpr c_type incr = 1UL << 32; + static constexpr c_type incr = 1UL << 32; - ALPAKA_FN_HOST_ACC Counters get() const { return counter.counters; } + ALPAKA_FN_HOST_ACC Counters get() const { return counter.counters; } - // increment n by 1 and m by i. return previous value - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters add(const T_Acc& acc, uint32_t i) { - c_type c = i; - c += incr; + // increment n by 1 and m by i. return previous value + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters add(const T_Acc& acc, uint32_t i) { + c_type c = i; + c += incr; - Atomic2 ret; - ret.ac = alpaka::atomicAdd(acc, &counter.ac, c, alpaka::hierarchy::Blocks{}); - return ret.counters; - } + Atomic2 ret; + ret.ac = alpaka::atomicAdd(acc, &counter.ac, c, alpaka::hierarchy::Blocks{}); + return ret.counters; + } - private: - Atomic2 counter; - }; + private: + Atomic2 counter; + }; - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools #endif // HeterogeneousCore_CUDAUtilities_interface_AtomicPairCounter_h diff --git a/src/alpaka/AlpakaCore/ContextState.h b/src/alpaka/AlpakaCore/ContextState.h new file mode 100644 index 000000000..1263f4f0e --- /dev/null +++ b/src/alpaka/AlpakaCore/ContextState.h @@ -0,0 +1,62 @@ +#ifndef HeterogeneousCore_AlpakaCore_ContextState_h +#define HeterogeneousCore_AlpakaCore_ContextState_h + +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/SharedStreamPtr.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + /** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. 
+ */ + class ContextState { + public: + ContextState() = default; + ~ContextState() = default; + + ContextState(const ContextState&) = delete; + ContextState& operator=(const ContextState&) = delete; + ContextState(ContextState&&) = delete; + ContextState& operator=(ContextState&& other) = delete; + + private: + friend class ScopedContextAcquire; + friend class ScopedContextProduce; + friend class ScopedContextTask; + + void set(int device, SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. + return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + SharedStreamPtr stream_; + int device_; + }; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaCore_ContextState_h diff --git a/src/alpaka/AlpakaCore/ESProduct.h b/src/alpaka/AlpakaCore/ESProduct.h new file mode 100644 index 000000000..c90376624 --- /dev/null +++ b/src/alpaka/AlpakaCore/ESProduct.h @@ -0,0 +1,104 @@ +#ifndef HeterogeneousCore_AlpakaCore_ESProduct_h +#define HeterogeneousCore_AlpakaCore_ESProduct_h + +#include +#include +#include +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/EventCache.h" +#include "AlpakaCore/currentDevice.h" +#include "AlpakaCore/eventWorkHasCompleted.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + template + class ESProduct { + public: + template + ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::getEventCache().get(acc); + } + } + + ~ESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue, F transferAsync) const { + auto device = currentDevice(); + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (eventWorkHasCompleted(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != queue) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. 
Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + alpaka::wait(queue, data.m_event.get()); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. + transferAsync(data.m_data, queue); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = queue; + // Record in the cudaStream an event to mark the readiness of the + // EventSetup data on the GPU, so other streams can check for it + alpaka::enqueue(queue, data.m_event.get()); + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + + private: + struct Item { + mutable std::mutex m_mutex; + mutable SharedEventPtr m_event; // guarded by m_mutex + // non-null if some thread is already filling (cudaStream_t is just a pointer) + mutable ::ALPAKA_ACCELERATOR_NAMESPACE::Queue* m_fillingStream = nullptr; // guarded by m_mutex + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + mutable T m_data; // guarded by m_mutex + }; + + std::vector gpuDataPerDevice_; + }; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaCore_ESProduct_h diff --git a/src/alpaka/AlpakaCore/EventCache.h b/src/alpaka/AlpakaCore/EventCache.h new file mode 100644 index 000000000..114460524 --- /dev/null +++ b/src/alpaka/AlpakaCore/EventCache.h @@ -0,0 +1,76 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_EventCache_h +#define HeterogeneousCore_AlpakaUtilities_EventCache_h + +#include + +#include + +#include "AlpakaCore/ScopedSetDevice.h" +#include "AlpakaCore/SharedEventPtr.h" +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/currentDevice.h" +#include "AlpakaCore/deviceCount.h" +#include "AlpakaCore/eventWorkHasCompleted.h" +#include "Framework/ReusableObjectHolder.h" + +class CUDAService; + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + class EventCache { + public: + EventCache(); + + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. The + // returned event is guaranteed to be in the state where all + // captured work has completed, i.e. cudaEventQuery() == cudaSuccess. + // + // This function is thread safe + template + SharedEventPtr get(T_Acc acc) { + const auto dev = currentDevice(); + auto event = makeOrGet(dev, acc); + // captured work has completed, or a just-created event + if (eventWorkHasCompleted(*(event.get()))) { + return event; + } + + // Got an event with incomplete captured work. Try again until we + // get a completed (or a just-created) event. Need to keep all + // incomplete events until a completed event is found in order to + // avoid ping-pong with an incomplete event. 
+ std::vector ptrs{std::move(event)}; + bool completed; + do { + event = makeOrGet(dev, acc); + completed = eventWorkHasCompleted(*(event.get())); + if (not completed) { + ptrs.emplace_back(std::move(event)); + } + } while (not completed); + return event; + } + + private: + friend class ::CUDAService; + + template + SharedEventPtr makeOrGet(int dev, T_Acc acc) { + return cache_[dev].makeOrGet( + [dev, acc]() { return std::make_unique<::ALPAKA_ACCELERATOR_NAMESPACE::Event>(acc); }); + } + + // not thread safe, intended to be called only from CUDAService destructor + void clear(); + + std::vector> cache_; + }; + + // Gets the global instance of a EventCache + // This function is thread safe + EventCache& getEventCache(); + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaUtilities_EventCache_h diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index ce18c7616..792eac471 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -11,311 +11,311 @@ #include "AlpakaCore/alpakastdAlgorithm.h" #include "AlpakaCore/prefixScan.h" -namespace cms { - namespace alpakatools { - - struct countFromVector { - template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - ALPAKA_ASSERT_OFFLOAD((*off) > 0); - int32_t ih = off - offsets - 1; - ALPAKA_ASSERT_OFFLOAD(ih >= 0); - ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); - h->count(acc, v[i], ih); - }); - } - }; - - struct fillFromVector { - template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets) const { - const uint32_t nt = offsets[nh]; - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { - auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); - ALPAKA_ASSERT_OFFLOAD((*off) > 0); - int32_t ih = off - offsets - 1; - ALPAKA_ASSERT_OFFLOAD(ih >= 0); - ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); - h->fill(acc, v[i], i, ih); - }); - } - }; - - template - ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero( - Histo *__restrict__ h, ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { - uint32_t *poff = (uint32_t *)(char *)(&(h->off)); - auto histoOffView = cms::alpakatools::createDeviceView(poff, Histo::totbins()); - - alpaka::memset(queue, histoOffView, 0, Histo::totbins()); - alpaka::wait(queue); +namespace cms::alpakatools { + + struct countFromVector { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->count(acc, v[i], ih); + }); } - - template - ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( - Histo *__restrict__ h, ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { - 
uint32_t *poff = (uint32_t *)(char *)(&(h->off)); - - const int num_items = Histo::totbins(); - - const unsigned int nthreads = 1024; - const Vec1D threadsPerBlockOrElementsPerThread(nthreads); - const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; - const Vec1D blocksPerGrid(nblocks); - - const WorkDiv1D &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - alpaka::enqueue(queue, - alpaka::createTaskKernel( - workDiv, multiBlockPrefixScanFirstStep(), poff, poff, num_items)); - - const WorkDiv1D &workDivWith1Block = - cms::alpakatools::make_workdiv(Vec1D::all(1), threadsPerBlockOrElementsPerThread); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, num_items, nblocks)); + }; + + struct fillFromVector { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets) const { + const uint32_t nt = offsets[nh]; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) { + auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i); + ALPAKA_ASSERT_OFFLOAD((*off) > 0); + int32_t ih = off - offsets - 1; + ALPAKA_ASSERT_OFFLOAD(ih >= 0); + ALPAKA_ASSERT_OFFLOAD(ih < int(nh)); + h->fill(acc, v[i], i, ih); + }); } - - template - ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector( - Histo *__restrict__ h, - uint32_t nh, - T const *__restrict__ v, - uint32_t const *__restrict__ offsets, - uint32_t totSize, - unsigned int nthreads, - ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { - launchZero(h, queue); - - const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; - const Vec1D blocksPerGrid(nblocks); - const Vec1D threadsPerBlockOrElementsPerThread(nthreads); - const WorkDiv1D &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); - - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, countFromVector(), h, nh, v, offsets)); - launchFinalize(h, queue); - - alpaka::enqueue( - queue, - alpaka::createTaskKernel(workDiv, fillFromVector(), h, nh, v, offsets)); + }; + + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero( + Histo *__restrict__ h, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { + uint32_t *poff = (uint32_t *)(char *)(&(h->off)); + auto histoOffView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( + poff, Histo::totbins()); + + alpaka::memset(queue, histoOffView, 0, Histo::totbins()); + alpaka::wait(queue); + } + + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchFinalize( + Histo *__restrict__ h, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { + uint32_t *poff = (uint32_t *)(char *)(&(h->off)); + + const int num_items = Histo::totbins(); + + const unsigned int nthreads = 1024; + const Vec1D threadsPerBlockOrElementsPerThread(nthreads); + const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; + const Vec1D blocksPerGrid(nblocks); + + auto d_pc = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + int32_t *pc = alpaka::getPtrNative(d_pc); + alpaka::memset(queue, d_pc, 0, 1u); + + const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread); + alpaka::enqueue(queue, + 
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, ::cms::alpakatools::multiBlockPrefixScan(), poff, poff, num_items, pc)); + alpaka::wait(queue); + } + + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void fillManyFromVector( + Histo *__restrict__ h, + uint32_t nh, + T const *__restrict__ v, + uint32_t const *__restrict__ offsets, + uint32_t totSize, + unsigned int nthreads, + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) { + launchZero(h, queue); + + const unsigned int nblocks = (totSize + nthreads - 1) / nthreads; + const Vec1D blocksPerGrid(nblocks); + const Vec1D threadsPerBlockOrElementsPerThread(nthreads); + const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread); + + alpaka::enqueue( + queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, countFromVector(), h, nh, v, offsets)); + launchFinalize(h, queue); + + alpaka::enqueue( + queue, + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, fillFromVector(), h, nh, v, offsets)); + } + + struct finalizeBulk { + template + ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { + assoc->bulkFinalizeFill(acc, *apc); } - - struct finalizeBulk { - template - ALPAKA_FN_ACC void operator()(const T_Acc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const { - assoc->bulkFinalizeFill(acc, *apc); - } - }; - - // iteratate over N bins left and right of the one containing "v" - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins() - 1), bs + n); - bs = std::max(0, bs - n); - ALPAKA_ASSERT_OFFLOAD(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } + }; + + // iteratate over N bins left and right of the one containing "v" + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins() - 1), bs + n); + bs = std::max(0, bs - n); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); } - - // iteratate over bins containing all values in window wmin, wmax - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - ALPAKA_ASSERT_OFFLOAD(be >= bs); - for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { - func(*pj); - } + } + + // iteratate over bins containing all values in window wmin, wmax + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + ALPAKA_ASSERT_OFFLOAD(be >= bs); + for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) { + func(*pj); + } + } + + template + class HistoContainer { + public: + using Counter = uint32_t; + + using CountersOnly = HistoContainer; + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) + if (v & b[i]) { + v >>= s[i]; 
+ r |= s[i]; + } + return r; } - template - class HistoContainer { - public: - using Counter = uint32_t; - - using CountersOnly = HistoContainer; - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) - if (v & b[i]) { - v >>= s[i]; - r |= s[i]; - } - return r; - } - - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS; } - static constexpr uint32_t nhists() { return NHISTS; } - static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } - static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } - static constexpr uint32_t capacity() { return SIZE; } - - static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS; } + static constexpr uint32_t nhists() { return NHISTS; } + static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; } + static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; } + static constexpr uint32_t capacity() { return SIZE; } - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1 << nbits()) - 1; - return (t >> shift) & mask; - } + static constexpr auto histOff(uint32_t nh) { return NBINS * nh; } - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { - for (auto &i : off) - i = 0; - } + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1 << nbits()) - 1; + return (t >> shift) & mask; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc &acc, CountersOnly const &co) { - for (uint32_t i = 0; i < totbins(); ++i) { - alpaka::atomicAdd(acc, off + i, co.off[i], alpaka::hierarchy::Blocks{}); - } - } + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void zero() { + for (auto &i : off) + i = 0; + } - template - static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc &acc, Counter &x) { - return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const T_Acc &acc, CountersOnly const &co) { + for (uint32_t i = 0; i < totbins(); ++i) { + alpaka::atomicAdd(acc, off + i, co.off[i], alpaka::hierarchy::Blocks{}); } + } - template - static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc &acc, Counter &x) { - return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{}); - } + template + static ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t atomicIncrement(const T_Acc &acc, Counter &x) { + return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc &acc, T b) { - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - atomicIncrement(acc, off[b]); - } + template + static ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t atomicDecrement(const T_Acc &acc, Counter &x) { + return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{}); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc &acc, T b, index_type j) { - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - auto w = atomicDecrement(acc, off[b]); - ALPAKA_ASSERT_OFFLOAD(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const T_Acc &acc, T b) { + 
ALPAKA_ASSERT_OFFLOAD(b < nbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t - bulkFill(const T_Acc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { - auto c = apc.add(acc, n); - if (c.m >= nbins()) - return -int32_t(c.m); - off[c.m] = c.n; - for (uint32_t j = 0; j < n; ++j) - bins[c.n + j] = v[j]; - return c.m; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const T_Acc &acc, T b, index_type j) { + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc &acc, AtomicPairCounter const &apc) { - off[apc.get().m] = apc.get().n; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t + bulkFill(const T_Acc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) { + auto c = apc.add(acc, n); + if (c.m >= nbins()) + return -int32_t(c.m); + off[c.m] = c.n; + for (uint32_t j = 0; j < n; ++j) + bins[c.n + j] = v[j]; + return c.m; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { - auto m = apc.get().m; - auto n = apc.get().n; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const T_Acc &acc, AtomicPairCounter const &apc) { + off[apc.get().m] = apc.get().n; + } - if (m >= nbins()) { // overflow! - off[nbins()] = uint32_t(off[nbins() - 1]); - return; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const T_Acc &acc, AtomicPairCounter const &apc) { + auto m = apc.get().m; + auto n = apc.get().n; - cms::alpakatools::for_each_element_in_grid_strided(acc, totbins(), m, [&](uint32_t i) { off[i] = n; }); + if (m >= nbins()) { // overflow! 
+ off[nbins()] = uint32_t(off[nbins() - 1]); + return; } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t) { - uint32_t b = bin(t); - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - atomicIncrement(acc, off[b]); - } + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, totbins(), m, [&](uint32_t i) { off[i] = n; }); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j) { - uint32_t b = bin(t); - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - auto w = atomicDecrement(acc, off[b]); - ALPAKA_ASSERT_OFFLOAD(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t, uint32_t nh) { - uint32_t b = bin(t); - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - b += histOff(nh); - ALPAKA_ASSERT_OFFLOAD(b < totbins()); - atomicIncrement(acc, off[b]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j, uint32_t nh) { - uint32_t b = bin(t); - ALPAKA_ASSERT_OFFLOAD(b < nbins()); - b += histOff(nh); - ALPAKA_ASSERT_OFFLOAD(b < totbins()); - auto w = atomicDecrement(acc, off[b]); - ALPAKA_ASSERT_OFFLOAD(w > 0); - bins[w - 1] = j; - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const T_Acc &acc, T t, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + atomicIncrement(acc, off[b]); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc &acc, Counter *ws = nullptr) { - ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == 0); - blockPrefixScan(acc, off, totbins(), ws); - ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == off[totbins() - 2]); - } + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const T_Acc &acc, T t, index_type j, uint32_t nh) { + uint32_t b = bin(t); + ALPAKA_ASSERT_OFFLOAD(b < nbins()); + b += histOff(nh); + ALPAKA_ASSERT_OFFLOAD(b < totbins()); + auto w = atomicDecrement(acc, off[b]); + ALPAKA_ASSERT_OFFLOAD(w > 0); + bins[w - 1] = j; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const T_Acc &acc, Counter *ws = nullptr) { + ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == 0); + blockPrefixScan(acc, off, totbins(), ws); + ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == off[totbins() - 2]); + } - constexpr auto size() const { return uint32_t(off[totbins() - 1]); } - constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } + constexpr auto size() const { return uint32_t(off[totbins() - 1]); } + constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; } - constexpr index_type const *begin() const { return bins; } - constexpr index_type const *end() const { return begin() + size(); } + constexpr index_type const *begin() const { return bins; } + constexpr index_type const *end() const { return begin() + size(); } - constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } - constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; } + constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; } + constexpr index_type 
const *end(uint32_t b) const { return bins + off[b + 1]; } - Counter off[totbins()]; - index_type bins[capacity()]; - }; + Counter off[totbins()]; + index_type bins[capacity()]; + }; - template - using OneToManyAssoc = HistoContainer; + template + using OneToManyAssoc = HistoContainer; - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h diff --git a/src/alpaka/AlpakaCore/Product.h b/src/alpaka/AlpakaCore/Product.h new file mode 100644 index 000000000..6a5e0f5f6 --- /dev/null +++ b/src/alpaka/AlpakaCore/Product.h @@ -0,0 +1,61 @@ +#ifndef AlpakaDataFormats_Common_Product_h +#define AlpakaDataFormats_Common_Product_h + +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/ProductBase.h" + +namespace edm { + template + class Wrapper; +} + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + namespace impl { + class ScopedContextGetterBase; + } + + /** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with ::cms::alpakatools::ScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. + */ + template + class Product : public ProductBase { + public: + Product() = default; // Needed only for ROOT dictionary generation + + Product(const Product&) = delete; + Product& operator=(const Product&) = delete; + Product(Product&&) = default; + Product& operator=(Product&&) = default; + + private: + friend class impl::ScopedContextGetterBase; + friend class ScopedContextProduce; + friend class edm::Wrapper>; + + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} + + template + explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) + : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} + + T data_; //! + }; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // AlpakaDataFormats_Common_Product_h diff --git a/src/alpaka/AlpakaCore/ProductBase.h b/src/alpaka/AlpakaCore/ProductBase.h new file mode 100644 index 000000000..3d1f09cf0 --- /dev/null +++ b/src/alpaka/AlpakaCore/ProductBase.h @@ -0,0 +1,93 @@ +#ifndef AlpakaDataFormats_Common_ProductBase_h +#define AlpakaDataFormats_Common_ProductBase_h + +#include +#include + +#include "AlpakaCore/SharedEventPtr.h" +#include "AlpakaCore/SharedStreamPtr.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + namespace impl { + class ScopedContextBase; + } + + /** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. 
+ */ + class ProductBase { + public: + ProductBase() = default; // Needed only for ROOT dictionary generation + ~ProductBase(); + + ProductBase(const ProductBase&) = delete; + ProductBase& operator=(const ProductBase&) = delete; + ProductBase(ProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + ProductBase& operator=(ProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the ::cms::alpakatools::ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>& event() const { return *(event_.get()); } + + protected: + explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) + : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} + + private: + friend class impl::ScopedContextBase; + friend class ScopedContextProduce; + + // The following function is intended to be used only from ScopedContext + const SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + SharedStreamPtr stream_; //! + // shared_ptr because of caching in ::cms::alpakatools::EventCache + SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! 
+ }; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // AlpakaDataFormats_Common_ProductBase_h diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h new file mode 100644 index 000000000..cbea42530 --- /dev/null +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -0,0 +1,281 @@ +#ifndef HeterogeneousCore_AlpakaCore_ScopedContext_h +#define HeterogeneousCore_AlpakaCore_ScopedContext_h + +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/ContextState.h" +#include "AlpakaCore/EventCache.h" +#include "AlpakaCore/Product.h" +#include "AlpakaCore/SharedEventPtr.h" +#include "AlpakaCore/SharedStreamPtr.h" +#include "Framework/EDGetToken.h" +#include "Framework/EDPutToken.h" +#include "Framework/Event.h" +#include "Framework/WaitingTaskWithArenaHolder.h" +#include "chooseDevice.h" +#include "AlpakaCore/StreamCache.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest { + class TestScopedContext; +} + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + namespace impl { + // This class is intended to be derived by other ScopedContext*, not for general use + class ScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the ScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream() const { return *(stream_.get()); } + const SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). + + template + ScopedContextBase(T_Acc acc, const ProductBase& data) : currentDevice_(data.device()) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaSetDevice(currentDevice_); +#endif + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = getStreamCache().get(acc); + } + } + + explicit ScopedContextBase(int device, SharedStreamPtr stream) + : currentDevice_(device), stream_(std::move(stream)) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaSetDevice(currentDevice_); +#endif + } + + template + explicit ScopedContextBase(T_Acc acc, edm::StreamID streamID) + : currentDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::chooseDevice(streamID)) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaSetDevice(currentDevice_); +#endif + stream_ = getStreamCache().get(acc); + } + + private: + int currentDevice_; + SharedStreamPtr stream_; + }; + + class ScopedContextGetterBase : public ScopedContextBase { + public: + template + const T& get(const Product& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) 
{} + + void synchronizeStreams(int dataDevice, + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, + bool available, + alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent); + }; + + class ScopedContextHolderHelper { + public: + ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, ContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; + } // namespace impl + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextAcquire : public impl::ScopedContextGetterBase { + public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + template + explicit ScopedContextAcquire(T_Acc acc, edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(acc, streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + // /// Constructor to create a new CUDA stream, and the context is needed after acquire() + template + explicit ScopedContextAcquire(T_Acc acc, + edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(acc, streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + // /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + template + explicit ScopedContextAcquire(T_Acc acc, const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)} {} + + // /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + template + explicit ScopedContextAcquire(T_Acc acc, + const ProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + ContextState& state) + : ScopedContextGetterBase(acc, data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~ScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + void throwNoState(); + + impl::ScopedContextHolderHelper holderHelper_; + ContextState* contextState_ = nullptr; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ + class ScopedContextProduce : public impl::ScopedContextGetterBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextProduce(ContextState& state) + : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + template + explicit ScopedContextProduce(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + + template + explicit ScopedContextProduce(T_Acc acc, edm::StreamID streamID) : ScopedContextGetterBase(acc, streamID) {} + + /// Record the CUDA event, all asynchronous work must have been queued before the destructor + ~ScopedContextProduce(); + + template + std::unique_ptr> wrap(T_Acc acc, T data) { + // make_unique doesn't work because of private constructor + return std::unique_ptr>(new Product(device(), streamPtr(), getEvent(acc), std::move(data))); + } + + template + auto emplace(T_Acc acc, edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { + // return iEvent.emplace(token, device(), streamPtr(), getEvent(acc), std::forward(args)...); + return iEvent.emplace(token, std::forward(args)...); + // TODO + } + + private: + friend class ::ALPAKA_ACCELERATOR_NAMESPACE::cms::alpakatest::TestScopedContext; + + explicit ScopedContextProduce(int device, SharedStreamPtr stream) + : ScopedContextGetterBase(device, std::move(stream)) {} + + template + auto getEvent(T_Acc acc) { + return getEventCache().get(acc); + } + + // create the CUDA Event upfront to catch possible errors from its creation + }; + + /** + * The aim of this class is to do necessary per-task "initialization" tasks created in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * and enforce that those get done in a proper way in RAII fashion. + */ + class ScopedContextTask : public impl::ScopedContextBase { + public: + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards + holderHelper_{std::move(waitingTaskHolder)}, + contextState_{state} {} + + ~ScopedContextTask(); + + template + void pushNextTask(F&& f) { + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + + private: + impl::ScopedContextHolderHelper holderHelper_; + ContextState const* contextState_; + }; + + /** + * The aim of this class is to do necessary per-event "initialization" in analyze() + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ + class ScopedContextAnalyze : public impl::ScopedContextGetterBase { + public: + /// Constructor to (possibly) re-use a CUDA stream + template + explicit ScopedContextAnalyze(T_Acc acc, const ProductBase& data) : ScopedContextGetterBase(acc, data) {} + }; + + namespace impl { + template + void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); + } + } // namespace impl + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaCore_ScopedContext_h diff --git a/src/alpaka/AlpakaCore/ScopedSetDevice.h b/src/alpaka/AlpakaCore/ScopedSetDevice.h new file mode 100644 index 000000000..055a9b8f5 --- /dev/null +++ b/src/alpaka/AlpakaCore/ScopedSetDevice.h @@ -0,0 +1,32 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_ScopedSetDevice_h +#define HeterogeneousCore_AlpakaUtilities_ScopedSetDevice_h + +#include + +namespace cms::alpakatools { + + class ScopedSetDevice { + public: + explicit ScopedSetDevice(int newDevice) { +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaGetDevice(&prevDevice_); + cudaSetDevice(newDevice); +#endif + } + + ~ScopedSetDevice() { + // Intentionally don't check the return value to avoid + // exceptions to be thrown. If this call fails, the process is + // doomed anyway. +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaSetDevice(prevDevice_); +#endif + } + + private: + int prevDevice_; + }; + +} // namespace cms::alpakatools + +#endif diff --git a/src/alpaka/AlpakaCore/SharedEventPtr.h b/src/alpaka/AlpakaCore/SharedEventPtr.h new file mode 100644 index 000000000..3582a928c --- /dev/null +++ b/src/alpaka/AlpakaCore/SharedEventPtr.h @@ -0,0 +1,15 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h +#define HeterogeneousCore_AlpakaUtilities_SharedEventPtr_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + using SharedEventPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Event>; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/src/alpaka/AlpakaCore/SharedStreamPtr.h b/src/alpaka/AlpakaCore/SharedStreamPtr.h new file mode 100644 index 000000000..38909af5a --- /dev/null +++ b/src/alpaka/AlpakaCore/SharedStreamPtr.h @@ -0,0 +1,15 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h +#define HeterogeneousCore_AlpakaUtilities_SharedStreamPtr_h + +#include +#include + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + using SharedStreamPtr = std::shared_ptr<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>; + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/src/alpaka/AlpakaCore/SimpleVector.h b/src/alpaka/AlpakaCore/SimpleVector.h index 6ee32f590..4dcc82d6b 100644 --- a/src/alpaka/AlpakaCore/SimpleVector.h +++ b/src/alpaka/AlpakaCore/SimpleVector.h @@ -8,137 +8,135 @@ #include "AlpakaCore/alpakaConfig.h" -namespace cms { - namespace alpakatools { - - template - struct SimpleVector { - constexpr SimpleVector() = default; - - // ownership of m_data stays within the caller - constexpr void construct(int capacity, T *data) { - m_size = 0; - m_capacity = capacity; - m_data = data; - } +namespace cms::alpakatools { - inline constexpr 
int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } - } + template + struct SimpleVector { + constexpr SimpleVector() = default; - template - constexpr int emplace_back_unsafe(Ts &&...args) { - auto previousSize = m_size; - m_size++; - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } - } - - ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; } + // ownership of m_data stays within the caller + constexpr void construct(int capacity, T *data) { + m_size = 0; + m_capacity = capacity; + m_data = data; + } - ALPAKA_FN_ACC inline const T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; } + } - // thread-safe version of the vector, when used in a CUDA kernel - template - ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { - auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - if (previousSize < m_capacity) { - m_data[previousSize] = element; - return previousSize; - } else { - alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - return -1; - } + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; } + } - template - ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { - auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - if (previousSize < m_capacity) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - return -1; - } - } + ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; } - // thread safe version of resize - template - ALPAKA_FN_ACC int extend(const T_Acc &acc, int size = 1) { - auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); - if (previousSize < m_capacity) { - return previousSize; - } else { - alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); - return -1; - } - } + ALPAKA_FN_ACC inline const T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } - template - ALPAKA_FN_ACC int shrink(const T_Acc &acc, int size = 1) { - auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); - if (previousSize >= size) { - return previousSize - size; - } else { - alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); - return -1; - } + // thread-safe version of the vector, when used in a CUDA kernel + template + ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; } + } - inline constexpr 
bool empty() const { return m_size <= 0; } - inline constexpr bool full() const { return m_size >= m_capacity; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline constexpr int size() const { return m_size; } - inline constexpr int capacity() const { return m_capacity; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr void set_data(T *data) { m_data = data; } - - private: - int m_size; - int m_capacity; - - T *m_data; - }; + template + ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; + } + } - // ownership of m_data stays within the caller - template - SimpleVector make_SimpleVector(int capacity, T *data) { - SimpleVector ret; - ret.construct(capacity, data); - return ret; + // thread safe version of resize + template + ALPAKA_FN_ACC int extend(const T_Acc &acc, int size = 1) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize < m_capacity) { + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } } - // ownership of m_data stays within the caller - template - SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { - auto ret = new (mem) SimpleVector(); - ret->construct(capacity, data); - return ret; + template + ALPAKA_FN_ACC int shrink(const T_Acc &acc, int size = 1) { + auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + if (previousSize >= size) { + return previousSize - size; + } else { + alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{}); + return -1; + } } - } // namespace alpakatools -} // namespace cms + inline constexpr bool empty() const { return m_size <= 0; } + inline constexpr bool full() const { return m_size >= m_capacity; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline constexpr int size() const { return m_size; } + inline constexpr int capacity() const { return m_capacity; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr void set_data(T *data) { m_data = data; } + + private: + int m_size; + int m_capacity; + + T *m_data; + }; + + // ownership of m_data stays within the caller + template + SimpleVector make_SimpleVector(int capacity, T *data) { + SimpleVector ret; + ret.construct(capacity, data); + return ret; + } + + // ownership of m_data stays within the caller + template + SimpleVector *make_SimpleVector(SimpleVector *mem, int capacity, T *data) { + auto ret = new (mem) SimpleVector(); + ret->construct(capacity, data); + return ret; + } + +} // namespace cms::alpakatools #endif // HeterogeneousCore_CUDAUtilities_interface_SimpleVector_h diff --git a/src/alpaka/AlpakaCore/StreamCache.h b/src/alpaka/AlpakaCore/StreamCache.h new file mode 100644 index 000000000..0cd3344f4 --- /dev/null +++ 
b/src/alpaka/AlpakaCore/StreamCache.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_StreamCache_h +#define HeterogeneousCore_AlpakaUtilities_StreamCache_h + +#include + +#include + +#include "AlpakaCore/ScopedSetDevice.h" +#include "AlpakaCore/SharedStreamPtr.h" +#include "AlpakaCore/currentDevice.h" +#include "AlpakaCore/deviceCount.h" +#include "Framework/ReusableObjectHolder.h" + +class CUDAService; + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + class StreamCache { + public: + StreamCache(); + + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + // This function is thread safe + template + ALPAKA_FN_HOST SharedStreamPtr get(T_Acc acc) { + const auto dev = currentDevice(); + return cache_[dev].makeOrGet( + [dev, acc]() { return std::make_unique<::ALPAKA_ACCELERATOR_NAMESPACE::Queue>(acc); }); + } + + private: + friend class ::CUDAService; + // not thread safe, intended to be called only from CUDAService destructor + void clear(); + + std::vector> cache_; + }; + + // Gets the global instance of a StreamCache + // This function is thread safe + StreamCache& getStreamCache(); + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaUtilities_StreamCache_h diff --git a/src/alpaka/AlpakaCore/VecArray.h b/src/alpaka/AlpakaCore/VecArray.h index d36a77096..8fb8f3601 100644 --- a/src/alpaka/AlpakaCore/VecArray.h +++ b/src/alpaka/AlpakaCore/VecArray.h @@ -7,101 +7,99 @@ #include "AlpakaCore/alpakaConfig.h" -namespace cms { - namespace alpakatools { +namespace cms::alpakatools { - template - class VecArray { - public: - using self = VecArray; - using value_t = T; + template + class VecArray { + public: + using self = VecArray; + using value_t = T; - inline constexpr int push_back_unsafe(const T &element) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - --m_size; - return -1; - } + inline constexpr int push_back_unsafe(const T &element) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + --m_size; + return -1; } + } - template - constexpr int emplace_back_unsafe(Ts &&...args) { - auto previousSize = m_size; - m_size++; - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - --m_size; - return -1; - } + template + constexpr int emplace_back_unsafe(Ts &&...args) { + auto previousSize = m_size; + m_size++; + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + --m_size; + return -1; } + } - inline constexpr T &back() const { - if (m_size > 0) { - return m_data[m_size - 1]; - } else - return T(); //undefined behaviour - } + inline constexpr T &back() const { + if (m_size > 0) { + return m_data[m_size - 1]; + } else + return T(); //undefined behaviour + } - // thread-safe version of the vector, when used in a CUDA kernel - template - ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { - auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - if (previousSize < maxSize) { - m_data[previousSize] = element; - return previousSize; - } else { - alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - return -1; - } + // thread-safe version of the vector, when used in a 
CUDA kernel + template + ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + m_data[previousSize] = element; + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; } + } - template - ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { - auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - if (previousSize < maxSize) { - (new (&m_data[previousSize]) T(std::forward(args)...)); - return previousSize; - } else { - alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); - return -1; - } + template + ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { + auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + if (previousSize < maxSize) { + (new (&m_data[previousSize]) T(std::forward(args)...)); + return previousSize; + } else { + alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); + return -1; } + } - inline constexpr T pop_back() { - if (m_size > 0) { - auto previousSize = m_size--; - return m_data[previousSize - 1]; - } else - return T(); - } + inline constexpr T pop_back() { + if (m_size > 0) { + auto previousSize = m_size--; + return m_data[previousSize - 1]; + } else + return T(); + } - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } - inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } - inline constexpr void reset() { m_size = 0; } - inline static constexpr int capacity() { return maxSize; } - inline constexpr T const *data() const { return m_data; } - inline constexpr void resize(int size) { m_size = size; } - inline constexpr bool empty() const { return 0 == m_size; } - inline constexpr bool full() const { return maxSize == m_size; } + inline constexpr T const *begin() const { return m_data; } + inline constexpr T const *end() const { return m_data + m_size; } + inline constexpr T *begin() { return m_data; } + inline constexpr T *end() { return m_data + m_size; } + inline constexpr int size() const { return m_size; } + inline constexpr T &operator[](int i) { return m_data[i]; } + inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr void reset() { m_size = 0; } + inline static constexpr int capacity() { return maxSize; } + inline constexpr T const *data() const { return m_data; } + inline constexpr void resize(int size) { m_size = size; } + inline constexpr bool empty() const { return 0 == m_size; } + inline constexpr bool full() const { return maxSize == m_size; } - private: - T m_data[maxSize]; + private: + T m_data[maxSize]; - int m_size; - }; + int m_size; + }; - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools #endif // HeterogeneousCore_CUDAUtilities_interface_VecArray_h diff --git a/src/alpaka/AlpakaCore/alpaka/ContextState.cc b/src/alpaka/AlpakaCore/alpaka/ContextState.cc new file mode 100644 index 000000000..553289692 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/ContextState.cc @@ -0,0 +1,19 @@ +#include + +#include "AlpakaCore/ContextState.h" + +namespace 
cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + void ContextState::throwIfStream() const { + if (stream_) { + throw std::runtime_error("Trying to set ContextState, but it already had a valid state"); + } + } + + void ContextState::throwIfNoStream() const { + if (not stream_) { + throw std::runtime_error("Trying to get ContextState, but it did not have a valid state"); + } + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/EventCache.cc b/src/alpaka/AlpakaCore/alpaka/EventCache.cc new file mode 100644 index 000000000..a08a9d4d2 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/EventCache.cc @@ -0,0 +1,26 @@ +#include "AlpakaCore/EventCache.h" +#include "AlpakaCore/ScopedSetDevice.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + // EventCache should be constructed by the first call to + // getEventCache() only if we have CUDA devices present + EventCache::EventCache() : cache_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) {} + + void EventCache::clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // EventCache lives through multiple tests (and go through + // multiple shutdowns of the framework). + cache_.clear(); + cache_.resize(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()); + } + + EventCache& getEventCache() { + // the public interface is thread safe + static EventCache cache; + return cache; + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/ProductBase.cc b/src/alpaka/AlpakaCore/alpaka/ProductBase.cc new file mode 100644 index 000000000..9470cb732 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/ProductBase.cc @@ -0,0 +1,31 @@ +#include "AlpakaCore/ProductBase.h" +#include "AlpakaCore/eventWorkHasCompleted.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + bool ProductBase::isAvailable() const { + // if default-constructed, the product is not available + if (not event_) { + return false; + } + return eventWorkHasCompleted(*(event_.get())); + } + + ProductBase::~ProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. 
+ if (event_) { + alpaka::wait(*(event_.get())); + } + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc new file mode 100644 index 000000000..52967f959 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/ScopedContext.cc @@ -0,0 +1,78 @@ +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/ScopedContext.h" + +namespace { + struct CallbackData { + edm::WaitingTaskWithArenaHolder holder; + int device; + }; +} // namespace + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + namespace impl { + void ScopedContextGetterBase::synchronizeStreams(int dataDevice, + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& dataStream, + bool available, + alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw std::runtime_error("Handling data from multiple devices is not yet supported"); + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + alpaka::wait(stream(), dataEvent); + } + } + } + + void ScopedContextHolderHelper::enqueueCallback(int device, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& stream) { + alpaka::enqueue(stream, [this, device]() { + auto data = new CallbackData{waitingTaskHolder_, device}; + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device2 = guard->device; + waitingTaskHolder.doneWaiting(nullptr); + }); + } + } // namespace impl + + //////////////////// + + ScopedContextAcquire::~ScopedContextAcquire() { + holderHelper_.enqueueCallback(device(), stream()); + if (contextState_) { + contextState_->set(device(), streamPtr()); + } + } + + void ScopedContextAcquire::throwNoState() { + throw std::runtime_error( + "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with " + "ContextState, but that was not the case"); + } + + //////////////////// + + ScopedContextProduce::~ScopedContextProduce() { + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. 
+ //cudaEventRecord(event_.get(), stream()); + //alpaka::enqueue(stream(), getEvent(::ALPAKA_ACCELERATOR_NAMESPACE::Device).get()); + //TODO + } + + //////////////////// + + ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/StreamCache.cc b/src/alpaka/AlpakaCore/alpaka/StreamCache.cc new file mode 100644 index 000000000..d722528de --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/StreamCache.cc @@ -0,0 +1,25 @@ +#include "AlpakaCore/StreamCache.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + // StreamCache should be constructed by the first call to + // getStreamCache() only if we have CUDA devices present + StreamCache::StreamCache() : cache_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) {} + + void StreamCache::clear() { + // Reset the contents of the caches, but leave an + // edm::ReusableObjectHolder alive for each device. This is needed + // mostly for the unit tests, where the function-static + // StreamCache lives through multiple tests (and go through + // multiple shutdowns of the framework). + cache_.clear(); + cache_.resize(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()); + } + + StreamCache& getStreamCache() { + // the public interface is thread safe + static StreamCache cache; + return cache; + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc new file mode 100644 index 000000000..d7296fd41 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/chooseDevice.cc @@ -0,0 +1,17 @@ +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/chooseDevice.h" +#include "AlpakaCore/deviceCount.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + int chooseDevice(edm::StreamID id) { + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
+ + // TODO: improve the "assignment" logic + return id % ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount(); + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpaka/deviceCount.cc b/src/alpaka/AlpakaCore/alpaka/deviceCount.cc new file mode 100644 index 000000000..001b1e817 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/deviceCount.cc @@ -0,0 +1,14 @@ +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/deviceCount.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + int deviceCount() { + int ndevices = 1; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + ndevices = alpaka::getDevCount<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(); +#endif + return ndevices; + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/AlpakaCore/alpakaConfigAcc.h b/src/alpaka/AlpakaCore/alpakaConfigAcc.h index db6a8ed2d..e0da1cf1b 100644 --- a/src/alpaka/AlpakaCore/alpakaConfigAcc.h +++ b/src/alpaka/AlpakaCore/alpakaConfigAcc.h @@ -92,11 +92,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { using Device = alpaka::Dev; using Platform = alpaka::Pltf; static_assert(std::is_same_v>, - STRINGIFY(alpaka::Dev) " and " STRINGIFY( - alpaka::Dev) " are different types."); + STRINGIFY(alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>) " and " STRINGIFY( + alpaka::Dev<::ALPAKA_ACCELERATOR_NAMESPACE::Acc2D>) " are different types."); static_assert(std::is_same_v>>, - STRINGIFY(alpaka::Pltf>) " and " STRINGIFY( - alpaka::Pltf>) " are different types."); + STRINGIFY(alpaka::Pltf>) " and " STRINGIFY( + alpaka::Pltf>) " are different types."); + + using Event = alpaka::Event; template using AlpakaAccBuf1D = alpaka::Buf; diff --git a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h index fd3050b3e..19f9c308f 100644 --- a/src/alpaka/AlpakaCore/alpakaMemoryHelper.h +++ b/src/alpaka/AlpakaCore/alpakaMemoryHelper.h @@ -1,42 +1,40 @@ -#ifndef ALPAKAMEMORYHELPER_H -#define ALPAKAMEMORYHELPER_H +#ifndef AlpakaCore_alpakaMemoryHelper_h +#define AlpakaCore_alpakaMemoryHelper_h #include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/alpakaDevices.h" using namespace alpaka_common; -namespace cms { - namespace alpakatools { +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - template - auto allocHostBuf(const Extent& extent) { - return alpaka::allocBuf(host, extent); - } + template + auto allocHostBuf(const Extent& extent) { + return alpaka::allocBuf(host, extent); + } - template - auto createHostView(TData* data, const Extent& extent) { - return alpaka::ViewPlainPtr(data, host, extent); - } + template + auto createHostView(TData* data, const Extent& extent) { + return alpaka::ViewPlainPtr(data, host, extent); + } - template - auto allocDeviceBuf(const Extent& extent) { - return alpaka::allocBuf(ALPAKA_ACCELERATOR_NAMESPACE::device, extent); - } + template + auto allocDeviceBuf(const Extent& extent) { + return alpaka::allocBuf(::ALPAKA_ACCELERATOR_NAMESPACE::device, extent); + } - template - auto createDeviceView(const TData* data, const Extent& extent) { - return alpaka::ViewPlainPtr( - data, ALPAKA_ACCELERATOR_NAMESPACE::device, extent); - } + template + auto createDeviceView(const TData* data, const Extent& extent) { + return alpaka::ViewPlainPtr<::ALPAKA_ACCELERATOR_NAMESPACE::Device, const TData, Dim1D, Idx>( + data, ::ALPAKA_ACCELERATOR_NAMESPACE::device, extent); + } - template - auto createDeviceView(TData* data, const Extent& extent) { - return alpaka::ViewPlainPtr( - 
data, ALPAKA_ACCELERATOR_NAMESPACE::device, extent); - } + template + auto createDeviceView(TData* data, const Extent& extent) { + return alpaka::ViewPlainPtr<::ALPAKA_ACCELERATOR_NAMESPACE::Device, TData, Dim1D, Idx>( + data, ::ALPAKA_ACCELERATOR_NAMESPACE::device, extent); + } - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE -#endif // ALPAKAMEMORYHELPER_H +#endif // AlpakaCore_alpakaMemoryHelper_h diff --git a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h index f10d694fd..1bcec521a 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDivHelper.h @@ -1,302 +1,306 @@ -#ifndef ALPAKAWORKDIVHELPER_H -#define ALPAKAWORKDIVHELPER_H +#ifndef AlpakaCore_alpakaWorkDivHelper_h +#define AlpakaCore_alpakaWorkDivHelper_h #include "AlpakaCore/alpakaConfig.h" using namespace alpaka_common; -namespace cms { - namespace alpakatools { +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - /********************************************* + /********************************************* * WORKDIV CREATION ********************************************/ - /* + /* * Creates the accelerator-dependent workdiv. */ - template - WorkDiv make_workdiv(const Vec& blocksPerGrid, const Vec& threadsPerBlockOrElementsPerThread) { - // On the GPU: - // threadsPerBlockOrElementsPerThread is the number of threads per block. - // Each thread is looking at a single element: elementsPerThread is always 1. + template + WorkDiv make_workdiv(const Vec& blocksPerGrid, const Vec& threadsPerBlockOrElementsPerThread) { + // On the GPU: + // threadsPerBlockOrElementsPerThread is the number of threads per block. + // Each thread is looking at a single element: elementsPerThread is always 1. #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - const Vec& elementsPerThread = Vec::ones(); - return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + const Vec& elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); #else - // On the CPU: - // Run serially with a single thread per block: threadsPerBlock is always 1. - // threadsPerBlockOrElementsPerThread is the number of elements per thread. - const Vec& threadsPerBlock = Vec::ones(); - return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + // On the CPU: + // Run serially with a single thread per block: threadsPerBlock is always 1. + // threadsPerBlockOrElementsPerThread is the number of elements per thread. + const Vec& threadsPerBlock = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); #endif - } + } - /********************************************* + /********************************************* * RANGE COMPUTATION ********************************************/ - /* + /* * Computes the range of the elements indexes, local to the block. * Warning: the max index is not truncated by the max number of elements of interest. */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, - const Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Take into account the thread index in block. - const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); - const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Compute the elements indexes in block. - // Obviously relevant for CPU only. 
- // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. - const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; - const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! - const Idx endElementIdxUncut = firstElementIdx + threadDimension; - - // Return element indexes, shifted by elementIdxShift. - return {firstElementIdx, endElementIdxUncut}; - } - - /* + template + ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the thread index in block. + const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); + const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Compute the elements indexes in block. + // Obviously relevant for CPU only. + // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift. + const Idx firstElementIdxLocal = threadIdxLocal * threadDimension; + const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift; // Add the shift! + const Idx endElementIdxUncut = firstElementIdx + threadDimension; + + // Return element indexes, shifted by elementIdxShift. + return {firstElementIdx, endElementIdxUncut}; + } + + /* * Computes the range of the elements indexes, local to the block. * Truncated by the max number of elements of interest. */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Check dimension - //static_assert(alpaka::Dim::value == Dim1D::value, - // "Accelerator and maxNumberOfElements need to have same dimension."); - auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); - - // Truncate - endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); - - // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. - return {firstElementIdxLocal, endElementIdxLocal}; - } - - /* + template + ALPAKA_FN_ACC std::pair element_index_range_in_block_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(alpaka::Dim::value == Dim1D::value, + // "Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxLocal, endElementIdxLocal}; + } + + /* * Computes the range of the elements indexes in grid. * Warning: the max index is not truncated by the max number of elements of interest. */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, - Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Take into account the block index in grid. - const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Shift to get global indices in grid (instead of local to the block) - elementIdxShift += blockIdxInGrid * blockDimension; - - // Return element indexes, shifted by elementIdxShift. 
- return element_index_range_in_block(acc, elementIdxShift, dimIndex); - } - - /* + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Take into account the block index in grid. + const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Shift to get global indices in grid (instead of local to the block) + elementIdxShift += blockIdxInGrid * blockDimension; + + // Return element indexes, shifted by elementIdxShift. + return element_index_range_in_block(acc, elementIdxShift, dimIndex); + } + + /* * Computes the range of the elements indexes in grid. * Truncated by the max number of elements of interest. */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, - const Idx maxNumberOfElements, - Idx elementIdxShift, - const unsigned int dimIndex = 0u) { - // Check dimension - //static_assert(dimIndex <= alpaka::Dim::value, - //"Accelerator and maxNumberOfElements need to have same dimension."); - auto [firstElementIdxGlobal, endElementIdxGlobal] = element_index_range_in_grid(acc, elementIdxShift, dimIndex); - - // Truncate - endElementIdxGlobal = std::min(endElementIdxGlobal, maxNumberOfElements); - - // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. - return {firstElementIdxGlobal, endElementIdxGlobal}; - } - - /* + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { + // Check dimension + //static_assert(dimIndex <= alpaka::Dim::value, + //"Accelerator and maxNumberOfElements need to have same dimension."); + auto [firstElementIdxGlobal, endElementIdxGlobal] = element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Truncate + endElementIdxGlobal = std::min(endElementIdxGlobal, maxNumberOfElements); + + // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements. + return {firstElementIdxGlobal, endElementIdxGlobal}; + } + + /* * Computes the range of the element(s) index(es) in grid. * Truncated by the max number of elements of interest. */ - template - ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, - const Idx maxNumberOfElements, - const unsigned int dimIndex = 0u) { - Idx elementIdxShift = 0u; - return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); - } - - /********************************************* + template + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, + const Idx maxNumberOfElements, + const unsigned int dimIndex = 0u) { + Idx elementIdxShift = 0u; + return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + } + + /********************************************* * LOOP ON ALL ELEMENTS ********************************************/ - /* + /* * Loop on all (CPU) elements. * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. * Indexes are local to the BLOCK. 
*/ - template - ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - const auto& [firstElementIdx, endElementIdx] = - cms::alpakatools::element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); - - for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { - func(elementIdx); - } + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + const auto& [firstElementIdx, endElementIdx] = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block_truncated( + acc, maxNumberOfElements, elementIdxShift, dimIndex); + + for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { + func(elementIdx); } + } - /* + /* * Overload for elementIdxShift = 0 */ - template - ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - cms::alpakatools::for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /* + template + ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* * Loop on all (CPU) elements. * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. * Indexes are expressed in GRID 'frame-of-reference'. */ - template - ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, - const Idx maxNumberOfElements, - Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - // Take into account the block index in grid to compute the element indices. - const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - elementIdxShift += blockIdxInGrid * blockDimension; - - for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /* + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Take into account the block index in grid to compute the element indices. 
+ const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + elementIdxShift += blockIdxInGrid * blockDimension; + + for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* * Overload for elementIdxShift = 0 */ - template - ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - cms::alpakatools::for_each_element_in_grid(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /************************************************************** + template + ALPAKA_FN_ACC void for_each_element_in_grid(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** * LOOP ON ALL ELEMENTS, WITH STRIDED ACCESS **************************************************************/ - /* + /* * (CPU) Loop on all elements + (CPU/GPU) Strided access. * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. * Stride to full problem size, by BLOCK size. * Indexes are local to the BLOCK. */ - template - ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - // Get thread / element indices in block. - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, elementIdxShift, dimIndex); - - // Stride = block size. - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Strided access. - for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; - threadIdx < maxNumberOfElements; - threadIdx += blockDimension, endElementIdx += blockDimension) { - // (CPU) Loop on all elements. - if (endElementIdx > maxNumberOfElements) { - endElementIdx = maxNumberOfElements; - } - for (Idx i = threadIdx; i < endElementIdx; ++i) { - func(i); - } + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, elementIdxShift, dimIndex); + + // Stride = block size. + const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += blockDimension, endElementIdx += blockDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); } } + } - /* + /* * Overload for elementIdxShift = 0 */ - template - ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - cms::alpakatools::for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /* + template + ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /* * (CPU) Loop on all elements + (CPU/GPU) Strided access. * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift. * Stride to full problem size, by GRID size. * Indexes are local to the GRID. */ - template - ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Idx elementIdxShift, - const Func func, - const unsigned int dimIndex = 0) { - // Get thread / element indices in block. - const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_grid(acc, elementIdxShift, dimIndex); - - // Stride = grid size. - const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); - - // Strided access. - for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; - threadIdx < maxNumberOfElements; - threadIdx += gridDimension, endElementIdx += gridDimension) { - // (CPU) Loop on all elements. - if (endElementIdx > maxNumberOfElements) { - endElementIdx = maxNumberOfElements; - } - for (Idx i = threadIdx; i < endElementIdx; ++i) { - func(i); - } + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Idx elementIdxShift, + const Func func, + const unsigned int dimIndex = 0) { + // Get thread / element indices in block. + const auto& [firstElementIdxNoStride, endElementIdxNoStride] = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_grid(acc, elementIdxShift, dimIndex); + + // Stride = grid size. + const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); + + // Strided access. + for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; + threadIdx < maxNumberOfElements; + threadIdx += gridDimension, endElementIdx += gridDimension) { + // (CPU) Loop on all elements. 
+ if (endElementIdx > maxNumberOfElements) { + endElementIdx = maxNumberOfElements; + } + for (Idx i = threadIdx; i < endElementIdx; ++i) { + func(i); } } + } - /* + /* * Overload for elementIdxShift = 0 */ - template - ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, - const Idx maxNumberOfElements, - const Func func, - const unsigned int dimIndex = 0) { - const Idx elementIdxShift = 0; - cms::alpakatools::for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); - } - - /************************************************************** + template + ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc, + const Idx maxNumberOfElements, + const Func func, + const unsigned int dimIndex = 0) { + const Idx elementIdxShift = 0; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + } + + /************************************************************** * LOOP ON ALL ELEMENTS WITH ONE LOOP **************************************************************/ - /* + /* * Case where the input index i has reached the end of threadDimension: strides the input index. * Otherwise: do nothing. * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops @@ -306,21 +310,20 @@ namespace cms { * and hence avoids a lot of legacy code reshuffling. * NB 2: Modifies i, firstElementIdx and endElementIdx. */ - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( - Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { - bool isNextStrideElementValid = true; - if (i == endElementIdx) { - firstElementIdx += stride; - endElementIdx += stride; - i = firstElementIdx; - if (i >= maxNumberOfElements) { - isNextStrideElementValid = false; - } + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( + Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { + bool isNextStrideElementValid = true; + if (i == endElementIdx) { + firstElementIdx += stride; + endElementIdx += stride; + i = firstElementIdx; + if (i >= maxNumberOfElements) { + isNextStrideElementValid = false; } - return isNextStrideElementValid; } + return isNextStrideElementValid; + } - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE -#endif // ALPAKAWORKDIVHELPER_H +#endif // AlpakaCore_alpakaWorkDivHelper_h diff --git a/src/alpaka/AlpakaCore/chooseDevice.h b/src/alpaka/AlpakaCore/chooseDevice.h new file mode 100644 index 000000000..9580e4439 --- /dev/null +++ b/src/alpaka/AlpakaCore/chooseDevice.h @@ -0,0 +1,13 @@ +#ifndef HeterogeneousCore_AlpakaCore_chooseDevice_h +#define HeterogeneousCore_AlpakaCore_chooseDevice_h + +#include "AlpakaCore/alpakaConfig.h" +#include "Framework/Event.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + int chooseDevice(edm::StreamID id); + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaCore_chooseDevice_h diff --git a/src/alpaka/AlpakaCore/currentDevice.h b/src/alpaka/AlpakaCore/currentDevice.h new file mode 100644 index 000000000..8141214a9 --- /dev/null +++ b/src/alpaka/AlpakaCore/currentDevice.h @@ -0,0 +1,22 @@ +#ifndef HeterogenousCore_AlpakaUtilities_currentDevice_h +#define HeterogenousCore_AlpakaUtilities_currentDevice_h + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#include 
+#endif + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + inline int currentDevice() { + int dev = 0; +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + cudaGetDevice(&dev); +#endif + return dev; + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/src/alpaka/AlpakaCore/deviceCount.h b/src/alpaka/AlpakaCore/deviceCount.h new file mode 100644 index 000000000..69da217ef --- /dev/null +++ b/src/alpaka/AlpakaCore/deviceCount.h @@ -0,0 +1,12 @@ +#ifndef HeterogenousCore_AlpakaUtilities_deviceCount_h +#define HeterogenousCore_AlpakaUtilities_deviceCount_h + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + int deviceCount(); + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogenousCore_AlpakaUtilities_deviceCount_h diff --git a/src/alpaka/AlpakaCore/eventWorkHasCompleted.h b/src/alpaka/AlpakaCore/eventWorkHasCompleted.h new file mode 100644 index 000000000..8001ff4e7 --- /dev/null +++ b/src/alpaka/AlpakaCore/eventWorkHasCompleted.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_AlpakaUtilities_eventWorkHasCompleted_h +#define HeterogeneousCore_AlpakaUtilities_eventWorkHasCompleted_h + +#include "AlpakaCore/alpakaConfig.h" + +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { + + /** + * Returns true if the work captured by the event (=queued to the + * CUDA stream at the point of cudaEventRecord()) has completed. + * + * Returns false if any captured work is incomplete. + * + * In case of errors, throws an exception. + */ + + inline bool eventWorkHasCompleted(alpaka::Event<::ALPAKA_ACCELERATOR_NAMESPACE::Queue> event) { + return alpaka::isComplete(event); + } + +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE + +#endif // HeterogeneousCore_AlpakaUtilities_eventWorkHasCompleted_h diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index a23a2e342..56ba0a8c3 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -4,6 +4,7 @@ #include #include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/threadfence.h" #include "Framework/CMSUnrollLoop.h" #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -33,200 +34,165 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, u c[i] = x; } -#endif +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +namespace cms::alpakatools { -namespace cms { - namespace alpakatools { - // limited to 32*32 elements.... - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const T_Acc& acc, - T const* ci, - T* co, - uint32_t size, - T* ws + // limited to 32*32 elements.... 
+ template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const T_Acc& acc, + T const* ci, + T* co, + uint32_t size, + T* ws #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED - = nullptr + = nullptr #endif - ) { + ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); - ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); - auto first = blockThreadIdx; - auto mask = __ballot_sync(0xffffffff, first < size); - auto laneId = blockThreadIdx & 0x1f; - - for (auto i = first; i < size; i += blockDimension) { - warpPrefixScan(laneId, ci, co, i, mask); - auto warpId = i / 32; - ALPAKA_ASSERT_OFFLOAD(warpId < 32); - if (31 == laneId) - ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDimension < size); - } - alpaka::syncBlockThreads(acc); - if (size <= 32) - return; - if (blockThreadIdx < 32) { - warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); - } - alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { - uint32_t warpId = i / 32; - co[i] += ws[warpId - 1]; - } - alpaka::syncBlockThreads(acc); - + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= 1024); + ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32); + auto first = blockThreadIdx; + auto mask = __ballot_sync(0xffffffff, first < size); + auto laneId = blockThreadIdx & 0x1f; + + for (auto i = first; i < size; i += blockDim) { + warpPrefixScan(laneId, ci, co, i, mask); + auto warpId = i / 32; + ALPAKA_ASSERT_OFFLOAD(warpId < 32); + if (31 == laneId) + ws[warpId] = co[i]; + mask = __ballot_sync(mask, i + blockDim < size); + } + alpaka::syncBlockThreads(acc); + if (size <= 32) + return; + if (blockThreadIdx < 32) { + warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + 32; i < size; i += blockDim) { + uint32_t warpId = i / 32; + co[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); #else - co[0] = ci[0]; - for (uint32_t i = 1; i < size; ++i) - co[i] = ci[i] + co[i - 1]; + co[0] = ci[0]; + for (uint32_t i = 1; i < size; ++i) + co[i] = ci[i] + co[i - 1]; #endif - } + } - template - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const T_Acc& acc, - T* __restrict__ c, - uint32_t size, - T* __restrict__ ws + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const T_Acc& acc, + T* __restrict__ c, + uint32_t size, + T* __restrict__ ws #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED - = nullptr + = nullptr #endif - ) { + ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); - ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); - auto first = blockThreadIdx; - auto mask = __ballot_sync(0xffffffff, first < size); - auto laneId = blockThreadIdx & 0x1f; - - for (auto i = first; i < size; i += blockDimension) { - warpPrefixScan(laneId, c, i, mask); - auto warpId = i / 32; - ALPAKA_ASSERT_OFFLOAD(warpId < 32); - if (31 == laneId) - ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDimension < 
size); - } - alpaka::syncBlockThreads(acc); - if (size <= 32) - return; - if (blockThreadIdx < 32) { - warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); - } - alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { - auto warpId = i / 32; - c[i] += ws[warpId - 1]; - } - alpaka::syncBlockThreads(acc); + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); + ALPAKA_ASSERT_OFFLOAD(size <= 1024); + ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32); + auto first = blockThreadIdx; + auto mask = __ballot_sync(0xffffffff, first < size); + auto laneId = blockThreadIdx & 0x1f; + + for (auto i = first; i < size; i += blockDim) { + warpPrefixScan(laneId, c, i, mask); + auto warpId = i / 32; + ALPAKA_ASSERT_OFFLOAD(warpId < 32); + if (31 == laneId) + ws[warpId] = c[i]; + mask = __ballot_sync(mask, i + blockDim < size); + } + alpaka::syncBlockThreads(acc); + if (size <= 32) + return; + if (blockThreadIdx < 32) { + warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); + } + alpaka::syncBlockThreads(acc); + for (auto i = first + 32; i < size; i += blockDim) { + auto warpId = i / 32; + c[i] += ws[warpId - 1]; + } + alpaka::syncBlockThreads(acc); #else - for (uint32_t i = 1; i < size; ++i) - c[i] += c[i - 1]; + for (uint32_t i = 1; i < size; ++i) + c[i] += c[i - 1]; #endif - } + } - // limited to 1024*1024 elements.... - template - struct multiBlockPrefixScanFirstStep { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const blockIdx(alpaka::getIdx(acc)[0u]); - - auto& ws = alpaka::declareSharedVar(acc); - // first each block does a scan of size 1024; (better be enough blocks....) -#ifndef NDEBUG - uint32_t const gridDimension(alpaka::getWorkDiv(acc)[0u]); - ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= 1024); -#endif - int off = blockDimension * blockIdx * threadDimension; - if (size - off > 0) - blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * threadDimension), size - off), ws); + // limited to 1024*1024 elements.... + template + struct multiBlockPrefixScan { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t* pc) const { + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t threadDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t blockIdx(alpaka::getIdx(acc)[0u]); + const int32_t threadIdx(alpaka::getIdx(acc)[0u]); + + // first each block does a scan of size 1024; (better be enough blocks....) + int32_t const gridDim(alpaka::getWorkDiv(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(gridDim / threadDim <= 1024); + int off = blockDim * blockIdx * threadDim; + auto& ws = alpaka::declareSharedVar(acc); + if (size - off > 0) + blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDim * threadDim), size - off), ws); + + auto& isLastBlockDone = alpaka::declareSharedVar(acc); + if (0 == threadIdx) { + ::cms::alpakatools::threadfence(acc); + auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter + isLastBlockDone = (value == (gridDim - 1)); } - }; - - // limited to 1024*1024 elements.... 
- template - struct multiBlockPrefixScanSecondStep { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); - - uint32_t const threadIdx(alpaka::getIdx(acc)[0u]); - - auto* const psum(alpaka::getDynSharedMem(acc)); - - // first each block does a scan of size 1024; (better be enough blocks....) - ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= numBlocks); - for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { - int index = +threadIdx * threadDimension + elemId; - - if (index < numBlocks) { - int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1; - psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0) - ? co[lastElementOfPreviousBlockId] - : T(0); - } - } - alpaka::syncBlockThreads(acc); + alpaka::syncBlockThreads(acc); + + if (!isLastBlockDone) + return; + + ALPAKA_ASSERT_OFFLOAD(gridDim == *pc); + + auto& psum = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(static_cast(blockDim * threadDim) >= gridDim); - auto& ws = alpaka::declareSharedVar(acc); - blockPrefixScan(acc, psum, psum, numBlocks, ws); + for (int elemId = 0; elemId < static_cast(threadDim); ++elemId) { + int index = +threadIdx * threadDim + elemId; - for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { - int first = threadIdx * threadDimension + elemId; - for (int i = first + blockDimension * threadDimension; i < size; i += blockDimension * threadDimension) { - auto k = i / (blockDimension * threadDimension); - co[i] += psum[k]; - } + if (index < gridDim) { + int lastElementOfPreviousBlockId = index * blockDim * threadDim - 1; + psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0) + ? co[lastElementOfPreviousBlockId] + : T(0); } } - }; - - } // namespace alpakatools -} // namespace cms - -namespace alpaka { - namespace traits { - - //############################################################################# - //! The trait for getting the size of the block shared dynamic memory for a kernel. - template - struct BlockSharedMemDynSizeBytes, TAcc> { - //----------------------------------------------------------------------------- - //! \return The size of the shared memory allocated for a block. 
- template - ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( - cms::alpakatools::multiBlockPrefixScanSecondStep const& myKernel, - TVec const& blockThreadExtent, - TVec const& threadElemExtent, - T const* ci, - T* co, - int32_t size, - int32_t numBlocks) -> T { - alpaka::ignore_unused(myKernel); - alpaka::ignore_unused(blockThreadExtent); - alpaka::ignore_unused(threadElemExtent); - alpaka::ignore_unused(ci); - alpaka::ignore_unused(co); - alpaka::ignore_unused(size); - - return static_cast(numBlocks) * sizeof(T); + + alpaka::syncBlockThreads(acc); + blockPrefixScan(acc, psum, psum, gridDim, ws); + + for (int elemId = 0; elemId < static_cast(threadDim); ++elemId) { + int first = threadIdx * threadDim + elemId; + for (int i = first + blockDim * threadDim; i < size; i += blockDim * threadDim) { + auto k = i / (blockDim * threadDim); + co[i] += psum[k]; + } } - }; + } + }; - } // namespace traits -} // namespace alpaka +} // namespace cms::alpakatools #endif // HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h diff --git a/src/alpaka/AlpakaCore/radixSort.h b/src/alpaka/AlpakaCore/radixSort.h index f345b439f..77aa97d2b 100644 --- a/src/alpaka/AlpakaCore/radixSort.h +++ b/src/alpaka/AlpakaCore/radixSort.h @@ -4,182 +4,186 @@ #include #include +#include "AlpakaCore/alpakaConfig.h" #include "AlpakaCore/alpakaKernelCommon.h" -namespace cms { - namespace alpakatools { +namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {} - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - //move negative first... + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... - auto& firstNeg = alpaka::declareSharedVar(acc); - firstNeg = a[ind[0]] < 0 ? 0 : size; - alpaka::syncBlockThreads(acc); + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 
0 : size; + alpaka::syncBlockThreads(acc); - // find first negative - cms::alpakatools::for_each_element_in_block_strided(acc, size - 1, [&](uint32_t i) { - if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) - firstNeg = i + 1; - }); + // find first negative + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, size - 1, [&](uint32_t i) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + }); - alpaka::syncBlockThreads(acc); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided( - acc, size, firstNeg, [&](uint32_t i) { ind2[i - firstNeg] = ind[i]; }); - alpaka::syncBlockThreads(acc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, firstNeg, [&](uint32_t i) { ind2[i - firstNeg] = ind[i]; }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided( - acc, firstNeg, [&](uint32_t i) { ind2[i + size - firstNeg] = ind[i]; }); - alpaka::syncBlockThreads(acc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, firstNeg, [&](uint32_t i) { ind2[i + size - firstNeg] = ind[i]; }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); - } + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - //move negative first... + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + //move negative first... - auto& firstNeg = alpaka::declareSharedVar(acc); - firstNeg = a[ind[0]] < 0 ? 0 : size; - alpaka::syncBlockThreads(acc); + auto& firstNeg = alpaka::declareSharedVar(acc); + firstNeg = a[ind[0]] < 0 ? 
0 : size; + alpaka::syncBlockThreads(acc); - // find first negative - cms::alpakatools::for_each_element_in_block_strided(acc, size - 1, [&](uint32_t i) { - if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) - firstNeg = i + 1; - }); - alpaka::syncBlockThreads(acc); + // find first negative + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, size - 1, [&](uint32_t i) { + if ((a[ind[i]] ^ a[ind[i + 1]]) < 0) + firstNeg = i + 1; + }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided( - acc, size, firstNeg, [&](uint32_t i) { ind2[size - firstNeg - i - 1] = ind[i]; }); - alpaka::syncBlockThreads(acc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, firstNeg, [&](uint32_t i) { ind2[size - firstNeg - i - 1] = ind[i]; }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided( - acc, firstNeg, [&](uint32_t i) { ind2[i + size - firstNeg] = ind[i]; }); - alpaka::syncBlockThreads(acc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, firstNeg, [&](uint32_t i) { ind2[i + size - firstNeg] = ind[i]; }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); - } + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); + } - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortImpl( - const T_Acc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { - constexpr int d = 8, w = 8 * sizeof(T); - constexpr int sb = 1 << d; - constexpr int ps = int(sizeof(T)) - NS; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortImpl( + const T_Acc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) { + constexpr int d = 8, w = 8 * sizeof(T); + constexpr int sb = 1 << d; + constexpr int ps = int(sizeof(T)) - NS; - auto& c = alpaka::declareSharedVar(acc); - auto& ct = alpaka::declareSharedVar(acc); - auto& cu = alpaka::declareSharedVar(acc); + auto& c = alpaka::declareSharedVar(acc); + auto& ct = alpaka::declareSharedVar(acc); + auto& cu = alpaka::declareSharedVar(acc); - auto& ibs = alpaka::declareSharedVar(acc); - auto& p = alpaka::declareSharedVar(acc); + auto& ibs = alpaka::declareSharedVar(acc); + auto& p = alpaka::declareSharedVar(acc); - ALPAKA_ASSERT_OFFLOAD(size > 0); + ALPAKA_ASSERT_OFFLOAD(size > 0); - const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); - ALPAKA_ASSERT_OFFLOAD(blockDimension >= sb); + const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(blockDimension >= sb); - // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; + // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; - p = ps; + p = ps; - auto j = ind; - auto k = ind2; + auto j = ind; + auto k = ind2; - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { j[i] = i; }); - alpaka::syncBlockThreads(acc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, [&](uint32_t i) { j[i] = i; }); + alpaka::syncBlockThreads(acc); - while (alpaka::syncBlockThreadsPredicate(acc, (p < w / d))) { - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t i) { c[i] = 0; }); - alpaka::syncBlockThreads(acc); + while 
(alpaka::syncBlockThreadsPredicate(acc, (p < w / d))) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block( + acc, sb, [&](uint32_t i) { c[i] = 0; }); + alpaka::syncBlockThreads(acc); - // fill bins - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { - auto bin = (a[j[i]] >> d * p) & (sb - 1); - alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Blocks{}); - }); - alpaka::syncBlockThreads(acc); + // fill bins + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { + auto bin = (a[j[i]] >> d * p) & (sb - 1); + alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Blocks{}); + }); + alpaka::syncBlockThreads(acc); - // prefix scan "optimized"???... - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t i) { - auto x = c[i]; - auto laneId = i & 0x1f; + // prefix scan "optimized"???... + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, sb, [&](uint32_t i) { + auto x = c[i]; + auto laneId = i & 0x1f; - for (int offset = 1; offset < 32; offset <<= 1) { - auto y = __shfl_up_sync(0xffffffff, x, offset); - if (laneId >= (uint32_t)offset) - x += y; - } - ct[i] = x; - }); - alpaka::syncBlockThreads(acc); + for (int offset = 1; offset < 32; offset <<= 1) { + auto y = __shfl_up_sync(0xffffffff, x, offset); + if (laneId >= (uint32_t)offset) + x += y; + } + ct[i] = x; + }); + alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t i) { - auto ss = (i / 32) * 32 - 1; - c[i] = ct[i]; - for (int i = ss; i > 0; i -= 32) - c[i] += ct[i]; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, sb, [&](uint32_t i) { + auto ss = (i / 32) * 32 - 1; + c[i] = ct[i]; + for (int i = ss; i > 0; i -= 32) + c[i] += ct[i]; + }); - /* + /* //prefix scan for the nulls (for documentation) if (threadIdxLocal==0) for (int i = 1; i < sb; ++i) c[i] += c[i-1]; */ - // broadcast - ibs = size - 1; + // broadcast + ibs = size - 1; + alpaka::syncBlockThreads(acc); + + while (alpaka::syncBlockThreadsPredicate(acc, ibs > 0)) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, sb, [&](uint32_t i) { + cu[i] = -1; + ct[i] = -1; + }); alpaka::syncBlockThreads(acc); - while (alpaka::syncBlockThreadsPredicate(acc, ibs > 0)) { - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t i) { - cu[i] = -1; - ct[i] = -1; - }); - alpaka::syncBlockThreads(acc); - - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t idx) { - int i = ibs - idx; - int32_t bin = -1; - if (i >= 0) { - bin = (a[j[i]] >> d * p) & (sb - 1); - ct[idx] = bin; - alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Blocks{}); - } - }); - alpaka::syncBlockThreads(acc); - - cms::alpakatools::for_each_element_in_block(acc, sb, [&](uint32_t idx) { - int i = ibs - idx; - int32_t bin = (i >= 0 ? 
((a[j[i]] >> d * p) & (sb - 1)) : -1); - if (i >= 0 && i == cu[bin]) // ensure to keep them in order - for (int ii = idx; ii < sb; ++ii) - if (ct[ii] == bin) { - auto oi = ii - idx; - // assert(i>=oi);if(i>=oi) - k[--c[bin]] = j[i - oi]; - } - }); - alpaka::syncBlockThreads(acc); - - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if (threadIdxLocal == 0) - ibs -= sb; - alpaka::syncBlockThreads(acc); - } + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, sb, [&](uint32_t idx) { + int i = ibs - idx; + int32_t bin = -1; + if (i >= 0) { + bin = (a[j[i]] >> d * p) & (sb - 1); + ct[idx] = bin; + alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Blocks{}); + } + }); + alpaka::syncBlockThreads(acc); + + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, sb, [&](uint32_t idx) { + int i = ibs - idx; + int32_t bin = (i >= 0 ? ((a[j[i]] >> d * p) & (sb - 1)) : -1); + if (i >= 0 && i == cu[bin]) // ensure to keep them in order + for (int ii = idx; ii < sb; ++ii) + if (ct[ii] == bin) { + auto oi = ii - idx; + // assert(i>=oi);if(i>=oi) + k[--c[bin]] = j[i - oi]; + } + }); + alpaka::syncBlockThreads(acc); - /* + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) + ibs -= sb; + alpaka::syncBlockThreads(acc); + } + + /* // broadcast for the nulls (for documentation) if (threadIdxLocal==0) for (int i=size-first-1; i>=0; i--) { // =blockDim.x) { @@ -189,61 +193,62 @@ namespace cms { } */ - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_OFFLOAD(c[0] == 0); - - // swap (local, ok) - auto t = j; - j = k; - k = t; - - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if (threadIdxLocal == 0) - ++p; - alpaka::syncBlockThreads(acc); - } - - if ((w != 8) && (0 == (NS & 1))) - ALPAKA_ASSERT_OFFLOAD(j == ind); // w/d is even so ind is correct - - if (j != ind) // odd... - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); - alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(c[0] == 0); - // now move negative first... (if signed) - reorder(acc, a, ind, ind2, size); - } - - template ::value, T>::type* = nullptr> - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - radixSortImpl(acc, a, ind, ind2, size, dummyReorder); - } + // swap (local, ok) + auto t = j; + j = k; + k = t; - template ::value && std::is_signed::value, T>::type* = nullptr> - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - radixSortImpl(acc, a, ind, ind2, size, reorderSigned); - } - - template ::value, T>::type* = nullptr> - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( - const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { - using I = int; - radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (threadIdxLocal == 0) + ++p; + alpaka::syncBlockThreads(acc); } - /* Not needed + if ((w != 8) && (0 == (NS & 1))) + ALPAKA_ASSERT_OFFLOAD(j == ind); // w/d is even so ind is correct + + if (j != ind) // odd... + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, [&](uint32_t i) { ind[i] = ind2[i]; }); + + alpaka::syncBlockThreads(acc); + + // now move negative first... 
(if signed) + reorder(acc, a, ind, ind2, size); + } + + template ::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(acc, a, ind, ind2, size, dummyReorder); + } + + template ::value && std::is_signed::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + radixSortImpl(acc, a, ind, ind2, size, reorderSigned); + } + + template ::value, T>::type* = nullptr> + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort( + const T_Acc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) { + using I = int; + radixSortImpl(acc, (I const*)(a), ind, ind2, size, reorderFloat); + } + + /* Not needed template ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortMulti(T const* v, uint16_t* index, @@ -273,7 +278,6 @@ ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortMult } */ - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/src/alpaka/AlpakaCore/threadfence.h b/src/alpaka/AlpakaCore/threadfence.h index 294702866..feeb9c710 100644 --- a/src/alpaka/AlpakaCore/threadfence.h +++ b/src/alpaka/AlpakaCore/threadfence.h @@ -6,45 +6,43 @@ #include "AlpakaCore/alpakaCommon.h" #include "AlpakaCore/alpakaKernelCommon.h" -namespace cms { - namespace alpakatools { +namespace cms::alpakatools { - // device-wide memory fence - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(T_Acc const& acc) { - static_assert(std::is_same_v, - "cms::alpakatools::threadfence(acc) has not been implemented for this Accelerator type."); - } + // device-wide memory fence + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(T_Acc const& acc) { + static_assert(std::is_same_v, + "cms::alpakatools::threadfence(acc) has not been implemented for this Accelerator type."); + } #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - // device-wide memory fence - // CPU serial implementation: no fence needed - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(alpaka::AccCpuSerial const& acc) { - // serial implementation with a single thread, no fence needed - } + // device-wide memory fence + // CPU serial implementation: no fence needed + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(alpaka::AccCpuSerial const& acc) { + // serial implementation with a single thread, no fence needed + } #endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - // device-wide memory fence - // CPU parallel implementation using TBB tasks: std::atomic_thread_fence() - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(alpaka::AccCpuTbbBlocks const& acc) { - std::atomic_thread_fence(std::memory_order_acq_rel); - } + // device-wide memory fence + // CPU parallel implementation using TBB tasks: std::atomic_thread_fence() + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(alpaka::AccCpuTbbBlocks const& acc) { + std::atomic_thread_fence(std::memory_order_acq_rel); + } #endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - // device-wide memory fence - // GPU parallel implementation using CUDA: __threadfence() - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
threadfence(alpaka::AccGpuCudaRt const& acc) { - // device-only function - __threadfence(); - } + // device-wide memory fence + // GPU parallel implementation using CUDA: __threadfence() + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void threadfence(alpaka::AccGpuCudaRt const& acc) { + // device-only function + __threadfence(); + } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED - } // namespace alpakatools -} // namespace cms +} // namespace cms::alpakatools #endif // HeterogeneousCoreCUDAUtilities_threadfence_h diff --git a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h index 0729df6f5..5ae141f67 100644 --- a/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/BeamSpotAlpaka.h @@ -12,13 +12,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: BeamSpotAlpaka() = default; - BeamSpotAlpaka(BeamSpotPOD const* data, Queue& queue) : data_d{cms::alpakatools::allocDeviceBuf(1u)} { - auto data_h{cms::alpakatools::createHostView(data, 1u)}; + BeamSpotAlpaka(BeamSpotPOD const* data, Queue& queue) + : data_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)} { + auto data_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(data, 1u)}; alpaka::memcpy(queue, data_d, data_h, 1u); - alpaka::wait(queue); + // alpaka::wait(queue); } + //TODO ANTONIO + const BeamSpotPOD* data() const { return alpaka::getPtrNative(data_d); } private: diff --git a/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h b/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h index 880cb8b5f..c6e0f36f0 100644 --- a/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/PixelTrackAlpaka.h @@ -1,10 +1,9 @@ #ifndef CUDADataFormatsTrackTrackHeterogeneous_H #define CUDADataFormatsTrackTrackHeterogeneous_H -#include "AlpakaDataFormats/TrajectoryStateSoA.h" #include "AlpakaCore/HistoContainer.h" - #include "AlpakaCore/alpakaCommon.h" +#include "AlpakaDataFormats/TrajectoryStateSoA.h" namespace trackQuality { enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; @@ -17,7 +16,7 @@ class TrackSoAT { using Quality = trackQuality::Quality; using hindex_type = uint16_t; - using HitContainer = cms::alpakatools::OneToManyAssoc; + using HitContainer = ::cms::alpakatools::OneToManyAssoc; // Always check quality is at least loose! // CUDA does not support enums in __lgc ... 
diff --git a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h index 35af5833c..0b7261f27 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelClustersAlpaka.h @@ -9,10 +9,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: SiPixelClustersAlpaka() = default; explicit SiPixelClustersAlpaka(size_t maxClusters) - : moduleStart_d{cms::alpakatools::allocDeviceBuf(maxClusters + 1)}, - clusInModule_d{cms::alpakatools::allocDeviceBuf(maxClusters)}, - moduleId_d{cms::alpakatools::allocDeviceBuf(maxClusters)}, - clusModuleStart_d{cms::alpakatools::allocDeviceBuf(maxClusters + 1)} {} + : moduleStart_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters + 1)}, + clusInModule_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters)}, + moduleId_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters)}, + clusModuleStart_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxClusters + 1)} {} ~SiPixelClustersAlpaka() = default; SiPixelClustersAlpaka(const SiPixelClustersAlpaka &) = delete; @@ -41,7 +42,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class DeviceConstView { public: - // TO DO: removed __ldg, check impact on perf with src/cuda. + // TO DO: removed __ldg, check impact on perf ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t moduleStart(int i) const { return moduleStart_[i]; } ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t clusInModule(int i) const { return clusInModule_[i]; } ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t moduleId(int i) const { return moduleId_[i]; } diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h index a8098a8b4..f42b3eff2 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigiErrorsAlpaka.h @@ -11,20 +11,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigiErrorsAlpaka { public: SiPixelDigiErrorsAlpaka() = default; - explicit SiPixelDigiErrorsAlpaka(size_t maxFedWords, PixelFormatterErrors errors) - : data_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - error_d{cms::alpakatools::allocDeviceBuf>(1u)}, - error_h{cms::alpakatools::allocHostBuf>(1u)}, + explicit SiPixelDigiErrorsAlpaka(size_t maxFedWords, PixelFormatterErrors errors, Queue& queue) + : data_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + error_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf< + ::cms::alpakatools::SimpleVector>(1u)}, + error_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf< + ::cms::alpakatools::SimpleVector>(1u)}, formatterErrors_h{std::move(errors)} { auto perror_h = alpaka::getPtrNative(error_h); perror_h->construct(maxFedWords, alpaka::getPtrNative(data_d)); ALPAKA_ASSERT_OFFLOAD(perror_h->empty()); ALPAKA_ASSERT_OFFLOAD(perror_h->capacity() == static_cast(maxFedWords)); - // TO DO: nothing really async in here for now... Pass the queue in constructor argument instead, and don't wait anymore! 
- Queue queue(device); alpaka::memcpy(queue, error_d, error_h, 1u); - alpaka::wait(queue); } ~SiPixelDigiErrorsAlpaka() = default; @@ -35,13 +34,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - cms::alpakatools::SimpleVector* error() { return alpaka::getPtrNative(error_d); } - cms::alpakatools::SimpleVector const* error() const { return alpaka::getPtrNative(error_d); } - cms::alpakatools::SimpleVector const* c_error() const { return alpaka::getPtrNative(error_d); } + ::cms::alpakatools::SimpleVector* error() { return alpaka::getPtrNative(error_d); } + ::cms::alpakatools::SimpleVector const* error() const { return alpaka::getPtrNative(error_d); } + ::cms::alpakatools::SimpleVector const* c_error() const { return alpaka::getPtrNative(error_d); } #ifdef TODO using HostDataError = - std::pair, AlpakaHostBuf>; + std::pair<::cms::alpakatools::SimpleVector, AlpakaHostBuf>; HostDataError dataErrorToHostAsync(cudaStream_t stream) const; void copyErrorToHostAsync(cudaStream_t stream); @@ -49,8 +48,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { private: AlpakaDeviceBuf data_d; - AlpakaDeviceBuf> error_d; - AlpakaHostBuf> error_h; + AlpakaDeviceBuf<::cms::alpakatools::SimpleVector> error_d; + AlpakaHostBuf<::cms::alpakatools::SimpleVector> error_h; PixelFormatterErrors formatterErrors_h; }; diff --git a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h index 01fa3ab98..84c11de0d 100644 --- a/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/SiPixelDigisAlpaka.h @@ -9,13 +9,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: SiPixelDigisAlpaka() = default; explicit SiPixelDigisAlpaka(size_t maxFedWords) - : xx_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - yy_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - adc_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - moduleInd_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - clus_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - pdigi_d{cms::alpakatools::allocDeviceBuf(maxFedWords)}, - rawIdArr_d{cms::alpakatools::allocDeviceBuf(maxFedWords)} {} + : xx_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + yy_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + adc_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + moduleInd_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + clus_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + pdigi_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)}, + rawIdArr_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxFedWords)} {} ~SiPixelDigisAlpaka() = default; SiPixelDigisAlpaka(const SiPixelDigisAlpaka &) = delete; @@ -57,7 +57,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // TO DO: nothing async in here for now... Pass the queue as argument instead, and don't wait anymore! 
auto adcToHostAsync(Queue &queue) const { - auto ret = cms::alpakatools::allocHostBuf(nDigis()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nDigis()); alpaka::memcpy(queue, ret, adc_d, nDigis()); return ret; } diff --git a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h index db7a9fb8c..1b9de3447 100644 --- a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DAlpaka.h @@ -14,29 +14,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { explicit TrackingRecHit2DAlpaka(uint32_t nHits, const pixelCPEforGPU::ParamsOnGPU* cpeParams, - const uint32_t* hitsModuleStart) + const uint32_t* hitsModuleStart, + Queue& queue) : m_nHits(nHits), // NON-OWNING DEVICE POINTERS: m_hitsModuleStart(hitsModuleStart), // OWNING DEVICE POINTERS: - m_xl{cms::alpakatools::allocDeviceBuf(nHits)}, - m_yl{cms::alpakatools::allocDeviceBuf(nHits)}, - m_xerr{cms::alpakatools::allocDeviceBuf(nHits)}, - m_yerr{cms::alpakatools::allocDeviceBuf(nHits)}, - m_xg{cms::alpakatools::allocDeviceBuf(nHits)}, - m_yg{cms::alpakatools::allocDeviceBuf(nHits)}, - m_zg{cms::alpakatools::allocDeviceBuf(nHits)}, - m_rg{cms::alpakatools::allocDeviceBuf(nHits)}, - m_iphi{cms::alpakatools::allocDeviceBuf(nHits)}, - m_charge{cms::alpakatools::allocDeviceBuf(nHits)}, - m_xsize{cms::alpakatools::allocDeviceBuf(nHits)}, - m_ysize{cms::alpakatools::allocDeviceBuf(nHits)}, - m_detInd{cms::alpakatools::allocDeviceBuf(nHits)}, - m_averageGeometry{cms::alpakatools::allocDeviceBuf(1u)}, - m_hitsLayerStart{cms::alpakatools::allocDeviceBuf(nHits)}, - m_hist{cms::alpakatools::allocDeviceBuf(1u)}, + m_xl{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_yl{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_xerr{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_yerr{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_xg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_yg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_zg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_rg{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_iphi{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_charge{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_xsize{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_ysize{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_detInd{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_averageGeometry{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + 1u)}, + m_hitsLayerStart{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(nHits)}, + m_hist{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, // SOA view: - m_view{cms::alpakatools::allocDeviceBuf(1u)} { + m_view{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)} { // the hits are actually accessed in order only in building // if ordering is relevant they may have to be stored phi-ordered by layer or so // this will break 1to1 correspondence with cluster and module locality @@ -52,30 +55,31 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { view.m_cpeParams = cpeParams; // Raw pointers to data owned here in TrackingRecHit2DAlpaka 
object: -#define SET(name) view.name = alpaka::getPtrNative(name) - SET(m_xl); - SET(m_yl); - SET(m_xerr); - SET(m_yerr); - SET(m_xg); - SET(m_yg); - SET(m_zg); - SET(m_rg); - SET(m_iphi); - SET(m_charge); - SET(m_xsize); - SET(m_ysize); - SET(m_detInd); - SET(m_averageGeometry); - SET(m_hitsLayerStart); - SET(m_hist); -#undef SET + view.m_xl = alpaka::getPtrNative(m_xl); + view.m_yl = alpaka::getPtrNative(m_yl); + view.m_xerr = alpaka::getPtrNative(m_xerr); + view.m_yerr = alpaka::getPtrNative(m_yerr); + view.m_xg = alpaka::getPtrNative(m_xg); + view.m_yg = alpaka::getPtrNative(m_yg); + view.m_zg = alpaka::getPtrNative(m_zg); + view.m_rg = alpaka::getPtrNative(m_rg); + view.m_iphi = alpaka::getPtrNative(m_iphi); + view.m_charge = alpaka::getPtrNative(m_charge); + view.m_xsize = alpaka::getPtrNative(m_xsize); + view.m_ysize = alpaka::getPtrNative(m_ysize); + view.m_detInd = alpaka::getPtrNative(m_detInd); + view.m_averageGeometry = alpaka::getPtrNative(m_averageGeometry); + view.m_hitsLayerStart = alpaka::getPtrNative(m_hitsLayerStart); + view.m_hist = alpaka::getPtrNative(m_hist); // SoA view on device: - Queue queue(device); - auto view_h{cms::alpakatools::createHostView(&view, 1u)}; + auto view_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&view, 1u)}; alpaka::memcpy(queue, m_view, view_h, 1u); +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + // FIXME: this is necessary when using the TBB backend, otherwise the next "kernel" using these data will crash. + // We do not know why it is necessary, nor why it has to be here and not right after the call to this constructor. alpaka::wait(queue); +#endif } ~TrackingRecHit2DAlpaka() = default; @@ -98,57 +102,57 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const* c_iphi() const { return alpaka::getPtrNative(m_iphi); } auto xlToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xl, nHits()); return ret; } auto ylToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yl, nHits()); return ret; } auto xerrToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xerr, nHits()); return ret; } auto yerrToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yerr, nHits()); return ret; } auto xgToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xg, nHits()); return ret; } auto ygToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_yg, nHits()); return ret; } auto zgToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_zg, nHits()); return ret; } auto rgToHostAsync(Queue& queue) const { - auto ret = 
cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_rg, nHits()); return ret; } auto chargeToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_charge, nHits()); return ret; } auto xsizeToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_xsize, nHits()); return ret; } auto ysizeToHostAsync(Queue& queue) const { - auto ret = cms::alpakatools::allocHostBuf(nHits()); + auto ret = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(nHits()); alpaka::memcpy(queue, ret, m_ysize, nHits()); return ret; } diff --git a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DSOAView.h b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DSOAView.h index e270e4a1e..4cbaf2316 100644 --- a/src/alpaka/AlpakaDataFormats/TrackingRecHit2DSOAView.h +++ b/src/alpaka/AlpakaDataFormats/TrackingRecHit2DSOAView.h @@ -16,8 +16,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } using hindex_type = uint16_t; // if above is <=2^16 - using Hist = - cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools:: + HistoContainer; using AverageGeometry = phase1PixelTopology::AverageGeometry; @@ -25,10 +25,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t nHits() const { return m_nHits; } + // TO DO: removed __ldg, check impact on perf ALPAKA_FN_ACC ALPAKA_FN_INLINE float& xLocal(int i) { return m_xl[i]; } - ALPAKA_FN_ACC ALPAKA_FN_INLINE float xLocal(int i) const { - return m_xl[i]; - } // TO DO: removed __ldg from legacy, check impact on perf. + ALPAKA_FN_ACC ALPAKA_FN_INLINE float xLocal(int i) const { return m_xl[i]; } ALPAKA_FN_ACC ALPAKA_FN_INLINE float& yLocal(int i) { return m_yl[i]; } ALPAKA_FN_ACC ALPAKA_FN_INLINE float yLocal(int i) const { return m_yl[i]; } diff --git a/src/alpaka/CondFormats/PixelCPEFast.h b/src/alpaka/CondFormats/PixelCPEFast.h index cc386ba02..9cc2c0941 100644 --- a/src/alpaka/CondFormats/PixelCPEFast.h +++ b/src/alpaka/CondFormats/PixelCPEFast.h @@ -6,6 +6,8 @@ #include "CondFormats/pixelCPEforGPU.h" #include "AlpakaCore/alpakaCommon.h" +#include "AlpakaCore/ESProduct.h" +#include "AlpakaCore/alpakaMemoryHelper.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -24,7 +26,60 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ~PixelCPEFast() = default; - pixelCPEforGPU::ParamsOnGPU const* params() const { return alpaka::getPtrNative(m_params); } + pixelCPEforGPU::ParamsOnGPU const *params() const { return alpaka::getPtrNative(m_params); } + + template + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct getGPUData(T_Acc acc) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ESProduct gpuData_(acc); + return gpuData_; + } + + // The return value can only be used safely in kernels launched on + // the same cudaStream, or after cudaStreamSynchronize. + + template + const pixelCPEforGPU::ParamsOnGPU getGPUProductAsync(T_Acc acc, Queue queue) const { + auto gpuData_ = getGPUData(acc); + + const auto &data = gpuData_.dataForCurrentDeviceAsync(queue, [this](GPUData &data, Queue queue) { + // and now copy to device... 
+ auto cParams = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + data.h_paramsOnGPU.m_commonParams = alpaka::getPtrNative(cParams); + + uint32_t size_detParams = alpaka::extent::getExtentVec(this->m_detParams)[0u]; + auto detParams = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(size_detParams); + data.h_paramsOnGPU.m_detParams = alpaka::getPtrNative(detParams); + + auto avgGeom = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + data.h_paramsOnGPU.m_averageGeometry = alpaka::getPtrNative(avgGeom); + + auto layerGeom = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + data.h_paramsOnGPU.m_layerGeometry = alpaka::getPtrNative(layerGeom); + + auto parGPU = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u); + data.d_paramsOnGPU = alpaka::getPtrNative(parGPU); + + alpaka::prepareForAsyncCopy(cParams); + alpaka::prepareForAsyncCopy(detParams); + alpaka::prepareForAsyncCopy(avgGeom); + alpaka::prepareForAsyncCopy(layerGeom); + alpaka::prepareForAsyncCopy(parGPU); + + alpaka::memcpy(queue, data.d_paramsOnGPU, data.h_paramsOnGPU, 1u); + alpaka::memcpy(queue, data.h_paramsOnGPU.m_commonParams, this->m_commonParams, 1u); + alpaka::memcpy(queue, data.h_paramsOnGPU.m_averageGeometry, this->m_averageGeometry, 1u); + alpaka::memcpy(queue, data.h_paramsOnGPU.m_layerGeometry, this->m_layerGeometry, 1u); + alpaka::memcpy(queue, data.h_paramsOnGPU.m_detParams, alpaka::getPtrNative(this->m_detParams), size_detParams); + }); +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + return *data.d_paramsOnGPU; +#endif + return data.h_paramsOnGPU; + } private: AlpakaDeviceBuf m_commonParams; @@ -32,6 +87,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { AlpakaDeviceBuf m_layerGeometry; AlpakaDeviceBuf m_averageGeometry; AlpakaDeviceBuf m_params; + + struct GPUData { + // not needed if not used on CPU... 
+ pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; + pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device + ~GPUData() { + if (d_paramsOnGPU != nullptr) { + //cudafree + } + } + }; + //cms::alpakatools::ESProduct gpuData_; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/Framework/ReusableObjectHolder.h b/src/alpaka/Framework/ReusableObjectHolder.h index 8f711306c..a3918017e 100644 --- a/src/alpaka/Framework/ReusableObjectHolder.h +++ b/src/alpaka/Framework/ReusableObjectHolder.h @@ -74,6 +74,7 @@ #include #include "tbb/task.h" #include "tbb/concurrent_queue.h" +#include namespace edm { template > diff --git a/src/alpaka/Makefile.deps b/src/alpaka/Makefile.deps index e9f6b10c4..c89dd8632 100644 --- a/src/alpaka/Makefile.deps +++ b/src/alpaka/Makefile.deps @@ -1,7 +1,8 @@ -alpaka_EXTERNAL_DEPENDS := TBB EIGEN ALPAKA BOOST +alpaka_EXTERNAL_DEPENDS := TBB EIGEN ALPAKA BOOST BACKTRACE ifdef CUDA_BASE alpaka_EXTERNAL_DEPENDS += CUDA endif +AlpakaCore_DEPENDS := Framework BeamSpotProducer_DEPENDS := Framework AlpakaCore AlpakaDataFormats DataFormats PixelTriplets_DEPENDS := Framework AlpakaCore AlpakaDataFormats PixelVertexFinding_DEPENDS := Framework AlpakaCore AlpakaDataFormats DataFormats CondFormats diff --git a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc index 9dc0b9b72..6b5c8268e 100644 --- a/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc +++ b/src/alpaka/plugin-BeamSpotProducer/alpaka/BeamSpotToAlpaka.cc @@ -4,6 +4,7 @@ #include "Framework/Event.h" #include "Framework/EventSetup.h" #include "Framework/PluginFactory.h" +#include "AlpakaCore/ScopedContext.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -17,7 +18,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { private: edm::EDPutTokenT bsPutToken_; // TO DO: Add implementation of cms::alpaka::Product? - // const edm::EDPutTokenT> bsPutToken_; + // const edm::EDPutTokenT<::cms::alpaka::Product> bsPutToken_; }; BeamSpotToAlpaka::BeamSpotToAlpaka(edm::ProductRegistry& reg) : bsPutToken_{reg.produces()} {} @@ -26,13 +27,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& bsRaw = iSetup.get(); // TO DO: Add inter-event parallelization. cms::alpaka::ScopedContextProduce? - // cms::alpaka::ScopedContextProduce ctx{iEvent.streamID()}; - Queue queue(device); - BeamSpotAlpaka bs{&bsRaw, queue}; - - iEvent.emplace(bsPutToken_, std::move(bs)); - - alpaka::wait(queue); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + BeamSpotAlpaka bsDevice(&bsRaw, ctx.stream()); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, bsPutToken_, std::move(bsDevice)); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLine.h b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLine.h index 082eddad8..2006b1c95 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLine.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLine.h @@ -14,7 +14,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { |cov(phi, d )|cov( d , d )|cov( k , d )| \n |cov(phi, k )|cov( d , k )|cov( k , k )| */ - using karimaki_circle_fit = ALPAKA_ACCELERATOR_NAMESPACE::Rfit::circle_fit; + using karimaki_circle_fit = ::ALPAKA_ACCELERATOR_NAMESPACE::Rfit::circle_fit; /*! \brief data needed for the Broken Line fit procedure. 
diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc index 91269e6d8..32d9d2629 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.cc @@ -12,19 +12,20 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const auto blockSize = 64; const auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - const WorkDiv1D workDivTriplets = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - const WorkDiv1D workDivQuadsPenta = - cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); + const WorkDiv1D workDivTriplets = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + const WorkDiv1D workDivQuadsPenta = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / - sizeof(double)); + auto hitsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.h b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.h index ad1289432..66af94782 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/BrokenLineFitOnGPU.h @@ -53,69 +53,70 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif const auto nt = Rfit::maxNumberOfConcurrentFits(); - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t local_idx) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) - return; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, nt, [&](uint32_t local_idx) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + return; - // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - ALPAKA_ASSERT_OFFLOAD(tkid < foundNtuplets->nbins()); + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + ALPAKA_ASSERT_OFFLOAD(tkid < foundNtuplets->nbins()); - ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits); + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d 
fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); #ifdef BL_DUMP_HITS - auto &&done = alpaka::declareSharedVar(acc); - done = 0; - alpaka::syncBlockThreads(acc); - bool dump = - (foundNtuplets->size(tkid) == 5 && 0 == alpaka::atomicAdd(acc, &done, 1, alpaka::hierarchy::Blocks{})); + auto &&done = alpaka::declareSharedVar(acc); + done = 0; + alpaka::syncBlockThreads(acc); + bool dump = + (foundNtuplets->size(tkid) == 5 && 0 == alpaka::atomicAdd(acc, &done, 1, alpaka::hierarchy::Blocks{})); #endif - // Prepare data structure - auto const *hitId = foundNtuplets->begin(tkid); - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; - float ge[6]; - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); #ifdef BL_DUMP_HITS - if (dump) { - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", - tkid, - hhp->detectorIndex(hit), - i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - tkid, - hhp->detetectorIndex(hit), - i, - ge[0], - ge[1], - ge[2], - ge[3], - ge[4], - ge[5]); - } + if (dump) { + printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + tkid, + hhp->detectorIndex(hit), + i, + hhp->xGlobal(hit), + hhp->yGlobal(hit), + hhp->zGlobal(hit)); + printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", + tkid, + hhp->detetectorIndex(hit), + i, + ge[0], + ge[1], + ge[2], + ge[3], + ge[4], + ge[5]); + } #endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; - } - BrokenLine::BL_Fast_fit(hits, fast_fit); - - // no NaN here.... - ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); - }); + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + BrokenLine::BL_Fast_fit(hits, fast_fit); + + // no NaN here.... 
+ ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); + }); } // kernel operator() }; // struct @@ -144,54 +145,55 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { //for (int local_idx = local_start, nt = Rfit::maxNumberOfConcurrentFits(); local_idx < nt; //local_idx += gridDim.x * blockDim.x) { const auto nt = Rfit::maxNumberOfConcurrentFits(); - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t local_idx) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) - return; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, nt, [&](uint32_t local_idx) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + return; - // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); - BrokenLine::PreparedBrokenLineData data; - Rfit::Matrix3d Jacob; + BrokenLine::PreparedBrokenLineData data; + Rfit::Matrix3d Jacob; - BrokenLine::karimaki_circle_fit circle; - Rfit::line_fit line; + BrokenLine::karimaki_circle_fit circle; + Rfit::line_fit line; - BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); - BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); - BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); + BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); + BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); + BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); - results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(B), tkid); + results->pt(tkid) = float(B) / float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG - if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) - printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); - printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle.par(0), - circle.par(1), - circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); - printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle.chi2, - line.chi2, - circle.cov(0, 0), - circle.cov(1, 1), - circle.cov(2, 2), - line.cov(0, 0), - line.cov(1, 1)); + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! 
%f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); #endif - }); + }); } // kernel operator() }; // struct diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAConstants.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAConstants.h index af639cc86..5cace84b5 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAConstants.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAConstants.h @@ -46,22 +46,22 @@ namespace CAConstants { using tindex_type = uint16_t; // for tuples #ifndef ONLY_PHICUT - using CellNeighbors = cms::alpakatools::VecArray; - using CellTracks = cms::alpakatools::VecArray; + using CellNeighbors = ::cms::alpakatools::VecArray; + using CellTracks = ::cms::alpakatools::VecArray; #else - using CellNeighbors = cms::alpakatools::VecArray; - using CellTracks = cms::alpakatools::VecArray; + using CellNeighbors = ::cms::alpakatools::VecArray; + using CellTracks = ::cms::alpakatools::VecArray; #endif - using CellNeighborsVector = cms::alpakatools::SimpleVector; - using CellTracksVector = cms::alpakatools::SimpleVector; + using CellNeighborsVector = ::cms::alpakatools::SimpleVector; + using CellTracksVector = ::cms::alpakatools::SimpleVector; - using OuterHitOfCell = cms::alpakatools::VecArray; + using OuterHitOfCell = ::cms::alpakatools::VecArray; - using TuplesContainer = cms::alpakatools::OneToManyAssoc; - using HitToTuple = cms::alpakatools:: + using TuplesContainer = ::cms::alpakatools::OneToManyAssoc; + using HitToTuple = ::cms::alpakatools:: OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::alpakatools::OneToManyAssoc; + using TupleMultiplicity = ::cms::alpakatools::OneToManyAssoc; } // namespace CAConstants diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc index ca5008eac..b02415308 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletAlpaka.cc @@ -6,6 +6,7 @@ #include "CAHitNtupletGeneratorOnGPU.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" #include "AlpakaDataFormats/TrackingRecHit2DAlpaka.h" +#include "AlpakaCore/ScopedContext.h" #include "AlpakaCore/alpakaCommon.h" @@ -35,8 +36,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& hits = iEvent.get(tokenHitGPU_); - Queue queue(device); - iEvent.emplace(tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, queue)); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent, + tokenTrackGPU_, + gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc index 54b6a0121..f542cf940 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.cc @@ -9,8 +9,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // NB: MPORTANT: This 
could be tuned to benefit from innermost loop. const auto blockSize = 128; const auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; - const WorkDiv1D fillHitDetWorkDiv = - cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + const WorkDiv1D fillHitDetWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -47,7 +47,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(blockSize > 0 && 0 == blockSize % 16); const Vec2D blks(numberOfBlocks, 1u); const Vec2D thrs(blockSize, stride); - const WorkDiv2D kernelConnectWorkDiv = cms::alpakatools::make_workdiv(blks, thrs); + const WorkDiv2D kernelConnectWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blks, thrs); alpaka::enqueue(queue, alpaka::createTaskKernel( kernelConnectWorkDiv, @@ -73,7 +73,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const uint32_t numberOfBlocks = (nhits + blockSize - 1) / blockSize; const Vec2D blks(numberOfBlocks, 1u); const Vec2D thrs(blockSize, stride); - const WorkDiv2D fishboneWorkDiv = cms::alpakatools::make_workdiv(blks, thrs); + const WorkDiv2D fishboneWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blks, thrs); alpaka::enqueue(queue, alpaka::createTaskKernel(fishboneWorkDiv, @@ -89,7 +89,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { blockSize = 64; numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - WorkDiv1D workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + WorkDiv1D workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_find_ntuplets(), @@ -117,15 +118,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { blockSize = 128; numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue( queue, alpaka::createTaskKernel( - workDiv1D, cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(device_hitTuple_apc_), tuples_d)); + workDiv1D, ::cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(device_hitTuple_apc_), tuples_d)); // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_earlyDuplicateRemover(), @@ -136,7 +139,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { blockSize = 128; numberOfBlocks = (3 * CAConstants::maxTuples() / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_countMultiplicity(), @@ -144,9 +148,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { quality_d, 
alpaka::getPtrNative(device_tupleMultiplicity_))); - cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_tupleMultiplicity_), queue); + ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_tupleMultiplicity_), queue); - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -160,7 +165,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const Vec2D blks(numberOfBlocks, 1u); const Vec2D thrs(blockSize, stride); - const WorkDiv2D workDiv2D = cms::alpakatools::make_workdiv(blks, thrs); + const WorkDiv2D workDiv2D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blks, thrs); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv2D, gpuPixelDoublets::fishbone(), @@ -175,7 +180,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (m_params.doStats_) { numberOfBlocks = (std::max(nhits, m_params.maxNumberOfDoublets_) + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_checkOverflows(), @@ -217,7 +223,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { int threadsPerBlock = 128; // at least one block! int blocks = (std::max(1U, nhits) + threadsPerBlock - 1) / threadsPerBlock; - const WorkDiv1D workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(blocks), Vec1D::all(threadsPerBlock)); + const WorkDiv1D workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(blocks), Vec1D::all(threadsPerBlock)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, gpuPixelDoublets::initDoublets(), @@ -251,7 +258,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const uint32_t blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; const Vec2D blks(blocks, 1u); const Vec2D thrs(threadsPerBlock, stride); - const WorkDiv2D workDiv2D = cms::alpakatools::make_workdiv(blks, thrs); + const WorkDiv2D workDiv2D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blks, thrs); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv2D, gpuPixelDoublets::getDoubletsFromHisto(), @@ -283,7 +290,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // classify tracks based on kinematics auto numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - WorkDiv1D workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + WorkDiv1D workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel( workDiv1D, kernel_classifyTracks(), tuples_d, tracks_d, m_params.cuts_, quality_d)); @@ -291,7 +299,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (m_params.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_fishboneCleaner(), 
@@ -303,7 +312,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // remove duplicates (tracks that share a doublet) numberOfBlocks = (3 * m_params.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_fastDuplicateRemover(), @@ -315,15 +325,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (m_params.minHitsPerNtuplet_ < 4 || m_params.doStats_) { // fill hit->track "map" numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue( queue, alpaka::createTaskKernel( workDiv1D, kernel_countHitInTracks(), tuples_d, quality_d, alpaka::getPtrNative(device_hitToTuple_))); - cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_hitToTuple_), queue); + ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(device_hitToTuple_), queue); - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -333,7 +345,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (m_params.minHitsPerNtuplet_ < 4) { // remove duplicates (tracks that share a hit) numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_tripletCleaner(), @@ -348,7 +361,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (m_params.doStats_) { // counters (add flag???) 
numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_doStatsForHitInTracks(), @@ -356,7 +370,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::getPtrNative(counters_))); numberOfBlocks = (3 * CAConstants::maxNumberOfQuadruplets() / 4 + blockSize - 1) / blockSize; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(numberOfBlocks), + Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel( workDiv1D, kernel_doStatsForTracks(), tuples_d, quality_d, alpaka::getPtrNative(counters_))); @@ -369,7 +384,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(1u), Vec1D::all(32u)); + workDiv1D = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(32u)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv1D, kernel_print_found_ntuplets(), @@ -384,7 +399,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } void CAHitNtupletGeneratorKernels::printCounters(Queue &queue) { - const WorkDiv1D workDiv1D = cms::alpakatools::make_workdiv(Vec1D::all(1u), Vec1D::all(1u)); + const WorkDiv1D workDiv1D = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(1u)); alpaka::enqueue( queue, alpaka::createTaskKernel(workDiv1D, kernel_printCounters(), alpaka::getPtrNative(counters_))); alpaka::wait(queue); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h index d73b12c40..2d12a32eb 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernels.h @@ -23,8 +23,8 @@ namespace cAHitNtupletGenerator { unsigned long long nZeroTrackCells; }; - using HitsView = ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; - using HitsOnGPU = ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; + using HitsView = ::ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; + using HitsOnGPU = ::ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; using HitToTuple = CAConstants::HitToTuple; using TupleMultiplicity = CAConstants::TupleMultiplicity; @@ -155,38 +155,48 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { using TkSoA = pixelTrack::TrackSoA; using HitContainer = pixelTrack::HitContainer; - CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits) + CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits, Queue& queue) : m_params(params), ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - counters_{cms::alpakatools::allocDeviceBuf(1u)}, + counters_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, - device_hitToTuple_{cms::alpakatools::allocDeviceBuf(1u)}, - device_tupleMultiplicity_{cms::alpakatools::allocDeviceBuf(1u)}, + device_hitToTuple_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + 
device_tupleMultiplicity_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, - device_theCells_{cms::alpakatools::allocDeviceBuf(params.maxNumberOfDoublets_)}, + device_theCells_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(params.maxNumberOfDoublets_)}, // in principle we can use "nhits" to heuristically dimension the workspace... - device_isOuterHitOfCell_{cms::alpakatools::allocDeviceBuf(std::max(1U, nhits))}, + device_isOuterHitOfCell_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + std::max(1U, nhits))}, - device_theCellNeighbors_{cms::alpakatools::allocDeviceBuf(1u)}, - device_theCellTracks_{cms::alpakatools::allocDeviceBuf(1u)}, + device_theCellNeighbors_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, + device_theCellTracks_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}, - //cellStorage_{cms::alpakatools::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, + //cellStorage_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellNeighbors) + CAConstants::maxNumOfActiveDoublets() * sizeof(GPUCACell::CellTracks))}, device_theCellNeighborsContainer_{ - cms::alpakatools::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets())}, + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + CAConstants::maxNumOfActiveDoublets())}, device_theCellTracksContainer_{ - cms::alpakatools::allocDeviceBuf(CAConstants::maxNumOfActiveDoublets())}, + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + CAConstants::maxNumOfActiveDoublets())}, - //device_storage_{cms::alpakatools::allocDeviceBuf(3u)}, - //device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get()}, - //device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; + //device_storage_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::ALPAKA_ACCELERATOR_NAMESPACE::cmscuda::AtomicPairCounter::c_type>(3u)}, + //device_hitTuple_apc_ = (::cms::alpakatools::AtomicPairCounter*)device_storage_.get()}, + //device_hitToTuple_apc_ = (::cms::alpakatools::AtomicPairCounter*)device_storage_.get() + 1; //device_nCells_ = (uint32_t*)(device_storage_.get() + 2)}, - device_hitTuple_apc_{cms::alpakatools::allocDeviceBuf(1u)}, - device_hitToTuple_apc_{cms::alpakatools::allocDeviceBuf(1u)}, - device_nCells_{cms::alpakatools::allocDeviceBuf(1u)} { - Queue queue(device); - + device_hitTuple_apc_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>( + 1u)}, + device_hitToTuple_apc_{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<::cms::alpakatools::AtomicPairCounter>( + 1u)}, + device_nCells_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)} { alpaka::memset(queue, counters_, 0, 1u); alpaka::memset(queue, device_nCells_, 0, 1u); @@ -240,11 +250,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { AlpakaDeviceBuf device_theCellNeighborsContainer_; // Was non-owning in legacy! AlpakaDeviceBuf device_theCellTracksContainer_; // Was non-owning in legacy! 
- // AlpakaDeviceBuf device_storage_; // NB: In legacy + // AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter::c_type> device_storage_; // NB: In legacy // NB: Here, data from device_storage_ (legacy) directly owned by the following: - AlpakaDeviceBuf device_hitTuple_apc_; // Was non-owning in legacy! - AlpakaDeviceBuf device_hitToTuple_apc_; // Was non-owning in legacy! - AlpakaDeviceBuf device_nCells_; // Was non-owning in legacy! + AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter> device_hitTuple_apc_; // Was non-owning in legacy! + AlpakaDeviceBuf<::cms::alpakatools::AtomicPairCounter> device_hitToTuple_apc_; // Was non-owning in legacy! + AlpakaDeviceBuf device_nCells_; // Was non-owning in legacy! }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h index fbf175836..2134ca693 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -34,7 +34,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_FN_ACC void operator()(const T_Acc &acc, HitContainer const *foundNtuplets, CAConstants::TupleMultiplicity *tupleMultiplicity, - cms::alpakatools::AtomicPairCounter *apc, + ::cms::alpakatools::AtomicPairCounter *apc, GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, gpuPixelDoublets::CellNeighborsVector const *cellNeighbors, @@ -72,13 +72,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNbins = foundNtuplets->nbins(); - cms::alpakatools::for_each_element_in_grid_strided(acc, ntNbins, [&](uint32_t idx) { - if (foundNtuplets->size(idx) > 5) - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) - ALPAKA_ASSERT_OFFLOAD(*ih < nHits); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, ntNbins, [&](uint32_t idx) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + ALPAKA_ASSERT_OFFLOAD(*ih < nHits); + }); #endif if (0 == threadIdx) { @@ -93,21 +94,22 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } const auto ntNCells = (*nCells); - cms::alpakatools::for_each_element_in_grid_strided(acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; - printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; - printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); - if (thisCell.theDoubletId < 0) - alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); - if (0 == thisCell.theUsed) - alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); - if (thisCell.tracks().empty()) - alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); - }); - - cms::alpakatools::for_each_element_in_grid_strided(acc, nHits, [&](uint32_t idx) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, ntNCells, [&](uint32_t idx) { + auto const &thisCell = cells[idx]; 
+ if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); + if (0 == thisCell.theUsed) + alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); + if (thisCell.tracks().empty()) + alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); + }); + + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, nHits, [&](uint32_t idx) { if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; printf("OuterHitOfCell overflow %d\n", idx); }); @@ -123,14 +125,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { constexpr auto bad = trackQuality::bad; const auto ntNCells = (*nCells); - cms::alpakatools::for_each_element_in_grid_strided(acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, ntNCells, [&](uint32_t idx) { + auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId < 0) { - for (auto it : thisCell.tracks()) - quality[it] = bad; - } - }); + if (thisCell.theDoubletId < 0) { + for (auto it : thisCell.tracks()) + quality[it] = bad; + } + }); } }; @@ -147,27 +150,28 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); const auto ntNCells = (*nCells); - cms::alpakatools::for_each_element_in_grid_strided(acc, ntNCells, [&](uint32_t idx) { - auto const &thisCell = cells[idx]; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, ntNCells, [&](uint32_t idx) { + auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - //if (0==thisCell.theUsed) continue; - // if (thisCell.theDoubletId < 0) continue; + if (thisCell.tracks().size() >= 2) { + //if (0==thisCell.theUsed) continue; + // if (thisCell.theDoubletId < 0) continue; - uint32_t maxNh = 0; + uint32_t maxNh = 0; - // find maxNh - for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); - } + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } - for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) != maxNh) - quality[it] = dup; //no race: simple assignment of the same constant - } - } - }); + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + } + }); } }; @@ -184,41 +188,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nCells); - cms::alpakatools::for_each_element_in_grid_strided(acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.tracks().size() >= 2) { - // if (thisCell.theDoubletId < 0) continue; - - float mc = 10000.f; - uint16_t im = 60000; - - auto score = [&](auto it) { - return std::abs(tracks->tip(it)); // tip - // return tracks->chi2(it); //chi2 - }; - - // find min socre - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == loose && score(it) < mc) { - mc = score(it); - im = it; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, (*nCells), [&](uint32_t idx) { + auto const 
&thisCell = cells[idx]; + if (thisCell.tracks().size() >= 2) { + // if (thisCell.theDoubletId < 0) continue; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } } - } - // mark all other duplicates - for (auto it : thisCell.tracks()) { - if (tracks->quality(it) != bad && it != im) - tracks->quality(it) = dup; //no race: simple assignment of the same constant - } - } - }); + }); } }; struct kernel_connect { template ALPAKA_FN_ACC void operator()(const T_Acc &acc, - cms::alpakatools::AtomicPairCounter *apc1, - cms::alpakatools::AtomicPairCounter *apc2, // just to zero them, + ::cms::alpakatools::AtomicPairCounter *apc1, + ::cms::alpakatools::AtomicPairCounter *apc2, // just to zero them, GPUCACell::Hits const *__restrict__ hhp, GPUCACell *cells, uint32_t const *__restrict__ nCells, @@ -242,7 +247,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { (*apc2) = 0; } // ready for next kernel - cms::alpakatools::for_each_element_in_grid_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( acc, (*nCells), 0u, @@ -264,7 +269,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto zo = thisCell.get_outer_z(hh); auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, numberOfPossibleNeighbors, 0u, @@ -310,7 +315,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t const *nCells, gpuPixelDoublets::CellTracksVector *cellTracks, HitContainer *foundNtuplets, - cms::alpakatools::AtomicPairCounter *apc, + ::cms::alpakatools::AtomicPairCounter *apc, Quality *__restrict__ quality, unsigned int minHitsPerNtuplet) const { // recursive: not obvious to widen @@ -318,22 +323,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { //auto first = threadIdx.x + blockIdx.x * blockDim.x; //for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { - cms::alpakatools::for_each_element_in_grid_strided(acc, (*nCells), [&](uint32_t idx) { - auto const &thisCell = cells[idx]; - if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone - - auto pid = thisCell.theLayerPairId; - auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12; - if (doit) { - GPUCACell::TmpTuple stack; - stack.reset(); - thisCell.find_ntuplets( - acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); - ALPAKA_ASSERT_OFFLOAD(stack.empty()); - // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); - } - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, (*nCells), [&](uint32_t idx) { + auto const &thisCell = cells[idx]; + if (thisCell.theDoubletId >= 0) { // cut by earlyFishbone + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet > 3 ? 
pid < 3 : pid < 8 || pid > 12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets( + acc, hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3); + ALPAKA_ASSERT_OFFLOAD(stack.empty()); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + } + }); } }; @@ -344,11 +350,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { GPUCACell *__restrict__ cells, uint32_t const *nCells) const { // auto const &hh = *hhp; - cms::alpakatools::for_each_element_in_grid_strided(acc, (*nCells), [&](uint32_t idx) { - auto &thisCell = cells[idx]; - if (!thisCell.tracks().empty()) - thisCell.theUsed |= 2; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, (*nCells), [&](uint32_t idx) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + }); } }; @@ -358,16 +365,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->countDirect(acc, nhits); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, foundNtuplets->nbins(), [&](uint32_t it) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->countDirect(acc, nhits); + } + }); } }; @@ -377,16 +385,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ foundNtuplets, Quality const *__restrict__ quality, CAConstants::TupleMultiplicity *tupleMultiplicity) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, foundNtuplets->nbins(), [&](uint32_t it) { - auto nhits = foundNtuplets->size(it); - if (nhits >= 3 && quality[it] != trackQuality::dup) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); - if (nhits > 5) - printf("wrong mult %d %d\n", it, nhits); - ALPAKA_ASSERT_OFFLOAD(nhits < 8); - tupleMultiplicity->fillDirect(acc, nhits, it); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, foundNtuplets->nbins(), [&](uint32_t it) { + auto nhits = foundNtuplets->size(it); + if (nhits >= 3 && quality[it] != trackQuality::dup) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + if (nhits > 5) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits < 8); + tupleMultiplicity->fillDirect(acc, nhits, it); + } + }); } }; @@ -397,63 +406,64 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TkSoA const *__restrict__ tracks, CAHitNtupletGeneratorKernels::QualityCuts cuts, Quality *__restrict__ quality) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, tuples->nbins(), [&](uint32_t it) { - auto nhits = tuples->size(it); - if (nhits == 0) - return; // guard - - // if duplicate: not even fit - // mark doublets as bad - if (quality[it] != trackQuality::dup && nhits >= 3) { - ALPAKA_ASSERT_OFFLOAD(quality[it] == 
trackQuality::bad); - - // if the fit has any invalid parameters, mark it as bad - bool isNaN = false; - for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); - } - if (!isNaN) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, tuples->nbins(), [&](uint32_t it) { + auto nhits = tuples->size(it); + if (nhits == 0) + return; // guard + + // if duplicate: not even fit + // mark doublets as bad + if (quality[it] != trackQuality::dup && nhits >= 3) { + ALPAKA_ASSERT_OFFLOAD(quality[it] == trackQuality::bad); + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + } + if (!isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); #endif - // compute a pT-dependent chi2 cut - // default parameters: - // - chi2MaxPt = 10 GeV - // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } - // - chi2Scale = 30 for broken line fit, 45 for Riemann fit - // (see CAHitNtupletGeneratorGPU.cc) - float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); - float chi2Cut = - cuts.chi2Scale * - (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); - // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) - if (3.f * tracks->chi2(it) < chi2Cut) { + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(it), cuts.chi2MaxPt); + float chi2Cut = + cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f * tracks->chi2(it) < chi2Cut) { #ifdef NTUPLE_DEBUG - printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", - it, - tuples->size(it), - tracks->pt(it), - tracks->eta(it), - 3.f * tracks->chi2(it)); + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + it, + tuples->size(it), + tracks->pt(it), + tracks->eta(it), + 3.f * tracks->chi2(it)); #endif - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nhits > 3) ? cuts.quadruplet : cuts.triplet; - bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - - if (isOk) - quality[it] = trackQuality::loose; - - } // chi2Cut - } // !isNaN - } // trackQuality and nhits - }); + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nhits > 3) ? 
cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and + (std::abs(tracks->zip(it)) < region.maxZip); + + if (isOk) + quality[it] = trackQuality::loose; + + } // chi2Cut + } // !isNaN + } // trackQuality and nhits + }); } }; @@ -463,13 +473,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::Counters *counters) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; //guard - if (quality[idx] == trackQuality::loose) { - alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, tuples->nbins(), [&](uint32_t idx) { + if (tuples->size(idx) == 0) + return; //guard + if (quality[idx] == trackQuality::loose) { + alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); + } + }); } }; @@ -479,14 +490,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->countDirect(acc, *h); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, tuples->nbins(), [&](uint32_t idx) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(acc, *h); + } + }); } }; @@ -496,14 +508,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { HitContainer const *__restrict__ tuples, Quality const *__restrict__ quality, CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, tuples->nbins(), [&](uint32_t idx) { - if (tuples->size(idx) == 0) - return; // guard - if (quality[idx] == trackQuality::loose) { - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) - hitToTuple->fillDirect(acc, *h, idx); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, tuples->nbins(), [&](uint32_t idx) { + if (tuples->size(idx) == 0) + return; // guard + if (quality[idx] == trackQuality::loose) { + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(acc, *h, idx); + } + }); } }; @@ -514,17 +527,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHit2DSOAView const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) const { // copy offsets - cms::alpakatools::for_each_element_in_grid_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( acc, tuples->totbins(), [&](uint32_t idx) { hitDetIndices->off[idx] = tuples->off[idx]; }); // fill hit indices auto const &hh = *hhp; #ifndef NDEBUG auto nhits = hh.nHits(); #endif - cms::alpakatools::for_each_element_in_grid_strided(acc, tuples->size(), [&](uint32_t idx) { - ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); - hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); - }); + 
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, tuples->size(), [&](uint32_t idx) { + ALPAKA_ASSERT_OFFLOAD(tuples->bins[idx] < nhits); + hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]); + }); } }; @@ -534,13 +548,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernels::Counters *counters) const { auto &c = *counters; - cms::alpakatools::for_each_element_in_grid_strided(acc, hitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break - alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); - if (hitToTuple->size(idx) > 1) - alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, hitToTuple->nbins(), [&](uint32_t idx) { + if (hitToTuple->size(idx) != 0) { // SHALL NOT BE break + alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); + if (hitToTuple->size(idx) > 1) + alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); + } + }); } }; @@ -563,44 +578,45 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // auto const & hh = *hhp; // auto l1end = hh.hitsLayerStart_d[1]; - cms::alpakatools::for_each_element_in_grid_strided(acc, phitToTuple->nbins(), [&](uint32_t idx) { - if (hitToTuple.size(idx) >= 2) { - float mc = 10000.f; - uint16_t im = 60000; - uint32_t maxNh = 0; - - // find maxNh - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); - } - // kill all tracks shorter than maxHn (only triplets???) - for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); - if (maxNh != nh) - quality[*it] = dup; - } - - if (maxNh <= 3) { - // if (idx>=l1end) continue; // only for layer 1 - // for triplets choose best tip! - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); - im = it; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, phitToTuple->nbins(), [&](uint32_t idx) { + if (hitToTuple.size(idx) >= 2) { + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; } - } - // mark duplicates - for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { - auto const it = *ip; - if (quality[it] != bad && it != im) - quality[it] = dup; //no race: simple assignment of the same constant - } - } // maxNh - } // hitToTuple.size - }); // loop over hits + if (maxNh <= 3) { + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! 
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + + } // maxNh + } // hitToTuple.size + }); // loop over hits } }; @@ -617,7 +633,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; const auto np = std::min(maxPrint, foundNtuplets.nbins()); - cms::alpakatools::for_each_element_in_grid_strided(acc, np, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, np, [&](uint32_t i) { auto nh = foundNtuplets.size(i); if (nh >= 3) { printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc index 300bbf2ae..0c128ed04 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/CAHitNtupletGeneratorOnGPU.cc @@ -91,10 +91,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { PixelTrackAlpaka CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DAlpaka const& hits_d, float bfield, Queue& queue) const { - PixelTrackAlpaka tracks{cms::alpakatools::allocDeviceBuf(1u)}; + PixelTrackAlpaka tracks{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; auto* soa = alpaka::getPtrNative(tracks); - CAHitNtupletGeneratorKernels kernels(m_params, hits_d.nHits()); + CAHitNtupletGeneratorKernels kernels(m_params, hits_d.nHits(), queue); kernels.buildDoublets(hits_d, queue); kernels.launchKernels(hits_d, soa, queue); kernels.fillHitDetIndices(hits_d.view(), soa, queue); // in principle needed only if Hits not "available" diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/GPUCACell.h b/src/alpaka/plugin-PixelTriplets/alpaka/GPUCACell.h index 3e6afafd5..f1408815a 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/GPUCACell.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/GPUCACell.h @@ -30,10 +30,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { using CellNeighborsVector = CAConstants::CellNeighborsVector; using CellTracksVector = CAConstants::CellTracksVector; - using Hits = ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; + using Hits = ::ALPAKA_ACCELERATOR_NAMESPACE::TrackingRecHit2DSOAView; using hindex_type = Hits::hindex_type; - using TmpTuple = cms::alpakatools::VecArray; + using TmpTuple = ::cms::alpakatools::VecArray; using HitContainer = pixelTrack::HitContainer; using Quality = trackQuality::Quality; @@ -87,7 +87,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } else return -1; } - cms::alpakatools::threadfence(acc); + ::cms::alpakatools::threadfence(acc); return outerNeighbors().push_back(acc, t); } @@ -114,7 +114,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } else return -1; } - cms::alpakatools::threadfence(acc); + ::cms::alpakatools::threadfence(acc); return tracks().push_back(acc, t); } @@ -329,7 +329,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, - cms::alpakatools::AtomicPairCounter& apc, + ::cms::alpakatools::AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& 
tmpNtuplet, const unsigned int minHitsPerNtuplet, diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc index a45d87af1..acd2932ee 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/PixelTrackSoAFromAlpaka.cc @@ -5,6 +5,7 @@ #include "Framework/Event.h" #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" +#include "AlpakaCore/ScopedContext.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -74,13 +75,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = cms::alpakatools::allocHostBuf(1u); - Queue queue(device); - alpaka::memcpy(queue, outputData, inputData, 1u); - alpaka::wait(queue); + auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(outputData)); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc index d30c1373f..fdce8ef07 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc +++ b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.cc @@ -12,25 +12,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const auto blockSize = 64; const auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; - const WorkDiv1D workDivTriplets = cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); - const WorkDiv1D workDivQuadsPenta = - cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); + const WorkDiv1D workDivTriplets = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + const WorkDiv1D workDivQuadsPenta = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks / 4), Vec1D::all(blockSize)); // Fit internals - auto hitsGPU_ = cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / - sizeof(double)); + auto hitsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); + auto hits_geGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = - cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); + auto fast_fit_resultsGPU_ = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double)); //auto circle_fit_resultsGPU_holder = //cms::cuda::make_device_unique(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit), stream); //Rfit::circle_fit *circle_fit_resultsGPU_ = (Rfit::circle_fit *)(circle_fit_resultsGPU_holder.get()); - //auto 
circle_fit_resultsGPU_holder = cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); - auto circle_fit_resultsGPU_ = cms::alpakatools::allocDeviceBuf(maxNumberOfConcurrentFits_); + //auto circle_fit_resultsGPU_holder = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxNumberOfConcurrentFits_ * sizeof(Rfit::circle_fit)); + auto circle_fit_resultsGPU_ = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(maxNumberOfConcurrentFits_); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // triplets diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.h b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.h index 7ae373f32..bdc4573b0 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/RiemannFitOnGPU.h @@ -47,43 +47,44 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif const auto nt = Rfit::maxNumberOfConcurrentFits(); - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t local_idx) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) - return; - - // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - ALPAKA_ASSERT_OFFLOAD(tkid < foundNtuplets->nbins()); - - ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits); - - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); - - // Prepare data structure - auto const *hitId = foundNtuplets->begin(tkid); - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; - // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); - float ge[6]; - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); - // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); - hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; - } - Rfit::Fast_fit(hits, fast_fit); - - // no NaN here.... 
- ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); - ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, nt, [&](uint32_t local_idx) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + return; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + ALPAKA_ASSERT_OFFLOAD(tkid < foundNtuplets->nbins()); + + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits); + + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + float ge[6]; + hhp->cpeParams() + .detParams(hhp->detectorIndex(hit)) + .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + Rfit::Fast_fit(hits, fast_fit); + + // no NaN here.... + ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); + }); } // kernel operator() }; // struct @@ -107,28 +108,29 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // look in bin for this hit multiplicity const auto nt = Rfit::maxNumberOfConcurrentFits(); - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t local_idx) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) - return; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, nt, [&](uint32_t local_idx) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + return; - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); - Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + Rfit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); - Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); - Rfit::loadCovariance2D(hits_ge, hits_cov); + Rfit::Matrix2Nd hits_cov = Rfit::Matrix2Nd::Zero(); + Rfit::loadCovariance2D(hits_ge, hits_cov); - circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); + circle_fit[local_idx] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); #ifdef RIEMANN_DEBUG // auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); // printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, // circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2)); #endif - }); + }); } // kernel operator() }; // struct @@ -154,47 +156,48 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // look in bin for this hit multiplicity const auto nt 
= Rfit::maxNumberOfConcurrentFits(); - cms::alpakatools::for_each_element_in_grid_strided(acc, nt, [&](uint32_t local_idx) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) - return; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, nt, [&](uint32_t local_idx) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + return; - // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + // get it for the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); - Rfit::Map3xNd hits(phits + local_idx); - Rfit::Map4d fast_fit(pfast_fit_input + local_idx); - Rfit::Map6xNf hits_ge(phits_ge + local_idx); + Rfit::Map3xNd hits(phits + local_idx); + Rfit::Map4d fast_fit(pfast_fit_input + local_idx); + Rfit::Map6xNf hits_ge(phits_ge + local_idx); - auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); + auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_idx], fast_fit, B, true); - Rfit::fromCircleToPerigee(circle_fit[local_idx]); + Rfit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); - results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + results->stateAtBS.copyFromCircle( + circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(B), tkid); + results->pt(tkid) = B / std::abs(circle_fit[local_idx].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG - printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", - N, - nHits, - tkid, - circle_fit[local_idx].par(0), - circle_fit[local_idx].par(1), - circle_fit[local_idx].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); - printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", - circle_fit[local_idx].chi2, - line_fit.chi2, - circle_fit[local_idx].cov(0, 0), - circle_fit[local_idx].cov(1, 1), - circle_fit[local_idx].cov(2, 2), - line_fit.cov(0, 0), - line_fit.cov(1, 1)); + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, + circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); #endif - }); + }); } // kernel operator() }; // struct diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/gpuFishbone.h b/src/alpaka/plugin-PixelTriplets/alpaka/gpuFishbone.h index afbdca1df..f52240800 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/gpuFishbone.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/gpuFishbone.h @@ -43,16 +43,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const uint32_t dimIndexX = 1u; const uint32_t 
blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, 0u, dimIndexX); // Outermost loop on Y const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_grid(acc, 0u, dimIndexY); uint32_t firstElementIdxY = firstElementIdxNoStrideY; uint32_t endElementIdxY = endElementIdxNoStrideY; for (uint32_t idy = firstElementIdxY, nt = nHits; idy < nt; ++idy) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt)) break; @@ -88,7 +88,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t firstElementIdxX = firstElementIdxNoStrideX; uint32_t endElementIdxX = endElementIdxNoStrideX; for (uint32_t ic = firstElementIdxX; ic < sg - 1; ++ic) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1)) break; diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoublets.h b/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoublets.h index 3cf3bdc91..32dc638c3 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoublets.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoublets.h @@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { CellTracksVector* cellTracks, CellTracks* cellTracksContainer) const { ALPAKA_ASSERT_OFFLOAD(isOuterHitOfCell); - cms::alpakatools::for_each_element_in_grid_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( acc, nHits, [&](uint32_t i) { isOuterHitOfCell[i].reset(); }); const uint32_t threadIdx(alpaka::getIdx(acc)[0u]); diff --git a/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoubletsAlgos.h b/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoubletsAlgos.h index e735d4e22..9394c6275 100644 --- a/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoubletsAlgos.h +++ b/src/alpaka/plugin-PixelTriplets/alpaka/gpuPixelDoubletsAlgos.h @@ -90,16 +90,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // X runs faster const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = - cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, 0u, dimIndexX); // Outermost loop on Y const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = - cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_grid(acc, 0u, dimIndexY); uint32_t firstElementIdxY = firstElementIdxNoStrideY; uint32_t endElementIdxY = endElementIdxNoStrideY; for (uint32_t j = firstElementIdxY; j < ntot; ++j) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( j, firstElementIdxY, endElementIdxY, gridDimensionY, 
ntot)) break; @@ -224,7 +224,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t firstElementIdxX = firstElementIdxNoStrideX; uint32_t endElementIdxX = endElementIdxNoStrideX; for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; ++pIndex) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( pIndex, firstElementIdxX, endElementIdxX, blockDimensionX, maxpIndex)) break; diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc index fdfbdfa74..aa4401f30 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexProducerAlpaka.cc @@ -1,3 +1,4 @@ + #include "AlpakaCore/alpakaCommon.h" #include "AlpakaDataFormats/PixelTrackAlpaka.h" @@ -8,6 +9,7 @@ #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" +#include "AlpakaCore/ScopedContext.h" #include "gpuVertexFinder.h" @@ -49,8 +51,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& tracksBuf = iEvent.get(tokenTrack_); auto const tracks = alpaka::getPtrNative(tracksBuf); - Queue queue(device); - iEvent.emplace(tokenVertex_, m_gpuAlgo.makeAsync(tracks, m_ptMin, queue)); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent, + tokenVertex_, + m_gpuAlgo.makeAsync(tracks, m_ptMin, ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc index a82c356de..acee48940 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/PixelVertexSoAFromAlpaka.cc @@ -5,6 +5,7 @@ #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "Framework/RunningAverage.h" +#include "AlpakaCore/ScopedContext.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -55,13 +56,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { void PixelVertexSoAFromAlpaka::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto const& inputData = iEvent.get(tokenAlpaka_); - auto outputData = cms::alpakatools::allocHostBuf(1u); - Queue queue(device); - alpaka::memcpy(queue, outputData, inputData, 1u); - alpaka::wait(queue); + auto outputData = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + alpaka::memcpy(ctx.stream(), outputData, inputData, 1u); // No copies.... 
- iEvent.emplace(tokenSOA_, std::move(outputData)); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, tokenSOA_, std::move(outputData)); #endif } diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksByDensity.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksByDensity.h index 51f6752e0..33f65d9bf 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksByDensity.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksByDensity.h @@ -51,11 +51,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(pdata); ALPAKA_ASSERT_OFFLOAD(zt); - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::syncBlockThreads(acc); if (verbose && 0 == threadIdxLocal) @@ -64,7 +65,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { ALPAKA_ASSERT_OFFLOAD(i < ZVertices::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only @@ -78,7 +79,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, 32, [&](uint32_t i) { hws[i] = 0; // used by prefix scan... }); @@ -87,12 +88,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, nt, [&](uint32_t i) { hist.fill(acc, izt[i], uint16_t(i)); }); alpaka::syncBlockThreads(acc); // count neighbours - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (ezt2[i] <= er2mx) { auto loop = [&](uint32_t j) { if (i == j) @@ -105,14 +106,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { nn[i]++; }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); alpaka::syncBlockThreads(acc); // find closest above me .... (we ignore the possibility of two j at same distance from i) - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { float mdist = eps; auto loop = [&](uint32_t j) { if (nn[j] < nn[i]) @@ -127,14 +128,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { mdist = dist; iv[i] = j; // assign to cluster (better be unique??) 
}; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); }); alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG // mini verification - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); }); @@ -142,7 +143,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // consolidate graph (percolate index of seed) - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { auto m = iv[i]; while (m != iv[m]) m = iv[m]; @@ -152,7 +153,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef GPU_DEBUG alpaka::syncBlockThreads(acc); // mini verification - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); }); @@ -160,7 +161,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef GPU_DEBUG // and verify that we did not spit any cluster... - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { auto minJ = i; auto mdist = eps; auto loop = [&](uint32_t j) { @@ -176,7 +177,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { mdist = dist; minJ = j; }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); // should belong to the same cluster... ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[minJ]); ALPAKA_ASSERT_OFFLOAD(nn[i] <= nn[iv[i]]); @@ -190,7 +191,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; // mark these tracks with a negative id. - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Blocks{}); @@ -205,7 +206,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(foundClusters < ZVertices::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
- cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -214,7 +215,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); nvIntermediate = nvFinal = foundClusters; diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksDBSCAN.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksDBSCAN.h index 0f34ec254..e2ebf9e7c 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksDBSCAN.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksDBSCAN.h @@ -47,10 +47,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(pdata); ALPAKA_ASSERT_OFFLOAD(zt); - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::syncBlockThreads(acc); if (verbose && 0 == threadIdxLocal) @@ -59,7 +60,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { ALPAKA_ASSERT_OFFLOAD(i < ZVertices::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only @@ -73,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, 32, [&](uint32_t i) { hws[i] = 0; // used by prefix scan... }); @@ -81,12 +82,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, nt, [&](uint32_t i) { hist.fill(acc, izt[i], uint16_t(i)); }); alpaka::syncBlockThreads(acc); // count neighbours - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (ezt2[i] <= er2mx) { auto loop = [&](uint32_t j) { if (i == j) @@ -98,13 +99,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { nn[i]++; }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); alpaka::syncBlockThreads(acc); // find NN with smaller z... 
- cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (nn[i] >= minT) { // DBSCAN core rule float mz = zt[i]; auto loop = [&](uint32_t j) { @@ -119,14 +120,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { mz = zt[j]; iv[i] = j; // assign to cluster (better be unique??) }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG // mini verification - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); }); @@ -134,7 +135,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif // consolidate graph (percolate index of seed) - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { auto m = iv[i]; while (m != iv[m]) m = iv[m]; @@ -144,7 +145,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef GPU_DEBUG // mini verification - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] != int(i)) ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); }); @@ -153,7 +154,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef GPU_DEBUG // and verify that we did not spit any cluster... - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (nn[i] >= minT) { // DBSCAN core rule ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]); auto loop = [&](uint32_t j) { @@ -171,14 +172,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[j]); }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); alpaka::syncBlockThreads(acc); #endif // collect edges (assign to closest cluster of closest point??? here to closest point) - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule if (nn[i] < minT) { // DBSCAN edge rule float mdist = eps; @@ -193,7 +194,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { mdist = dist; iv[i] = iv[j]; // assign to cluster (better be unique??) }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); @@ -203,7 +204,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // find the number of different clusters, identified by a tracks with clus[i] == i; // mark these tracks with a negative id. 
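The consolidation pass seen above is a find-root with path shortening: each entry walks its parent chain until the fixed point iv[m] == m and is then pointed straight at it, so the counting pass below can treat iv[i] as the seed index in O(1). A minimal reference:

#include <vector>

inline int findRoot(std::vector<int>& iv, int i) {
  int m = iv[i];
  while (m != iv[m])  // walk the parent chain until the fixed point
    m = iv[m];
  iv[i] = m;          // shortcut: point i straight at the root
  return m;
}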
- cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Blocks{}); @@ -218,7 +219,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(foundClusters < ZVertices::MAXVTX); // propagate the negative id to all the tracks in the cluster. - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -227,7 +228,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); nvIntermediate = nvFinal = foundClusters; diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksIterative.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksIterative.h index df9469e45..875eddeaf 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksIterative.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuClusterTracksIterative.h @@ -47,10 +47,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(pdata); ALPAKA_ASSERT_OFFLOAD(zt); - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); auto& hws = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::syncBlockThreads(acc); if (verbose && 0 == threadIdxLocal) @@ -59,7 +60,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nt <= hist.capacity()); // fill hist (bin shall be wider than "eps") - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { ALPAKA_ASSERT_OFFLOAD(i < ZVertices::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only @@ -73,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, 32, [&](uint32_t i) { hws[i] = 0; // used by prefix scan... 
}); @@ -81,12 +82,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, nt, [&](uint32_t i) { hist.fill(acc, izt[i], uint16_t(i)); }); alpaka::syncBlockThreads(acc); // count neighbours - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (ezt2[i] <= er2mx) { auto loop = [&](uint32_t j) { if (i == j) @@ -99,7 +100,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { nn[i]++; }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); @@ -112,47 +113,49 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { bool more = true; while (alpaka::syncBlockThreadsPredicate(acc, more)) { if (1 == nloops % 2) { - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { - auto m = iv[i]; - while (m != iv[m]) - m = iv[m]; - iv[i] = m; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nt, [&](uint32_t i) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + }); } else { more = false; - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t k) { - auto p = hist.begin() + k; - auto i = (*p); - auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); - if (nn[i] >= minT) { // DBSCAN core rule - auto loop = [&](uint32_t j) { - ALPAKA_ASSERT_OFFLOAD(i != j); - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; - auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Blocks{}); - if (old != iv[i]) { - // end the loop only if no changes were applied - more = true; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size(), [&](uint32_t k) { + auto p = hist.begin() + k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + if (nn[i] >= minT) { // DBSCAN core rule + auto loop = [&](uint32_t j) { + ALPAKA_ASSERT_OFFLOAD(i != j); + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Blocks{}); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = true; + } + alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Blocks{}); + }; + ++p; + for (; p < hist.end(be); ++p) + loop(*p); } - alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Blocks{}); - }; - ++p; - for (; p < hist.end(be); ++p) - loop(*p); - } - }); // for k + }); // for k } if (threadIdxLocal == 0) ++nloops; } // while // collect edges (assign to closest cluster of closest point??? 
here to closest point) - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule if (nn[i] < minT) { // DBSCAN edge rule float mdist = eps; @@ -167,7 +170,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { mdist = dist; iv[i] = iv[j]; // assign to cluster (better be unique??) }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + ::cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); } }); @@ -178,7 +181,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // find the number of different clusters, identified by a tracks with clus[i] == i; // mark these tracks with a negative id. - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] == int(i)) { if (nn[i] >= minT) { auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Blocks{}); @@ -193,7 +196,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(foundClusters < ZVertices::MAXVTX); // propagate the negative id to all the tracks in the cluster. - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] >= 0) { // mark each track in a cluster with the same id as the first one iv[i] = iv[iv[i]]; @@ -202,7 +205,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nt, [&](uint32_t i) { iv[i] = -iv[i] - 1; }); nvIntermediate = nvFinal = foundClusters; diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuFitVertices.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuFitVertices.h index 54918645f..bd3e7c6ac 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuFitVertices.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuFitVertices.h @@ -42,11 +42,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto foundClusters = nvFinal; // zero - cms::alpakatools::for_each_element_in_block_strided(acc, foundClusters, [&](uint32_t i) { - zv[i] = 0; - wv[i] = 0; - chi2[i] = 0; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, foundClusters, [&](uint32_t i) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + }); // only for test auto& noise = alpaka::declareSharedVar(acc); @@ -57,7 +58,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // compute cluster location - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] > 9990) { if (verbose) alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Blocks{}); @@ -72,15 +73,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // reuse nn - cms::alpakatools::for_each_element_in_block_strided(acc, foundClusters, [&](uint32_t i) { - ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); - zv[i] /= wv[i]; - nn[i] = -1; // ndof - }); + 
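gpuFitVertices accumulates each cluster position as an error-weighted mean of its track z values; the accumulation step itself is elided in this hunk, so the weight choice w = 1/ezt2 in the sketch below is an assumption inferred from the chi2 expression that follows. A single-vertex host reference (the kernel's per-track chi2 acceptance cut is omitted):

#include <cassert>
#include <vector>

struct FitResult { float zv, wv, chi2; int ndof; };

inline FitResult fitVertex(const std::vector<float>& zt, const std::vector<float>& ezt2) {
  FitResult r{0.f, 0.f, 0.f, -1};
  for (size_t i = 0; i < zt.size(); ++i) {   // accumulate weighted sums
    float w = 1.f / ezt2[i];                 // assumed weight, see lead-in
    r.zv += zt[i] * w;
    r.wv += w;
  }
  assert(r.wv > 0.f);
  r.zv /= r.wv;                              // weighted mean position
  for (size_t i = 0; i < zt.size(); ++i) {   // chi2 of the fit
    float c = r.zv - zt[i];
    r.chi2 += c * c / ezt2[i];
    ++r.ndof;                                // nn[i] starts at -1 (ndof)
  }
  if (r.ndof > 0)
    r.wv *= float(r.ndof) / r.chi2;          // reweight by ndof/chi2
  return r;
}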
::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, foundClusters, [&](uint32_t i) { + ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + }); alpaka::syncBlockThreads(acc); // compute chi2 - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] <= 9990) { auto c2 = zv[iv[i]] - zt[i]; c2 *= c2 / ezt2[i]; @@ -94,10 +96,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, foundClusters, [&](uint32_t i) { - if (nn[i] > 0) - wv[i] *= float(nn[i]) / chi2[i]; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, foundClusters, [&](uint32_t i) { + if (nn[i] > 0) + wv[i] *= float(nn[i]) / chi2[i]; + }); if (verbose && 0 == threadIdxLocal) printf("found %d proto clusters ", foundClusters); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h index f10206cc5..5e6d46e5b 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h @@ -33,13 +33,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return; // fill indexing - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { data.idv[ws.itrk[i]] = iv[i]; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nt, [&](uint32_t i) { data.idv[ws.itrk[i]] = iv[i]; }); // can be done asynchronoisly at the end of previous event - cms::alpakatools::for_each_element_in_block_strided(acc, nvFinal, [&](uint32_t i) { ptv2[i] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nvFinal, [&](uint32_t i) { ptv2[i] = 0; }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t i) { if (iv[i] <= 9990) { alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{}); } @@ -55,7 +57,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED auto& sws = alpaka::declareSharedVar(acc); // sort using only 16 bits - cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::radixSort(acc, ptv2, sortInd, sws, nvFinal); #else for (uint16_t i = 0; i < nvFinal; ++i) sortInd[i] = i; diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h index 5e8589d44..28ff25cae 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h @@ -70,7 +70,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::syncBlockThreads(acc); // copy to local - cms::alpakatools::for_each_element_in_block_strided(acc, nt, [&](uint32_t k) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nt, [&](uint32_t k) { if (iv[k] == int(kv)) { auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Blocks{}); zz[old] = zt[k] - zv[kv]; @@ -100,7 +100,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } alpaka::syncBlockThreads(acc); - 
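gpuSortByPt2 ranks vertices by the summed pt^2 of their tracks: ptv2 is accumulated with atomicAdd above, then argsorted. On the GPU this uses the 16-bit radixSort; the #else branch only seeds the identity permutation. A complete host-side equivalent, assuming (as the radix sort is taken to do) an ascending ordering:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

inline std::vector<uint16_t> sortByPt2(const std::vector<float>& ptv2) {
  std::vector<uint16_t> sortInd(ptv2.size());
  std::iota(sortInd.begin(), sortInd.end(), uint16_t(0));  // identity, as in the CPU branch
  std::sort(sortInd.begin(), sortInd.end(),
            [&](uint16_t a, uint16_t b) { return ptv2[a] < ptv2[b]; });
  return sortInd;
}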
cms::alpakatools::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { auto i = newV[k]; alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Blocks{}); alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Blocks{}); @@ -113,7 +113,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { auto d0 = fabs(zz[k] - znew[0]); auto d1 = fabs(zz[k] - znew[1]); auto newer = d0 < d1 ? 0 : 1; @@ -144,7 +144,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (0 == threadIdxLocal) igv = alpaka::atomicAdd(acc, &ws.nvIntermediate, 1u, alpaka::hierarchy::Blocks{}); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nq, [&](uint32_t k) { if (1 == newV[k]) iv[it[k]] = igv; }); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc index 0aead8540..124829f75 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.cc @@ -22,31 +22,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const& fit = tracks.stateAtBS; auto const* quality = tracks.qualityData(); - cms::alpakatools::for_each_element_in_grid_strided(acc, TkSoA::stride(), [&](uint32_t idx) { - auto nHits = tracks.nHits(idx); - if (nHits == 0) - return; // this is a guard: maybe we need to move to nTracks... - - // initialize soa... - soa->idv[idx] = -1; - - if (nHits < 4) - return; // no triplets - if (quality[idx] != trackQuality::loose) - return; - - auto pt = tracks.pt(idx); - - if (pt < ptMin) - return; - - auto& data = *pws; - auto it = alpaka::atomicAdd(acc, &data.ntrks, 1u, alpaka::hierarchy::Blocks{}); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, TkSoA::stride(), [&](uint32_t idx) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + return; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... 
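The splitter treats an overloaded vertex as a weighted two-means problem in z: each local track carries an assignment newV[k] in {0, 1}, the two candidate positions are recomputed as weighted means (the znew/wnew accumulation above), and tracks then re-pick the nearer candidate via d0/d1. One refinement step as a host sketch, with the initial assignment and the kernel's convergence loop elided:

#include <cmath>
#include <vector>

inline void splitStep(const std::vector<float>& zz, const std::vector<float>& ww,
                      std::vector<int>& newV /* 0 or 1 per track */) {
  float znew[2] = {0.f, 0.f}, wnew[2] = {0.f, 0.f};
  for (size_t k = 0; k < zz.size(); ++k) {   // weighted sums per candidate
    znew[newV[k]] += zz[k] * ww[k];
    wnew[newV[k]] += ww[k];
  }
  for (int i = 0; i < 2; ++i)
    if (wnew[i] > 0.f)
      znew[i] /= wnew[i];                    // candidate positions
  for (size_t k = 0; k < zz.size(); ++k) {   // reassign to the nearer candidate
    float d0 = std::fabs(zz[k] - znew[0]);
    float d1 = std::fabs(zz[k] - znew[1]);
    newV[k] = d0 < d1 ? 0 : 1;
  }
}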
+ soa->idv[idx] = -1; + + if (nHits < 4) + return; // no triplets + if (quality[idx] != trackQuality::loose) + return; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + return; + + auto& data = *pws; + auto it = alpaka::atomicAdd(acc, &data.ntrks, 1u, alpaka::hierarchy::Blocks{}); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + }); } }; @@ -106,28 +107,33 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // std::cout << "producing Vertices on GPU" << std::endl; ALPAKA_ASSERT_OFFLOAD(tksoa); - ZVertexAlpaka vertices{cms::alpakatools::allocDeviceBuf(1u)}; + ZVertexAlpaka vertices{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; auto* soa = alpaka::getPtrNative(vertices); ALPAKA_ASSERT_OFFLOAD(soa); - auto ws_dBuf{cms::alpakatools::allocDeviceBuf(1u)}; + auto ws_dBuf{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; auto ws_d = alpaka::getPtrNative(ws_dBuf); - auto nvFinalVerticesView = cms::alpakatools::createDeviceView(&soa->nvFinal, 1u); + auto nvFinalVerticesView = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&soa->nvFinal, 1u); alpaka::memset(queue, nvFinalVerticesView, 0, 1u); - auto ntrksWorkspaceView = cms::alpakatools::createDeviceView(&ws_d->ntrks, 1u); + auto ntrksWorkspaceView = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&ws_d->ntrks, 1u); alpaka::memset(queue, ntrksWorkspaceView, 0, 1u); - auto nvIntermediateWorkspaceView = cms::alpakatools::createDeviceView(&ws_d->nvIntermediate, 1u); + auto nvIntermediateWorkspaceView = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(&ws_d->nvIntermediate, 1u); alpaka::memset(queue, nvIntermediateWorkspaceView, 0, 1u); const uint32_t blockSize = 128; const uint32_t numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; - const WorkDiv1D loadTracksWorkDiv = - cms::alpakatools::make_workdiv(Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); + const WorkDiv1D loadTracksWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(numberOfBlocks), Vec1D::all(blockSize)); alpaka::enqueue(queue, alpaka::createTaskKernel(loadTracksWorkDiv, loadTracks(), tksoa, soa, ws_d, ptMin)); - const WorkDiv1D finderSorterWorkDiv = cms::alpakatools::make_workdiv(Vec1D::all(1), Vec1D::all(1024 - 256)); - const WorkDiv1D splitterFitterWorkDiv = cms::alpakatools::make_workdiv(Vec1D::all(1024), Vec1D::all(128)); + const WorkDiv1D finderSorterWorkDiv = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1), Vec1D::all(1024 - 256)); + const WorkDiv1D splitterFitterWorkDiv = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1024), Vec1D::all(128)); if (oneKernel_) { // implemented only for density clustesrs diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h index ad16972c3..909b2bf49 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuVertexFinder.h @@ -1,3 +1,4 @@ + #ifndef RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h #define RecoPixelVertexing_PixelVertexFinding_src_gpuVertexFinder_h diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc index 835c7743e..6de5aea11 100644 --- 
a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelFedCablingMapESProducer.cc @@ -31,13 +31,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Queue queue(device); - auto cablingMap_h{cms::alpakatools::createHostView(&obj, 1u)}; - auto cablingMap_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto cablingMap_h{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&obj, 1u)}; + auto cablingMap_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(cablingMap_d); alpaka::memcpy(queue, cablingMap_d, cablingMap_h, 1u); eventSetup.put(std::make_unique(std::move(cablingMap_d), true)); - auto modToUnp_h{cms::alpakatools::createHostView(modToUnpDefault.data(), modToUnpDefSize)}; - auto modToUnp_d{cms::alpakatools::allocDeviceBuf(modToUnpDefSize)}; + auto modToUnp_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + modToUnpDefault.data(), modToUnpDefSize)}; + auto modToUnp_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(modToUnpDefSize)}; + alpaka::prepareForAsyncCopy(modToUnp_d); alpaka::memcpy(queue, modToUnp_d, modToUnp_h, modToUnpDefSize); alpaka::wait(queue); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc index 1a922789f..a54b5f57b 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelGainCalibrationForHLTESProducer.cc @@ -45,18 +45,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { Queue queue(device); const uint32_t numDecodingStructures = gainData.size() / sizeof(SiPixelGainForHLTonGPU_DecodingStructure); - auto ped_h{cms::alpakatools::createHostView( - reinterpret_cast(gainData.data()), numDecodingStructures)}; - auto ped_d{cms::alpakatools::allocDeviceBuf(numDecodingStructures)}; + auto ped_h{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + reinterpret_cast(gainData.data()), numDecodingStructures)}; + auto ped_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf( + numDecodingStructures)}; + alpaka::prepareForAsyncCopy(ped_d); alpaka::memcpy(queue, ped_d, ped_h, numDecodingStructures); auto rangeAndCols_h{ - cms::alpakatools::createHostView(gain.rangeAndCols, 2000u)}; - auto rangeAndCols_d{cms::alpakatools::allocDeviceBuf(2000u)}; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + gain.rangeAndCols, 2000u)}; + auto rangeAndCols_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(2000u)}; + alpaka::prepareForAsyncCopy(rangeAndCols_d); alpaka::memcpy(queue, rangeAndCols_d, rangeAndCols_h, 2000u); - auto fields_h{cms::alpakatools::createHostView(&gain.fields_, 1u)}; - auto fields_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto fields_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + &gain.fields_, 1u)}; + auto fields_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(fields_d); alpaka::memcpy(queue, fields_d, fields_h, 1u); alpaka::wait(queue); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc index 45e228ed7..6e8631a08 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc +++ 
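All of these ESProducers now stage their conditions data with the same four calls: wrap the host object in a view, allocate the device buffer, prepareForAsyncCopy it, then enqueue the copy. A schematic helper capturing the pattern (illustrative only; it assumes the createHostView/allocDeviceBuf signatures match their use in the producers above, and the caller must wait on the queue before the buffer is read):

#include <cstdint>

template <typename TQueue, typename T>
auto uploadToDevice(TQueue& queue, T* hostPtr, uint32_t n) {
  auto view_h = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView<T>(hostPtr, n);
  auto buf_d = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf<T>(n);
  alpaka::prepareForAsyncCopy(buf_d);       // make the destination safe for asynchronous copies
  alpaka::memcpy(queue, buf_d, view_h, n);  // enqueued, not synchronous
  return buf_d;
}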
b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToCluster.cc @@ -12,6 +12,7 @@ #include "Framework/Event.h" #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" +#include "AlpakaCore/ScopedContext.h" #include "../ErrorChecker.h" #include "SiPixelRawToClusterGPUKernel.h" @@ -32,6 +33,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { private: void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ContextState ctxState_; + edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT digiPutToken_; edm::EDPutTokenT digiErrorPutToken_; @@ -137,7 +140,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // end of for loop - Queue queue(device); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; gpuAlgo_.makeClustersAsync(isRun2_, gpuMap, gpuModulesToUnpack, @@ -149,16 +153,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { useQuality_, includeErrors_, false, // debug - queue); - - // TODO: synchronize explicitly for now - alpaka::wait(queue); + ctx.stream()); auto tmp = gpuAlgo_.getResults(); - iEvent.emplace(digiPutToken_, std::move(tmp.first)); - iEvent.emplace(clusterPutToken_, std::move(tmp.second)); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, clusterPutToken_, std::move(tmp.second)); if (includeErrors_) { - iEvent.emplace(digiErrorPutToken_, gpuAlgo_.getErrors()); + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); } } diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc index 15605dec0..2f10f0a61 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc @@ -32,8 +32,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelgpudetails { SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() - : word_{cms::alpakatools::allocHostBuf(MAX_FED_WORDS)}, - fedId_{cms::alpakatools::allocHostBuf(MAX_FED_WORDS)} {} + : word_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(MAX_FED_WORDS)}, + fedId_{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(MAX_FED_WORDS)} {} void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, @@ -361,114 +361,115 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - cms::alpakatools::SimpleVector *err, + ::cms::alpakatools::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, wordCounter, [&](uint32_t iloop) { - auto gIndex = iloop; - xx[gIndex] = 0; - yy[gIndex] = 0; - adc[gIndex] = 0; - bool skipROC = false; - - uint8_t fedId = fedIds[gIndex / 2]; // +1200; - - // initialize (too many coninue below) - pdigi[gIndex] = 0; - rawIdArr[gIndex] = 0; - moduleId[gIndex] = 9999; - - uint32_t ww = word[gIndex]; // Array containing 32 bit raw data - if (ww == 0) { - // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization - return; - } - - uint32_t link = getLink(ww); // Extract link - uint32_t roc = getRoc(ww); // Extract Roc in link - ::pixelgpudetails::DetIdGPU detId = 
getRawId(cablingMap, fedId, link, roc); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, wordCounter, [&](uint32_t iloop) { + auto gIndex = iloop; + xx[gIndex] = 0; + yy[gIndex] = 0; + adc[gIndex] = 0; + bool skipROC = false; + + uint8_t fedId = fedIds[gIndex / 2]; // +1200; + + // initialize (too many coninue below) + pdigi[gIndex] = 0; + rawIdArr[gIndex] = 0; + moduleId[gIndex] = 9999; + + uint32_t ww = word[gIndex]; // Array containing 32 bit raw data + if (ww == 0) { + // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization + return; + } - uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); - skipROC = (roc < ::pixelgpudetails::maxROCIndex) ? false : (errorType != 0); - if (includeErrors and skipROC) { - uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err->push_back(acc, PixelErrorCompact{rID, ww, errorType, fedId}); - return; - } + uint32_t link = getLink(ww); // Extract link + uint32_t roc = getRoc(ww); // Extract Roc in link + ::pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); - uint32_t rawId = detId.RawId; - uint32_t rocIdInDetUnit = detId.rocInDet; - bool barrel = isBarrel(rawId); + uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); + skipROC = (roc < ::pixelgpudetails::maxROCIndex) ? false : (errorType != 0); + if (includeErrors and skipROC) { + uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); + err->push_back(acc, PixelErrorCompact{rID, ww, errorType, fedId}); + return; + } - uint32_t index = fedId * ::pixelgpudetails::MAX_LINK * ::pixelgpudetails::MAX_ROC + - (link - 1) * ::pixelgpudetails::MAX_ROC + roc; - if (useQualityInfo) { - skipROC = cablingMap->badRocs[index]; - if (skipROC) - return; - } - skipROC = modToUnp[index]; - if (skipROC) - return; + uint32_t rawId = detId.RawId; + uint32_t rocIdInDetUnit = detId.rocInDet; + bool barrel = isBarrel(rawId); - uint32_t layer = 0; //, ladder =0; - int side = 0, panel = 0, module = 0; //disk = 0, blade = 0 + uint32_t index = fedId * ::pixelgpudetails::MAX_LINK * ::pixelgpudetails::MAX_ROC + + (link - 1) * ::pixelgpudetails::MAX_ROC + roc; + if (useQualityInfo) { + skipROC = cablingMap->badRocs[index]; + if (skipROC) + return; + } + skipROC = modToUnp[index]; + if (skipROC) + return; - if (barrel) { - layer = (rawId >> ::pixelgpudetails::layerStartBit) & ::pixelgpudetails::layerMask; - module = (rawId >> ::pixelgpudetails::moduleStartBit) & ::pixelgpudetails::moduleMask; - side = (module < 5) ? -1 : 1; - } else { - // endcap ids - layer = 0; - panel = (rawId >> ::pixelgpudetails::panelStartBit) & ::pixelgpudetails::panelMask; - //disk = (rawId >> diskStartBit_) & diskMask_; - side = (panel == 1) ? -1 : 1; - //blade = (rawId >> bladeStartBit_) & bladeMask_; - } + uint32_t layer = 0; //, ladder =0; + int side = 0, panel = 0, module = 0; //disk = 0, blade = 0 + + if (barrel) { + layer = (rawId >> ::pixelgpudetails::layerStartBit) & ::pixelgpudetails::layerMask; + module = (rawId >> ::pixelgpudetails::moduleStartBit) & ::pixelgpudetails::moduleMask; + side = (module < 5) ? -1 : 1; + } else { + // endcap ids + layer = 0; + panel = (rawId >> ::pixelgpudetails::panelStartBit) & ::pixelgpudetails::panelMask; + //disk = (rawId >> diskStartBit_) & diskMask_; + side = (panel == 1) ? 
-1 : 1; + //blade = (rawId >> bladeStartBit_) & bladeMask_; + } - // ***special case of layer to 1 be handled here - ::pixelgpudetails::Pixel localPix; - if (layer == 1) { - uint32_t col = (ww >> ::pixelgpudetails::COL_shift) & ::pixelgpudetails::COL_mask; - uint32_t row = (ww >> ::pixelgpudetails::ROW_shift) & ::pixelgpudetails::ROW_mask; - localPix.row = row; - localPix.col = col; - if (includeErrors) { - if (not rocRowColIsValid(row, col)) { - uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); - if (debug) - printf("BPIX1 Error status: %i\n", error); - return; + // ***special case of layer to 1 be handled here + ::pixelgpudetails::Pixel localPix; + if (layer == 1) { + uint32_t col = (ww >> ::pixelgpudetails::COL_shift) & ::pixelgpudetails::COL_mask; + uint32_t row = (ww >> ::pixelgpudetails::ROW_shift) & ::pixelgpudetails::ROW_mask; + localPix.row = row; + localPix.col = col; + if (includeErrors) { + if (not rocRowColIsValid(row, col)) { + uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if (debug) + printf("BPIX1 Error status: %i\n", error); + return; + } + } + } else { + // ***conversion rules for dcol and pxid + uint32_t dcol = (ww >> ::pixelgpudetails::DCOL_shift) & ::pixelgpudetails::DCOL_mask; + uint32_t pxid = (ww >> ::pixelgpudetails::PXID_shift) & ::pixelgpudetails::PXID_mask; + uint32_t row = ::pixelgpudetails::numRowsInRoc - pxid / 2; + uint32_t col = dcol * 2 + pxid % 2; + localPix.row = row; + localPix.col = col; + if (includeErrors and not dcolIsValid(dcol, pxid)) { + uint8_t error = conversionError(fedId, 3, debug); + err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); + if (debug) + printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); + return; + } } - } - } else { - // ***conversion rules for dcol and pxid - uint32_t dcol = (ww >> ::pixelgpudetails::DCOL_shift) & ::pixelgpudetails::DCOL_mask; - uint32_t pxid = (ww >> ::pixelgpudetails::PXID_shift) & ::pixelgpudetails::PXID_mask; - uint32_t row = ::pixelgpudetails::numRowsInRoc - pxid / 2; - uint32_t col = dcol * 2 + pxid % 2; - localPix.row = row; - localPix.col = col; - if (includeErrors and not dcolIsValid(dcol, pxid)) { - uint8_t error = conversionError(fedId, 3, debug); - err->push_back(acc, PixelErrorCompact{rawId, ww, error, fedId}); - if (debug) - printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); - return; - } - } - ::pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); - xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 - yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 - adc[gIndex] = getADC(ww); - pdigi[gIndex] = ::pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); - moduleId[gIndex] = detId.moduleId; - rawIdArr[gIndex] = rawId; - }); // end of stride on grid + ::pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix); + xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 + yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 + adc[gIndex] = getADC(ww); + pdigi[gIndex] = ::pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); + moduleId[gIndex] = detId.moduleId; + rawIdArr[gIndex] = rawId; + }); // end of stride on grid } // end of Raw to Digi kernel operator() }; // end of Raw to Digi 
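For non-layer-1 ROCs the local row and column are recovered from the double-column (dcol) and in-column pixel id (pxid) fields of the 32-bit raw word, exactly as in the conversion rules above. A reference decode with the shift and mask values left as parameters (the concrete constants live in ::pixelgpudetails and are not reproduced here):

#include <cstdint>

struct LocalPixel { uint32_t row, col; };

inline LocalPixel decodeDcolPxid(uint32_t ww,
                                 uint32_t dcolShift, uint32_t dcolMask,
                                 uint32_t pxidShift, uint32_t pxidMask,
                                 uint32_t numRowsInRoc) {
  uint32_t dcol = (ww >> dcolShift) & dcolMask;  // double-column index
  uint32_t pxid = (ww >> pxidShift) & pxidMask;  // pixel id inside the double column
  return {numRowsInRoc - pxid / 2,               // rows are counted top-down
          dcol * 2 + pxid % 2};                  // two columns per dcol
}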
struct @@ -493,16 +494,17 @@ namespace pixelgpudetails { #endif // limit to MaxHitsInModule; - cms::alpakatools::for_each_element_in_block_strided(acc, gpuClustering::MaxNumModules, [&](uint32_t i) { - moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, gpuClustering::MaxNumModules, [&](uint32_t i) { + moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]); + }); auto &&ws = alpaka::declareSharedVar(acc); - cms::alpakatools::blockPrefixScan(acc, moduleStart + 1, moduleStart + 1, 1024, ws); - cms::alpakatools::blockPrefixScan( + ::cms::alpakatools::blockPrefixScan(acc, moduleStart + 1, moduleStart + 1, 1024, ws); + ::cms::alpakatools::blockPrefixScan( acc, moduleStart + 1025, moduleStart + 1025, gpuClustering::MaxNumModules - 1024, ws); - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, gpuClustering::MaxNumModules + 1, 1025u, [&](uint32_t i) { moduleStart[i] += moduleStart[1024]; }); alpaka::syncBlockThreads(acc); @@ -515,22 +517,24 @@ namespace pixelgpudetails { ALPAKA_ASSERT_OFFLOAD(moduleStart[gpuClustering::MaxNumModules] >= moduleStart[1025]); //for (int i = first, iend = gpuClustering::MaxNumModules + 1; i < iend; i += blockDim.x) { - cms::alpakatools::for_each_element_in_block_strided(acc, gpuClustering::MaxNumModules + 1, [&](uint32_t i) { - if (0 != i) - ALPAKA_ASSERT_OFFLOAD(moduleStart[i] >= moduleStart[i - i]); - // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] - // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] - if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::MaxNumModules) - printf("moduleStart %d %d\n", i, moduleStart[i]); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, gpuClustering::MaxNumModules + 1, [&](uint32_t i) { + if (0 != i) + ALPAKA_ASSERT_OFFLOAD(moduleStart[i] >= moduleStart[i - i]); + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + if (i == 96 || i == 1184 || i == 1744 || i == gpuClustering::MaxNumModules) + printf("moduleStart %d %d\n", i, moduleStart[i]); + }); #endif // avoid overflow constexpr auto MAX_HITS = gpuClustering::MaxNumClusters; - cms::alpakatools::for_each_element_in_block_strided(acc, gpuClustering::MaxNumModules + 1, [&](uint32_t i) { - if (moduleStart[i] > MAX_HITS) - moduleStart[i] = MAX_HITS; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, gpuClustering::MaxNumModules + 1, [&](uint32_t i) { + if (moduleStart[i] > MAX_HITS) + moduleStart[i] = MAX_HITS; + }); } // end of fillHitsModuleStart kernel operator() }; // end of fillHitsModuleStart struct @@ -561,7 +565,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { digis_d = SiPixelDigisAlpaka(pixelgpudetails::MAX_FED_WORDS); if (includeErrors) { - digiErrors_d = SiPixelDigiErrorsAlpaka(pixelgpudetails::MAX_FED_WORDS, std::move(errors)); + digiErrors_d = SiPixelDigiErrorsAlpaka(pixelgpudetails::MAX_FED_WORDS, std::move(errors), queue); } clusters_d = SiPixelClustersAlpaka(gpuClustering::MaxNumModules); @@ -575,16 +579,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #endif const uint32_t blocks = (wordCounter + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread; // fill it all - const WorkDiv1D 
&workDiv = - cms::alpakatools::make_workdiv(Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); + const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); ALPAKA_ASSERT_OFFLOAD(0 == wordCounter % 2); // wordCounter is the total no of words in each event to be trasfered on device - auto word_d = cms::alpakatools::allocDeviceBuf(wordCounter); + auto word_d = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(wordCounter); // NB: IMPORTANT: fedId_d: In legacy, wordCounter elements are allocated. // However, only the first half of elements end up eventually used: // hence, here, only wordCounter/2 elements are allocated. - auto fedId_d = cms::alpakatools::allocDeviceBuf(wordCounter / 2); + auto fedId_d = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(wordCounter / 2); alpaka::memcpy(queue, word_d, wordFed.word(), wordCounter); alpaka::memcpy(queue, fedId_d, wordFed.fedId(), wordCounter / 2); @@ -604,7 +608,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { digis_d.pdigi(), digis_d.rawIdArr(), digis_d.moduleInd(), - digiErrors_d.error(), + digiErrors_d->error(), useQualityInfo, includeErrors, debug)); @@ -634,8 +638,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const int blocks = (std::max(int(wordCounter), int(gpuClustering::MaxNumModules)) + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread; - const WorkDiv1D &workDiv = - cms::alpakatools::make_workdiv(Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); + const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, @@ -664,12 +668,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::createTaskKernel( workDiv, countModules(), digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter)); - auto moduleStartFirstElement = cms::alpakatools::createDeviceView(clusters_d.moduleStart(), 1u); + auto moduleStartFirstElement = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView(clusters_d.moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement, 1u); const WorkDiv1D &workDivMaxNumModules = - cms::alpakatools::make_workdiv(Vec1D::all(MaxNumModules), Vec1D::all(256)); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(MaxNumModules), Vec1D::all(256)); // NB: With present findClus() / chargeCut() algorithm, // threadPerBlock (GPU) or elementsPerThread (CPU) = 256 show optimal performance. // Though, it does not have to be the same number for CPU/GPU cases. 
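Every work division in this file follows the same ceiling division: enough fixed-size blocks (GPU) or element groups (CPU) to cover the work items, as in blocks = (wordCounter + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread above. Factored out as a tiny helper (hypothetical name):

#include <cstdint>

constexpr uint32_t ceilDiv(uint32_t n, uint32_t blockSize) {
  return (n + blockSize - 1) / blockSize;  // e.g. ceilDiv(wordCounter, 512)
}

static_assert(ceilDiv(1000, 256) == 4, "1000 items need 4 blocks of 256");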
@@ -712,7 +717,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // available in the rechit producer without additional points of // synchronization/ExternalWork - const WorkDiv1D &workDivOneBlock = cms::alpakatools::make_workdiv(Vec1D::all(1u), Vec1D::all(1024u)); + const WorkDiv1D &workDivOneBlock = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(1024u)); // MUST be ONE block alpaka::enqueue(queue, @@ -722,12 +728,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { clusters_d.clusModuleStart())); // last element holds the number of all clusters - auto clusModuleStartView = cms::alpakatools::createDeviceView(clusters_d.clusModuleStart(), - gpuClustering::MaxNumModules + 1); + auto clusModuleStartView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( + clusters_d.clusModuleStart(), gpuClustering::MaxNumModules + 1); const auto clusModuleStartLastElement = AlpakaDeviceSubView(clusModuleStartView, 1u, gpuClustering::MaxNumModules); // slice on host - auto nModules_Clusters_1_h{cms::alpakatools::allocHostBuf(1u)}; + auto nModules_Clusters_1_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(1u)}; auto p_nModules_Clusters_1_h = alpaka::getPtrNative(nModules_Clusters_1_h); alpaka::memcpy(queue, nModules_Clusters_1_h, clusModuleStartLastElement, 1u); diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h index 9891eb85f..27415c1bb 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.h @@ -170,10 +170,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }; SiPixelRawToClusterGPUKernel() - : nModules_Clusters_h{cms::alpakatools::allocHostBuf(2u)}, + : nModules_Clusters_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(2u)}, digis_d{SiPixelDigisAlpaka(0u)}, - clusters_d{SiPixelClustersAlpaka(0u)}, - digiErrors_d{SiPixelDigiErrorsAlpaka(0u, PixelFormatterErrors())} {}; + clusters_d{SiPixelClustersAlpaka(0u)} {}; + ~SiPixelRawToClusterGPUKernel() = default; SiPixelRawToClusterGPUKernel(const SiPixelRawToClusterGPUKernel&) = delete; @@ -201,7 +201,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return std::make_pair(std::move(digis_d), std::move(clusters_d)); } - SiPixelDigiErrorsAlpaka&& getErrors() { return std::move(digiErrors_d); } + SiPixelDigiErrorsAlpaka&& getErrors() { return std::move(*digiErrors_d); } private: uint32_t nDigis = 0; @@ -210,7 +210,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { AlpakaHostBuf nModules_Clusters_h; SiPixelDigisAlpaka digis_d; SiPixelClustersAlpaka clusters_d; - SiPixelDigiErrorsAlpaka digiErrors_d; + std::optional digiErrors_d; }; // see RecoLocalTracker/SiPixelClusterizer diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuCalibPixel.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuCalibPixel.h index e005c59a9..7543e2b7f 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuCalibPixel.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuCalibPixel.h @@ -44,33 +44,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { clusModuleStart[0] = moduleStart[0] = 0; } - cms::alpakatools::for_each_element_in_grid_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( acc, gpuClustering::MaxNumModules, [&](uint32_t i) { nClustersInModule[i] = 0; }); - cms::alpakatools::for_each_element_in_grid_strided(acc, numElements, [&](uint32_t 
i) { - if (id[i] != InvId) { - float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; - float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, numElements, [&](uint32_t i) { + if (id[i] != InvId) { + float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f; + float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0; - bool isDeadColumn = false, isNoisyColumn = false; + bool isDeadColumn = false, isNoisyColumn = false; - int row = x[i]; - int col = y[i]; - auto ret = SiPixelGainForHLTonGPU::getPedAndGain( - v_pedestals, rangeAndCols, fields, id[i], col, row, isDeadColumn, isNoisyColumn); - float pedestal = ret.first; - float gain = ret.second; - // float pedestal = 0; float gain = 1.; - if (isDeadColumn | isNoisyColumn) { - id[i] = InvId; - adc[i] = 0; - printf("bad pixel at %d in %d\n", i, id[i]); - } else { - float vcal = adc[i] * gain - pedestal * gain; - adc[i] = std::max(100, int(vcal * conversionFactor + offset)); - } - } - }); + int row = x[i]; + int col = y[i]; + auto ret = SiPixelGainForHLTonGPU::getPedAndGain( + v_pedestals, rangeAndCols, fields, id[i], col, row, isDeadColumn, isNoisyColumn); + float pedestal = ret.first; + float gain = ret.second; + // float pedestal = 0; float gain = 1.; + if (isDeadColumn | isNoisyColumn) { + id[i] = InvId; + adc[i] = 0; + printf("bad pixel at %d in %d\n", i, id[i]); + } else { + float vcal = adc[i] * gain - pedestal * gain; + adc[i] = std::max(100, int(vcal * conversionFactor + offset)); + } + } + }); } }; } // namespace gpuCalibPixel diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClusterChargeCut.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClusterChargeCut.h index f0eb10cb1..c86b879bd 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClusterChargeCut.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClusterChargeCut.h @@ -47,14 +47,14 @@ namespace gpuClustering { // Get thread / CPU element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, firstPixel); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, firstPixel); if (nclus > MaxNumClustersPerModules) { uint32_t firstElementIdx = firstElementIdxNoStride; uint32_t endElementIdx = endElementIdxNoStride; // remove excess FIXME find a way to cut charge first.... 
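The per-pixel calibration in gpuCalibPixel reduces to one affine chain: ADC to VCal through the pedestal and gain returned by getPedAndGain, then VCal to electrons through the Run2 conversion constants, floored at 100. A host-side reference with the constants taken as parameters:

#include <algorithm>
#include <cstdint>

inline uint16_t calibrateADC(uint16_t adc, float gain, float pedestal,
                             float conversionFactor, float offset) {
  float vcal = adc * gain - pedestal * gain;  // ADC -> VCal
  // VCal -> electrons, floored at 100 as in the kernel
  return uint16_t(std::max(100, int(vcal * conversionFactor + offset)));
}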
for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, firstElementIdx, endElementIdx, blockDimension, numElements)) break; if (id[i] == InvId) @@ -80,13 +80,14 @@ namespace gpuClustering { auto& newclusId = alpaka::declareSharedVar(acc); ALPAKA_ASSERT_OFFLOAD(nclus <= MaxNumClustersPerModules); - cms::alpakatools::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { charge[i] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nclus, [&](uint32_t i) { charge[i] = 0; }); alpaka::syncBlockThreads(acc); uint32_t firstElementIdx = firstElementIdxNoStride; uint32_t endElementIdx = endElementIdxNoStride; for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, firstElementIdx, endElementIdx, blockDimension, numElements)) break; if (id[i] == InvId) @@ -98,13 +99,13 @@ namespace gpuClustering { alpaka::syncBlockThreads(acc); auto chargeCut = thisModuleId < 96 ? 2000 : 4000; // move in constants (calib?) - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, nclus, [&](uint32_t i) { newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0; }); alpaka::syncBlockThreads(acc); // renumber auto& ws = alpaka::declareSharedVar(acc); - cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws); + ::cms::alpakatools::blockPrefixScan(acc, newclusId, nclus, ws); ALPAKA_ASSERT_OFFLOAD(nclus >= newclusId[nclus - 1]); @@ -115,7 +116,7 @@ namespace gpuClustering { alpaka::syncBlockThreads(acc); // mark bad cluster again - cms::alpakatools::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, nclus, [&](uint32_t i) { if (0 == ok[i]) newclusId[i] = InvId + 1; }); @@ -125,7 +126,7 @@ namespace gpuClustering { firstElementIdx = firstElementIdxNoStride; endElementIdx = endElementIdxNoStride; for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, firstElementIdx, endElementIdx, blockDimension, numElements)) break; if (id[i] == InvId) diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h index c41f4b993..bf1bd0287 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h @@ -24,21 +24,22 @@ namespace gpuClustering { uint32_t* __restrict__ moduleStart, int32_t* __restrict__ clusterId, const unsigned int numElements) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, numElements, [&](uint32_t i) { - clusterId[i] = i; - if (InvId != id[i]) { - int j = i - 1; - while (j >= 0 and id[j] == InvId) - --j; - if (j < 0 or id[j] != id[i]) { - // boundary... 
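countModules exploits the fact that digis arrive grouped by module, with InvId holes in between: index i opens a new module exactly when the nearest preceding valid pixel carries a different id (or does not exist), and the atomicInc just below hands each such boundary a slot in moduleStart. A serial reference:

#include <cstdint>
#include <vector>

inline std::vector<uint32_t> findModuleStarts(const std::vector<uint16_t>& id, uint16_t InvId) {
  std::vector<uint32_t> starts;
  for (size_t i = 0; i < id.size(); ++i) {
    if (id[i] == InvId)
      continue;                       // skip invalid pixels
    int j = int(i) - 1;
    while (j >= 0 && id[j] == InvId)  // walk back over invalid pixels
      --j;
    if (j < 0 || id[j] != id[i])      // boundary: first valid pixel of a module
      starts.push_back(uint32_t(i));
  }
  return starts;
}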
- auto loc = - alpaka::atomicInc(acc, moduleStart, std::decay_t(MaxNumModules), alpaka::hierarchy::Blocks{}); - - moduleStart[loc + 1] = i; - } - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, numElements, [&](uint32_t i) { + clusterId[i] = i; + if (InvId != id[i]) { + int j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... + auto loc = alpaka::atomicInc( + acc, moduleStart, std::decay_t(MaxNumModules), alpaka::hierarchy::Blocks{}); + + moduleStart[loc + 1] = i; + } + } + }); } }; @@ -81,13 +82,13 @@ namespace gpuClustering { // Get thread / CPU element indices in block. const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, firstPixel); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, firstPixel); uint32_t firstElementIdx = firstElementIdxNoStride; uint32_t endElementIdx = endElementIdxNoStride; // skip threads not associated to an existing pixel for (uint32_t i = firstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, firstElementIdx, endElementIdx, blockDimension, numElements)) break; if (id[i] == InvId) // skip invalid pixels @@ -101,11 +102,12 @@ namespace gpuClustering { //init hist (ymax=416 < 512 : 9bits) constexpr uint32_t maxPixInModule = 4000; constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); auto& ws = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_OFFLOAD((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId))); @@ -128,16 +130,17 @@ namespace gpuClustering { #endif // fill histo - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (id[i] != InvId) { // skip invalid pixels - hist.count(acc, y[i]); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.count(acc, y[i]); #ifdef GPU_DEBUG - alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); #endif - } - }); + } + }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32u, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, 32u, [&](uint32_t i) { ws[i] = 0; // used by prefix scan... 
}); alpaka::syncBlockThreads(acc); @@ -149,11 +152,12 @@ namespace gpuClustering { if (threadIdxLocal == 0) printf("histo size %d\n", hist.size()); #endif - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (id[i] != InvId) { // skip invalid pixels - hist.fill(acc, y[i], i - firstPixel); - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + hist.fill(acc, y[i], i - firstPixel); + } + }); // Assume that we can cover the whole module with up to 16 blockDimension-wide iterations // This maxiter value was tuned for GPU, with 256 or 512 threads per block. @@ -195,12 +199,13 @@ namespace gpuClustering { auto& n60 = alpaka::declareSharedVar(acc); n40 = n60 = 0; alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::nbins(), [&](uint32_t j) { - if (hist.size(j) > 60) - alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); - if (hist.size(j) > 40) - alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::nbins(), [&](uint32_t j) { + if (hist.size(j) > 60) + alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); + if (hist.size(j) > 40) + alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); + }); alpaka::syncBlockThreads(acc); if (0 == threadIdxLocal) { if (n60 > 0) @@ -213,30 +218,31 @@ namespace gpuClustering { // fill NN uint32_t k = 0u; - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - const uint32_t jEquivalentClass = j % threadDimension; - k = j / blockDimension; - ALPAKA_ASSERT_OFFLOAD(k < maxiter); - auto p = hist.begin() + j; - auto i = *p + firstPixel; - ALPAKA_ASSERT_OFFLOAD(id[i] != InvId); - ALPAKA_ASSERT_OFFLOAD(id[i] == thisModuleId); // same module - int be = Hist::bin(y[i] + 1); - auto e = hist.end(be); - ++p; - ALPAKA_ASSERT_OFFLOAD(0 == nnn[k][jEquivalentClass]); - for (; p < e; ++p) { - auto m = (*p) + firstPixel; - ALPAKA_ASSERT_OFFLOAD(m != i); - ALPAKA_ASSERT_OFFLOAD(int(y[m]) - int(y[i]) >= 0); - ALPAKA_ASSERT_OFFLOAD(int(y[m]) - int(y[i]) <= 1); - if (std::abs(int(x[m]) - int(x[i])) <= 1) { - auto l = nnn[k][jEquivalentClass]++; - ALPAKA_ASSERT_OFFLOAD(l < maxNeighbours); - nn[k][jEquivalentClass][l] = *p; - } - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size(), [&](uint32_t j) { + const uint32_t jEquivalentClass = j % threadDimension; + k = j / blockDimension; + ALPAKA_ASSERT_OFFLOAD(k < maxiter); + auto p = hist.begin() + j; + auto i = *p + firstPixel; + ALPAKA_ASSERT_OFFLOAD(id[i] != InvId); + ALPAKA_ASSERT_OFFLOAD(id[i] == thisModuleId); // same module + int be = Hist::bin(y[i] + 1); + auto e = hist.end(be); + ++p; + ALPAKA_ASSERT_OFFLOAD(0 == nnn[k][jEquivalentClass]); + for (; p < e; ++p) { + auto m = (*p) + firstPixel; + ALPAKA_ASSERT_OFFLOAD(m != i); + ALPAKA_ASSERT_OFFLOAD(int(y[m]) - int(y[i]) >= 0); + ALPAKA_ASSERT_OFFLOAD(int(y[m]) - int(y[i]) <= 1); + if (std::abs(int(x[m]) - int(x[i])) <= 1) { + auto l = nnn[k][jEquivalentClass]++; + ALPAKA_ASSERT_OFFLOAD(l < maxNeighbours); + nn[k][jEquivalentClass][l] = *p; + } + } + }); // for each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; @@ -246,34 +252,36 
@@ namespace gpuClustering { int nloops = 0; while (alpaka::syncBlockThreadsPredicate(acc, more)) { if (1 == nloops % 2) { - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - auto p = hist.begin() + j; - auto i = *p + firstPixel; - auto m = clusterId[i]; - while (m != clusterId[m]) - m = clusterId[m]; - clusterId[i] = m; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size(), [&](uint32_t j) { + auto p = hist.begin() + j; + auto i = *p + firstPixel; + auto m = clusterId[i]; + while (m != clusterId[m]) + m = clusterId[m]; + clusterId[i] = m; + }); } else { more = false; uint32_t k = 0u; - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t j) { - k = j / blockDimension; - const uint32_t jEquivalentClass = j % threadDimension; - auto p = hist.begin() + j; - auto i = *p + firstPixel; - for (int kk = 0; kk < nnn[k][jEquivalentClass]; ++kk) { - auto l = nn[k][jEquivalentClass][kk]; - auto m = l + firstPixel; - ALPAKA_ASSERT_OFFLOAD(m != i); - auto old = alpaka::atomicMin(acc, &clusterId[m], clusterId[i], alpaka::hierarchy::Blocks{}); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - alpaka::atomicMin(acc, &clusterId[i], old, alpaka::hierarchy::Blocks{}); - } // nnloop - }); // pixel loop + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size(), [&](uint32_t j) { + k = j / blockDimension; + const uint32_t jEquivalentClass = j % threadDimension; + auto p = hist.begin() + j; + auto i = *p + firstPixel; + for (int kk = 0; kk < nnn[k][jEquivalentClass]; ++kk) { + auto l = nn[k][jEquivalentClass][kk]; + auto m = l + firstPixel; + ALPAKA_ASSERT_OFFLOAD(m != i); + auto old = alpaka::atomicMin(acc, &clusterId[m], clusterId[i], alpaka::hierarchy::Blocks{}); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + more = true; + } + alpaka::atomicMin(acc, &clusterId[i], old, alpaka::hierarchy::Blocks{}); + } // nnloop + }); // pixel loop } ++nloops; } // end while @@ -300,43 +308,46 @@ namespace gpuClustering { // find the number of different clusters, identified by a pixels with clus[i] == i; // mark these pixels with a negative id. - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (id[i] != InvId) { // skip invalid pixels - if (clusterId[i] == static_cast(i)) { - auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Blocks{}); - clusterId[i] = -(old + 1); - } - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] == static_cast(i)) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Blocks{}); + clusterId[i] = -(old + 1); + } + } + }); alpaka::syncBlockThreads(acc); // propagate the negative id to all the pixels in the cluster. 
- cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (id[i] != InvId) { // skip invalid pixels - if (clusterId[i] >= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; - } - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] != InvId) { // skip invalid pixels + if (clusterId[i] >= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + }); alpaka::syncBlockThreads(acc); // adjust the cluster id to be a positive value starting from 0 - cms::alpakatools::for_each_element_in_block_strided(acc, msize, firstPixel, [&](uint32_t i) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - } else { - clusterId[i] = -clusterId[i] - 1; - } - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, msize, firstPixel, [&](uint32_t i) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + } else { + clusterId[i] = -clusterId[i] - 1; + } + }); alpaka::syncBlockThreads(acc); if (threadIdxLocal == 0) { nClustersInModule[thisModuleId] = foundClusters; moduleId[blockIdx] = thisModuleId; #ifdef GPU_DEBUG - if (foundClusters > ALPAKA_ACCELERATOR_NAMESPACE::gMaxHit) { - ALPAKA_ACCELERATOR_NAMESPACE::gMaxHit = foundClusters; + if (foundClusters > ::gpuClustering::ALPAKA_ACCELERATOR_NAMESPACE::gMaxHit) { + ::gpuClustering::ALPAKA_ACCELERATOR_NAMESPACE::gMaxHit = foundClusters; if (foundClusters > 8) printf("max hit %d in %d\n", foundClusters, thisModuleId); } diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc index b94b76b22..59ea310d8 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelCPEFastESProducer.cc @@ -27,8 +27,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { pixelCPEforGPU::CommonParams commonParams; in.read(reinterpret_cast(&commonParams), sizeof(pixelCPEforGPU::CommonParams)); - auto commonParams_h{cms::alpakatools::createHostView(&commonParams, 1u)}; - auto commonParams_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto commonParams_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + &commonParams, 1u)}; + auto commonParams_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(commonParams_d); alpaka::memcpy(queue, commonParams_d, commonParams_h, 1u); unsigned int ndetParams; @@ -37,20 +40,31 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { //detParams.resize(ndetParams); std::vector detParams(ndetParams); in.read(reinterpret_cast(detParams.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); - auto detParams_h{cms::alpakatools::createHostView(detParams.data(), ndetParams)}; - auto detParams_d{cms::alpakatools::allocDeviceBuf(ndetParams)}; + auto detParams_h{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + detParams.data(), ndetParams)}; + auto detParams_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(ndetParams)}; + alpaka::prepareForAsyncCopy(detParams_d); alpaka::memcpy(queue, detParams_d, detParams_h, ndetParams); pixelCPEforGPU::AverageGeometry averageGeometry; in.read(reinterpret_cast(&averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); - auto 
averageGeometry_h{cms::alpakatools::createHostView(&averageGeometry, 1u)}; - auto averageGeometry_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto averageGeometry_h{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView( + &averageGeometry, 1u)}; + auto averageGeometry_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(averageGeometry_d); alpaka::memcpy(queue, averageGeometry_d, averageGeometry_h, 1u); pixelCPEforGPU::LayerGeometry layerGeometry; in.read(reinterpret_cast(&layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); - auto layerGeometry_h{cms::alpakatools::createHostView(&layerGeometry, 1u)}; - auto layerGeometry_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto layerGeometry_h{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(&layerGeometry, + 1u)}; + auto layerGeometry_d{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(layerGeometry_d); alpaka::memcpy(queue, layerGeometry_d, layerGeometry_h, 1u); pixelCPEforGPU::ParamsOnGPU params; @@ -58,8 +72,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { params.m_detParams = alpaka::getPtrNative(detParams_d); params.m_layerGeometry = alpaka::getPtrNative(layerGeometry_d); params.m_averageGeometry = alpaka::getPtrNative(averageGeometry_d); - auto params_h{cms::alpakatools::createHostView(¶ms, 1u)}; - auto params_d{cms::alpakatools::allocDeviceBuf(1u)}; + auto params_h{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createHostView(¶ms, 1u)}; + auto params_d{::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocDeviceBuf(1u)}; + alpaka::prepareForAsyncCopy(params_d); alpaka::memcpy(queue, params_d, params_h, 1u); alpaka::wait(queue); diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.cc index a640f0bdf..cc1fc92a5 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.cc @@ -3,47 +3,45 @@ #include "PixelRecHits.h" #include "gpuPixelRecHits.h" -namespace { - struct setHitsLayerStart { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, - uint32_t const* __restrict__ hitsModuleStart, - pixelCPEforGPU::ParamsOnGPU const* cpeParams, - uint32_t* hitsLayerStart) const { - ALPAKA_ASSERT_OFFLOAD(0 == hitsModuleStart[0]); +namespace ALPAKA_ACCELERATOR_NAMESPACE { - cms::alpakatools::for_each_element_in_grid(acc, 11, [&](uint32_t i) { - hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; + namespace { + struct setHitsLayerStart { + template + ALPAKA_FN_ACC void operator()(const T_Acc& acc, + uint32_t const* __restrict__ hitsModuleStart, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t* hitsLayerStart) const { + ALPAKA_ASSERT_OFFLOAD(0 == hitsModuleStart[0]); + + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(acc, 11, [&](uint32_t i) { + hitsLayerStart[i] = hitsModuleStart[cpeParams->layerGeometry().layerStart[i]]; #ifdef GPU_DEBUG - printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); + printf("LayerStart %d %d: %d\n", i, cpeParams->layerGeometry().layerStart[i], hitsLayerStart[i]); #endif - }); - } - }; -} // namespace - -namespace ALPAKA_ACCELERATOR_NAMESPACE { + }); + } + }; + } // namespace namespace pixelgpudetails { TrackingRecHit2DAlpaka PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisAlpaka const& digis_d, SiPixelClustersAlpaka const& 
clusters_d, BeamSpotAlpaka const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams) const { + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + Queue& queue) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DAlpaka hits_d(nHits, cpeParams, clusters_d.clusModuleStart()); + TrackingRecHit2DAlpaka hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), queue); const int threadsPerBlockOrElementsPerThread = 128; const int blocks = digis_d.nModules(); // active modules (with digis) - const WorkDiv1D& getHitsWorkDiv = - cms::alpakatools::make_workdiv(Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); + const WorkDiv1D& getHitsWorkDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + Vec1D::all(blocks), Vec1D::all(threadsPerBlockOrElementsPerThread)); #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif - - Queue queue(device); - if (blocks) { // protect from empty events alpaka::enqueue(queue, alpaka::createTaskKernel(getHitsWorkDiv, @@ -62,7 +60,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // assuming full warp of threads is better than a smaller number... if (nHits) { - const WorkDiv1D& oneBlockWorkDiv = cms::alpakatools::make_workdiv(Vec1D::all(1u), Vec1D::all(32u)); + const WorkDiv1D& oneBlockWorkDiv = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(Vec1D::all(1u), Vec1D::all(32u)); alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -70,15 +69,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } if (nHits) { - cms::alpakatools::fillManyFromVector( + ::cms::alpakatools::fillManyFromVector( hits_d.phiBinner(), 10, hits_d.c_iphi(), hits_d.c_hitsLayerStart(), nHits, 256, queue); } - //#ifdef GPU_DEBUG - //alpaka::wait(queue); - //#endif - +#ifdef GPU_DEBUG alpaka::wait(queue); +#endif + return hits_d; } diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.h b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.h index 23f3494b9..788d1f73c 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.h +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/PixelRecHits.h @@ -26,7 +26,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { TrackingRecHit2DAlpaka makeHitsAsync(SiPixelDigisAlpaka const& digis_d, SiPixelClustersAlpaka const& clusters_d, BeamSpotAlpaka const& bs_d, - pixelCPEforGPU::ParamsOnGPU const* cpeParams) const; + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + Queue& queue) const; }; } // namespace pixelgpudetails diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc index 01472afdd..7a28ddfba 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/SiPixelRecHitAlpaka.cc @@ -1,3 +1,4 @@ + #include "AlpakaDataFormats/BeamSpotAlpaka.h" #include "AlpakaDataFormats/SiPixelClustersAlpaka.h" #include "AlpakaDataFormats/SiPixelDigisAlpaka.h" @@ -7,6 +8,8 @@ #include "Framework/PluginFactory.h" #include "Framework/EDProducer.h" #include "CondFormats/PixelCPEFast.h" +#include "Framework/PluginFactory.h" +#include "AlpakaCore/ScopedContext.h" #include "PixelRecHits.h" // TODO : spit product from kernel @@ -51,7 +54,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // TO DO: Async: Would need to add a queue as a parameter, not async for now! 
- iEvent.emplace(tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params())); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::ScopedContextProduce ctx{::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent.streamID()}; + ctx.emplace(::ALPAKA_ACCELERATOR_NAMESPACE::device, + iEvent, + tokenHit_, + gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.params(), ctx.stream())); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/src/alpaka/plugin-SiPixelRecHits/alpaka/gpuPixelRecHits.h b/src/alpaka/plugin-SiPixelRecHits/alpaka/gpuPixelRecHits.h index 554b18a2a..8297b2747 100644 --- a/src/alpaka/plugin-SiPixelRecHits/alpaka/gpuPixelRecHits.h +++ b/src/alpaka/plugin-SiPixelRecHits/alpaka/gpuPixelRecHits.h @@ -6,10 +6,9 @@ #include #include "AlpakaCore/alpakaKernelCommon.h" - -#include "CondFormats/pixelCPEforGPU.h" #include "AlpakaDataFormats/BeamSpotAlpaka.h" #include "AlpakaDataFormats/TrackingRecHit2DAlpaka.h" +#include "CondFormats/pixelCPEforGPU.h" #include "DataFormats/approx_atan2.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -43,18 +42,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto& agc = hits.averageGeometry(); auto const& ag = cpeParams->averageGeometry(); constexpr auto numberOfLaddersInBarrel = TrackingRecHit2DSOAView::AverageGeometry::numberOfLaddersInBarrel; - cms::alpakatools::for_each_element_in_block_strided(acc, numberOfLaddersInBarrel, [&](uint32_t il) { - agc.ladderZ[il] = ag.ladderZ[il] - bs->z; - agc.ladderX[il] = ag.ladderX[il] - bs->x; - agc.ladderY[il] = ag.ladderY[il] - bs->y; - agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]); - agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z; - agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, numberOfLaddersInBarrel, [&](uint32_t il) { + agc.ladderZ[il] = ag.ladderZ[il] - bs->z; + agc.ladderX[il] = ag.ladderX[il] - bs->x; + agc.ladderY[il] = ag.ladderY[il] - bs->y; + agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]); + agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z; + agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z; + }); if (threadIdxLocal == 0) { agc.endCapZ[0] = ag.endCapZ[0] - bs->z; agc.endCapZ[1] = ag.endCapZ[1] - bs->z; - // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]); + //printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]); } } @@ -100,17 +100,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus)); // init - cms::alpakatools::for_each_element_in_block_strided(acc, nClusInIter, [&](uint32_t ic) { - clusParams.minRow[ic] = std::numeric_limits::max(); - clusParams.maxRow[ic] = 0; - clusParams.minCol[ic] = std::numeric_limits::max(); - clusParams.maxCol[ic] = 0; - clusParams.charge[ic] = 0; - clusParams.Q_f_X[ic] = 0; - clusParams.Q_l_X[ic] = 0; - clusParams.Q_f_Y[ic] = 0; - clusParams.Q_l_Y[ic] = 0; - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nClusInIter, [&](uint32_t ic) { + clusParams.minRow[ic] = std::numeric_limits::max(); + clusParams.maxRow[ic] = 0; + clusParams.minCol[ic] = std::numeric_limits::max(); + clusParams.maxCol[ic] = 0; + clusParams.charge[ic] = 0; + clusParams.Q_f_X[ic] = 0; + clusParams.Q_l_X[ic] = 0; + clusParams.Q_f_Y[ic] = 0; + clusParams.Q_l_Y[ic] = 0; + }); alpaka::syncBlockThreads(acc); @@ -118,11 +119,11 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE { const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); const auto& [firstElementIdxNoStride, endElementIdxNoStride] = - cms::alpakatools::element_index_range_in_block(acc, first); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::element_index_range_in_block(acc, first); uint32_t rowsColsFirstElementIdx = firstElementIdxNoStride; uint32_t rowsColsEndElementIdx = endElementIdxNoStride; for (uint32_t i = rowsColsFirstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, rowsColsFirstElementIdx, rowsColsEndElementIdx, blockDimension, numElements)) break; auto id = digis.moduleInd(i); @@ -152,7 +153,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { uint32_t chargeFirstElementIdx = firstElementIdxNoStride; uint32_t chargeEndElementIdx = endElementIdxNoStride; for (uint32_t i = chargeFirstElementIdx; i < numElements; ++i) { - if (!cms::alpakatools::next_valid_element_index_strided( + if (not ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::next_valid_element_index_strided( i, chargeFirstElementIdx, chargeEndElementIdx, blockDimension, numElements)) break; auto id = digis.moduleInd(i); @@ -186,51 +187,52 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { first = clusters.clusModuleStart(me) + startClus; - cms::alpakatools::for_each_element_in_block_strided(acc, nClusInIter, [&](uint32_t ic) { - auto h = first + ic; // output index in global memory + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, nClusInIter, [&](uint32_t ic) { + auto h = first + ic; // output index in global memory - // this cannot happen anymore - // TODO: was 'break', OTOH comment above says "should not happen", so hopefully 'return' is ok - if (h >= TrackingRecHit2DSOAView::maxHits()) - return; // overflow... - ALPAKA_ASSERT_OFFLOAD(h < hits.nHits()); - ALPAKA_ASSERT_OFFLOAD(h < clusters.clusModuleStart(me + 1)); + // this cannot happen anymore + // TODO: was 'break', OTOH comment above says "should not happen", so hopefully 'return' is ok + if (h >= TrackingRecHit2DSOAView::maxHits()) + return; // overflow... 
+ ALPAKA_ASSERT_OFFLOAD(h < hits.nHits()); + ALPAKA_ASSERT_OFFLOAD(h < clusters.clusModuleStart(me + 1)); - pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); + pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); - // store it + // store it - hits.charge(h) = clusParams.charge[ic]; + hits.charge(h) = clusParams.charge[ic]; - hits.detectorIndex(h) = me; + hits.detectorIndex(h) = me; - float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + float xl, yl; + hits.xLocal(h) = xl = clusParams.xpos[ic]; + hits.yLocal(h) = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits.clusterSizeX(h) = clusParams.xsize[ic]; + hits.clusterSizeY(h) = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; + hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; + hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; - // keep it local for computations - float xg, yg, zg; - // to global and compute phi... - cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); - // here correct for the beamspot... - xg -= bs->x; - yg -= bs->y; - zg -= bs->z; + // keep it local for computations + float xg, yg, zg; + // to global and compute phi... + cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg); + // here correct for the beamspot... + xg -= bs->x; + yg -= bs->y; + zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits.xGlobal(h) = xg; + hits.yGlobal(h) = yg; + hits.zGlobal(h) = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); - }); + hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); + hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + }); alpaka::syncBlockThreads(acc); } // end loop on batches diff --git a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc index 5e9eb8ac7..a7e18e1fb 100644 --- a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc +++ b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc @@ -124,9 +124,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto const h_adcBuf = digis.adcToHostAsync(queue); auto const h_adc = alpaka::getPtrNative(h_adcBuf); - auto const d_clusInModuleView = - cms::alpakatools::createDeviceView(clusters.clusInModule(), gpuClustering::MaxNumModules); - auto h_clusInModuleBuf{cms::alpakatools::allocHostBuf(gpuClustering::MaxNumModules)}; + auto const d_clusInModuleView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView( + clusters.clusInModule(), gpuClustering::MaxNumModules); + auto h_clusInModuleBuf{ + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::allocHostBuf(gpuClustering::MaxNumModules)}; alpaka::memcpy(queue, h_clusInModuleBuf, d_clusInModuleView, gpuClustering::MaxNumModules); auto h_clusInModule = alpaka::getPtrNative(h_clusInModuleBuf); diff --git a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc index 923b7f750..eafa2febe 100644 --- a/src/alpaka/test/alpaka/AtomicPairCounter_t.cc +++ b/src/alpaka/test/alpaka/AtomicPairCounter_t.cc @@ -10,8 
+10,8 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; struct update { template ALPAKA_FN_ACC void operator()( - const T_Acc &acc, cms::alpakatools::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid(acc, n, [&](uint32_t i) { + const T_Acc &acc, ::cms::alpakatools::AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(acc, n, [&](uint32_t i) { auto m = i % 11; m = m % 6 + 1; // max 6, no 0 auto c = dc->add(acc, m); @@ -26,7 +26,7 @@ struct update { struct finalize { template ALPAKA_FN_ACC void operator()(const T_Acc &acc, - cms::alpakatools::AtomicPairCounter const *dc, + ::cms::alpakatools::AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const { @@ -38,11 +38,11 @@ struct finalize { struct verify { template ALPAKA_FN_ACC void operator()(const T_Acc &acc, - cms::alpakatools::AtomicPairCounter const *dc, + ::cms::alpakatools::AtomicPairCounter const *dc, uint32_t const *ind, uint32_t const *cont, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid(acc, n, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(acc, n, [&](uint32_t i) { assert(0 == ind[0]); assert(dc->get().m == n); assert(ind[n] == dc->get().n); @@ -65,10 +65,10 @@ int main() { constexpr uint32_t C = 1; const Vec1D sizeC(C); - auto c_dbuf = alpaka::allocBuf(device, sizeC); + auto c_dbuf = alpaka::allocBuf<::cms::alpakatools::AtomicPairCounter, Idx>(device, sizeC); alpaka::memset(queue, c_dbuf, 0, sizeC); - std::cout << "size " << C * sizeof(cms::alpakatools::AtomicPairCounter) << std::endl; + std::cout << "size " << C * sizeof(::cms::alpakatools::AtomicPairCounter) << std::endl; constexpr uint32_t N = 20000; constexpr uint32_t M = N * 6; @@ -82,7 +82,8 @@ int main() { // Update const Vec1D &blocksPerGrid(Vec1D(2000u)); const Vec1D &threadsPerBlockOrElementsPerThread(Vec1D(512u)); - const WorkDiv1D &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + const WorkDiv1D &workDiv = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, update(), @@ -94,8 +95,8 @@ int main() { // Finalize const Vec1D &blocksPerGridFinalize(Vec1D(1u)); const Vec1D &threadsPerBlockOrElementsPerThreadFinalize(Vec1D(1u)); - const WorkDiv1D &workDivFinalize = - cms::alpakatools::make_workdiv(blocksPerGridFinalize, threadsPerBlockOrElementsPerThreadFinalize); + const WorkDiv1D &workDivFinalize = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGridFinalize, threadsPerBlockOrElementsPerThreadFinalize); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivFinalize, finalize(), @@ -113,7 +114,7 @@ int main() { alpaka::getPtrNative(m_dbuf), NUM_VALUES)); - auto c_hbuf = alpaka::allocBuf(host, sizeC); + auto c_hbuf = alpaka::allocBuf<::cms::alpakatools::AtomicPairCounter, Idx>(host, sizeC); alpaka::memcpy(queue, c_hbuf, c_dbuf, sizeC); alpaka::wait(queue); diff --git a/src/alpaka/test/alpaka/HistoContainer_t.cc b/src/alpaka/test/alpaka/HistoContainer_t.cc index fbbfdad8d..5dcd2cf62 100644 --- a/src/alpaka/test/alpaka/HistoContainer_t.cc +++ b/src/alpaka/test/alpaka/HistoContainer_t.cc @@ -10,8 +10,8 @@ template void go(const DevHost& host, - const ALPAKA_ACCELERATOR_NAMESPACE::Device& device, - ALPAKA_ACCELERATOR_NAMESPACE::Queue& 
queue) { + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device& device, + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { std::mt19937 eng; std::uniform_int_distribution rgen(std::numeric_limits::min(), std::numeric_limits::max()); @@ -24,7 +24,7 @@ void go(const DevHost& host, constexpr uint32_t nParts = 10; constexpr uint32_t partSize = N / nParts; - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' ' << (std::numeric_limits::max() - std::numeric_limits::min()) / Hist::nbins() << std::endl; @@ -163,8 +163,9 @@ void go(const DevHost& host, int main() { const DevHost host(alpaka::getDevByIdx(0u)); - const ALPAKA_ACCELERATOR_NAMESPACE::Device device(alpaka::getDevByIdx(0u)); - ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device device( + alpaka::getDevByIdx<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(0u)); + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); go(host, device, queue); diff --git a/src/alpaka/test/alpaka/OneHistoContainer_t.cc b/src/alpaka/test/alpaka/OneHistoContainer_t.cc index 89ddcf6f3..f49ee92cf 100644 --- a/src/alpaka/test/alpaka/OneHistoContainer_t.cc +++ b/src/alpaka/test/alpaka/OneHistoContainer_t.cc @@ -18,21 +18,24 @@ struct mykernel { printf("start kernel for %d data\n", N); } - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); auto& ws = alpaka::declareSharedVar(acc); // set off zero - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; }); alpaka::syncBlockThreads(acc); // set bins zero - cms::alpakatools::for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.bins[j] = 0; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, Hist::totbins(), [&](uint32_t j) { hist.bins[j] = 0; }); alpaka::syncBlockThreads(acc); // count - cms::alpakatools::for_each_element_in_block_strided(acc, N, [&](uint32_t j) { hist.count(acc, v[j]); }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, N, [&](uint32_t j) { hist.count(acc, v[j]); }); alpaka::syncBlockThreads(acc); assert(0 == hist.size()); @@ -45,17 +48,18 @@ struct mykernel { assert(N == hist.size()); // verify - cms::alpakatools::for_each_element_in_block_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( acc, Hist::nbins(), [&](uint32_t j) { assert(hist.off[j] <= hist.off[j + 1]); }); alpaka::syncBlockThreads(acc); - cms::alpakatools::for_each_element_in_block(acc, 32, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block(acc, 32, [&](uint32_t i) { ws[i] = 0; // used by prefix scan... 
}); alpaka::syncBlockThreads(acc); // fill - cms::alpakatools::for_each_element_in_block_strided(acc, N, [&](uint32_t j) { hist.fill(acc, v[j], j); }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, N, [&](uint32_t j) { hist.fill(acc, v[j], j); }); alpaka::syncBlockThreads(acc); assert(0 == hist.off[0]); @@ -63,56 +67,58 @@ struct mykernel { // bin #ifndef NDEBUG - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size() - 1, [&](uint32_t j) { - auto p = hist.begin() + j; - assert((*p) < N); - auto k1 = Hist::bin(v[*p]); - auto k2 = Hist::bin(v[*(p + 1)]); - assert(k2 >= k1); - }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size() - 1, [&](uint32_t j) { + auto p = hist.begin() + j; + assert((*p) < N); + auto k1 = Hist::bin(v[*p]); + auto k2 = Hist::bin(v[*(p + 1)]); + assert(k2 >= k1); + }); #endif // forEachInWindow - cms::alpakatools::for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t i) { - auto p = hist.begin() + i; - auto j = *p; + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, hist.size(), [&](uint32_t i) { + auto p = hist.begin() + i; + auto j = *p; #ifndef NDEBUG - auto b0 = Hist::bin(v[j]); + auto b0 = Hist::bin(v[j]); #endif - int tot = 0; - auto ftest = [&](unsigned int k) { - assert(k < N); - ++tot; - }; - cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); + int tot = 0; + auto ftest = [&](unsigned int k) { + assert(k < N); + ++tot; + }; + ::cms::alpakatools::forEachInWindow(hist, v[j], v[j], ftest); #ifndef NDEBUG - int rtot = hist.size(b0); - assert(tot == rtot); + int rtot = hist.size(b0); + assert(tot == rtot); #endif - tot = 0; - auto vm = int(v[j]) - DELTA; - auto vp = int(v[j]) + DELTA; - constexpr int vmax = NBINS != 128 ? NBINS * 2 - 1 : std::numeric_limits::max(); - vm = std::max(vm, 0); - vm = std::min(vm, vmax); - vp = std::min(vp, vmax); - vp = std::max(vp, 0); - assert(vp >= vm); - cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); + tot = 0; + auto vm = int(v[j]) - DELTA; + auto vp = int(v[j]) + DELTA; + constexpr int vmax = NBINS != 128 ? 
NBINS * 2 - 1 : std::numeric_limits::max(); + vm = std::max(vm, 0); + vm = std::min(vm, vmax); + vp = std::min(vp, vmax); + vp = std::max(vp, 0); + assert(vp >= vm); + ::cms::alpakatools::forEachInWindow(hist, vm, vp, ftest); #ifndef NDEBUG - int bp = Hist::bin(vp); - int bm = Hist::bin(vm); - rtot = hist.end(bp) - hist.begin(bm); - assert(tot == rtot); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp) - hist.begin(bm); + assert(tot == rtot); #endif - }); + }); } }; template void go(const DevHost& host, - const ALPAKA_ACCELERATOR_NAMESPACE::Device& device, - ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device& device, + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue& queue) { std::mt19937 eng; int rmin = std::numeric_limits::min(); @@ -125,7 +131,7 @@ void go(const DevHost& host, std::uniform_int_distribution rgen(rmin, rmax); constexpr unsigned int N = 12000; - using Hist = cms::alpakatools::HistoContainer; + using Hist = ::cms::alpakatools::HistoContainer; std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax - rmin) / Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; @@ -145,9 +151,10 @@ void go(const DevHost& host, const Vec1D& threadsPerBlockOrElementsPerThread(Vec1D::all(256)); const Vec1D& blocksPerGrid(Vec1D::all(1)); - const WorkDiv1D& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + const WorkDiv1D& workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDiv, mykernel(), alpaka::getPtrNative(v_dbuf), N)); } alpaka::wait(queue); @@ -155,8 +162,9 @@ void go(const DevHost& host, int main() { const DevHost host(alpaka::getDevByIdx(0u)); - const ALPAKA_ACCELERATOR_NAMESPACE::Device device(alpaka::getDevByIdx(0u)); - ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device device( + alpaka::getDevByIdx<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(0u)); + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); go(host, device, queue); go(host, device, queue); diff --git a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc index 7c6874133..56de417e0 100644 --- a/src/alpaka/test/alpaka/OneToManyAssoc_t.cc +++ b/src/alpaka/test/alpaka/OneToManyAssoc_t.cc @@ -12,9 +12,9 @@ constexpr uint32_t MaxElem = 64000; constexpr uint32_t MaxTk = 8000; constexpr uint32_t MaxAssocs = 4 * MaxTk; -using Assoc = cms::alpakatools::OneToManyAssoc; -using SmallAssoc = cms::alpakatools::OneToManyAssoc; -using Multiplicity = cms::alpakatools::OneToManyAssoc; +using Assoc = ::cms::alpakatools::OneToManyAssoc; +using SmallAssoc = ::cms::alpakatools::OneToManyAssoc; +using Multiplicity = ::cms::alpakatools::OneToManyAssoc; using TK = std::array; struct countMultiLocal { @@ -23,7 +23,7 @@ struct countMultiLocal { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, n, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, n, [&](uint32_t i) { auto& local = alpaka::declareSharedVar(acc); const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); const bool 
oncePerSharedMemoryAccess = (threadIdxLocal == 0); @@ -46,14 +46,15 @@ struct countMulti { TK const* __restrict__ tk, Multiplicity* __restrict__ assoc, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, n, [&](uint32_t i) { assoc->countDirect(acc, 2 + i % 4); }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( + acc, n, [&](uint32_t i) { assoc->countDirect(acc, 2 + i % 4); }); } }; struct verifyMulti { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const { - cms::alpakatools::for_each_element_in_grid_strided( + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided( acc, Multiplicity::totbins(), [&](uint32_t i) { assert(m1->off[i] == m2->off[i]); }); } }; @@ -64,7 +65,7 @@ struct count { TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -84,7 +85,7 @@ struct fill { TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) { auto k = i / 4; auto j = i - 4 * k; assert(j < 4); @@ -108,11 +109,11 @@ struct verify { struct fillBulk { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, - cms::alpakatools::AtomicPairCounter* apc, + ::cms::alpakatools::AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid_strided(acc, n, [&](uint32_t k) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid_strided(acc, n, [&](uint32_t k) { auto m = tk[k][3] < MaxElem ? 
4 : 3; assoc->bulkFill(acc, *apc, &tk[k][0], m); }); @@ -123,7 +124,7 @@ struct verifyBulk { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, Assoc const* __restrict__ assoc, - cms::alpakatools::AtomicPairCounter const* apc) const { + ::cms::alpakatools::AtomicPairCounter const* apc) const { if (apc->get().m >= Assoc::nbins()) { printf("Overflow %d %d\n", apc->get().m, Assoc::nbins()); } @@ -133,8 +134,9 @@ struct verifyBulk { int main() { const DevHost host(alpaka::getDevByIdx(0u)); - const ALPAKA_ACCELERATOR_NAMESPACE::Device device(alpaka::getDevByIdx(0u)); - ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device device( + alpaka::getDevByIdx<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(0u)); + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl; std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' ' @@ -191,23 +193,24 @@ int main() { const Vec1D threadsPerBlockOrElementsPerThread(nThreads); const unsigned int nBlocks4N = (4 * N + nThreads - 1) / nThreads; const Vec1D blocksPerGrid4N(nBlocks4N); - const WorkDiv1D& workDiv4N = cms::alpakatools::make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread); + const WorkDiv1D& workDiv4N = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid4N, threadsPerBlockOrElementsPerThread); launchZero(alpaka::getPtrNative(a_dbuf), queue); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDiv4N, count(), alpaka::getPtrNative(v_dbuf), alpaka::getPtrNative(a_dbuf), N)); - cms::alpakatools::launchFinalize(alpaka::getPtrNative(a_dbuf), queue); + ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(a_dbuf), queue); alpaka::enqueue( queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( WorkDiv1D{Vec1D::all(1u), Vec1D::all(1u), Vec1D::all(1u)}, verify(), alpaka::getPtrNative(a_dbuf))); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDiv4N, fill(), alpaka::getPtrNative(v_dbuf), alpaka::getPtrNative(a_dbuf), N)); auto la_hbuf = alpaka::allocBuf(host, 1u); @@ -233,28 +236,29 @@ int main() { std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl; // now the inverse map (actually this is the direct....) 
- auto dc_dbuf = alpaka::allocBuf(device, 1u); + auto dc_dbuf = alpaka::allocBuf<::cms::alpakatools::AtomicPairCounter, Idx>(device, 1u); alpaka::memset(queue, dc_dbuf, 0, 1u); const unsigned int nBlocks = (N + nThreads - 1) / nThreads; const Vec1D blocksPerGrid(nBlocks); - const WorkDiv1D& workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); + const WorkDiv1D& workDiv = + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv, - fillBulk(), - alpaka::getPtrNative(dc_dbuf), - alpaka::getPtrNative(v_dbuf), - alpaka::getPtrNative(a_dbuf), - N)); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, + fillBulk(), + alpaka::getPtrNative(dc_dbuf), + alpaka::getPtrNative(v_dbuf), + alpaka::getPtrNative(a_dbuf), + N)); alpaka::enqueue( queue, - alpaka::createTaskKernel( - workDiv, cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(dc_dbuf), alpaka::getPtrNative(a_dbuf))); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, ::cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(dc_dbuf), alpaka::getPtrNative(a_dbuf))); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( WorkDiv1D{Vec1D::all(1u), Vec1D::all(1u), Vec1D::all(1u)}, verifyBulk(), alpaka::getPtrNative(a_dbuf), @@ -262,7 +266,7 @@ int main() { alpaka::memcpy(queue, la_hbuf, a_dbuf, 1u); - auto dc_hbuf = alpaka::allocBuf(host, 1u); + auto dc_hbuf = alpaka::allocBuf<::cms::alpakatools::AtomicPairCounter, Idx>(host, 1u); alpaka::memcpy(queue, dc_hbuf, dc_dbuf, 1u); alpaka::wait(queue); auto dc = alpaka::getPtrNative(dc_hbuf); @@ -272,20 +276,20 @@ int main() { alpaka::memset(queue, sa_dbuf, 0, 1u); alpaka::enqueue(queue, - alpaka::createTaskKernel(workDiv, - fillBulk(), - alpaka::getPtrNative(dc_dbuf), - alpaka::getPtrNative(v_dbuf), - alpaka::getPtrNative(sa_dbuf), - N)); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, + fillBulk(), + alpaka::getPtrNative(dc_dbuf), + alpaka::getPtrNative(v_dbuf), + alpaka::getPtrNative(sa_dbuf), + N)); alpaka::enqueue( queue, - alpaka::createTaskKernel( - workDiv, cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(dc_dbuf), alpaka::getPtrNative(sa_dbuf))); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( + workDiv, ::cms::alpakatools::finalizeBulk(), alpaka::getPtrNative(dc_dbuf), alpaka::getPtrNative(sa_dbuf))); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( WorkDiv1D{Vec1D::all(1u), Vec1D::all(1u), Vec1D::all(1u)}, verifyBulk(), alpaka::getPtrNative(sa_dbuf), @@ -318,27 +322,27 @@ int main() { launchZero(alpaka::getPtrNative(m2_dbuf), queue); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDiv4N, countMulti(), alpaka::getPtrNative(v_dbuf), alpaka::getPtrNative(m1_dbuf), N)); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDiv4N, countMultiLocal(), alpaka::getPtrNative(v_dbuf), alpaka::getPtrNative(m2_dbuf), N)); const Vec1D blocksPerGridTotBins(1u); const Vec1D threadsPerBlockOrElementsPerThreadTotBins(Multiplicity::totbins()); - const WorkDiv1D& workDivTotBins = - cms::alpakatools::make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); + const 
WorkDiv1D& workDivTotBins = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDivTotBins, verifyMulti(), alpaka::getPtrNative(m1_dbuf), alpaka::getPtrNative(m2_dbuf))); - cms::alpakatools::launchFinalize(alpaka::getPtrNative(m1_dbuf), queue); - cms::alpakatools::launchFinalize(alpaka::getPtrNative(m2_dbuf), queue); + ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(m1_dbuf), queue); + ::cms::alpakatools::launchFinalize(alpaka::getPtrNative(m2_dbuf), queue); alpaka::enqueue(queue, - alpaka::createTaskKernel( + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>( workDivTotBins, verifyMulti(), alpaka::getPtrNative(m1_dbuf), alpaka::getPtrNative(m2_dbuf))); alpaka::wait(queue); diff --git a/src/alpaka/test/alpaka/clustering_t.cc b/src/alpaka/test/alpaka/clustering_t.cc index 377b58778..2466d32fe 100644 --- a/src/alpaka/test/alpaka/clustering_t.cc +++ b/src/alpaka/test/alpaka/clustering_t.cc @@ -18,8 +18,9 @@ int main(void) { const DevHost host(alpaka::getDevByIdx(0u)); - const ALPAKA_ACCELERATOR_NAMESPACE::Device device(alpaka::getDevByIdx(0u)); - ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); + const ::ALPAKA_ACCELERATOR_NAMESPACE::Device device( + alpaka::getDevByIdx<::ALPAKA_ACCELERATOR_NAMESPACE::Platform>(0u)); + ::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue(device); constexpr unsigned int numElements = 256 * 2000; // these in reality are already on GPU @@ -250,22 +251,22 @@ int main(void) { // COUNT MODULES const int blocksPerGridCountModules = (numElements + threadsPerBlockOrElementsPerThread - 1) / threadsPerBlockOrElementsPerThread; - const WorkDiv1D& workDivCountModules = cms::alpakatools::make_workdiv( + const WorkDiv1D& workDivCountModules = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( Vec1D::all(blocksPerGridCountModules), Vec1D::all(threadsPerBlockOrElementsPerThread)); std::cout << "CUDA countModules kernel launch with " << blocksPerGridCountModules << " blocks of " << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). \n"; alpaka::enqueue( queue, - alpaka::createTaskKernel(workDivCountModules, - gpuClustering::countModules(), - alpaka::getPtrNative(d_id_buf), - alpaka::getPtrNative(d_moduleStart_buf), - alpaka::getPtrNative(d_clus_buf), - n)); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDivCountModules, + gpuClustering::countModules(), + alpaka::getPtrNative(d_id_buf), + alpaka::getPtrNative(d_moduleStart_buf), + alpaka::getPtrNative(d_clus_buf), + n)); // FIND CLUSTER - const WorkDiv1D& workDivMaxNumModules = cms::alpakatools::make_workdiv( + const WorkDiv1D& workDivMaxNumModules = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( Vec1D::all(gpuClustering::MaxNumModules), Vec1D::all(threadsPerBlockOrElementsPerThread)); std::cout << "CUDA findModules kernel launch with " << gpuClustering::MaxNumModules << " blocks of " << threadsPerBlockOrElementsPerThread << " threads (GPU) or elements (CPU). 
\n"; @@ -274,16 +275,16 @@ int main(void) { alpaka::enqueue( queue, - alpaka::createTaskKernel(workDivMaxNumModules, - gpuClustering::findClus(), - alpaka::getPtrNative(d_id_buf), - alpaka::getPtrNative(d_x_buf), - alpaka::getPtrNative(d_y_buf), - alpaka::getPtrNative(d_moduleStart_buf), - alpaka::getPtrNative(d_clusInModule_buf), - alpaka::getPtrNative(d_moduleId_buf), - alpaka::getPtrNative(d_clus_buf), - n)); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDivMaxNumModules, + gpuClustering::findClus(), + alpaka::getPtrNative(d_id_buf), + alpaka::getPtrNative(d_x_buf), + alpaka::getPtrNative(d_y_buf), + alpaka::getPtrNative(d_moduleStart_buf), + alpaka::getPtrNative(d_clusInModule_buf), + alpaka::getPtrNative(d_moduleId_buf), + alpaka::getPtrNative(d_clus_buf), + n)); alpaka::memcpy(queue, h_nModules_buf, d_moduleStart_buf, 1u); auto h_nclus_buf = alpaka::allocBuf(host, gpuClustering::MaxNumModules); @@ -309,15 +310,15 @@ int main(void) { // CLUSTER CHARGE CUT alpaka::enqueue( queue, - alpaka::createTaskKernel(workDivMaxNumModules, - gpuClustering::clusterChargeCut(), - alpaka::getPtrNative(d_id_buf), - alpaka::getPtrNative(d_adc_buf), - alpaka::getPtrNative(d_moduleStart_buf), - alpaka::getPtrNative(d_clusInModule_buf), - alpaka::getPtrNative(d_moduleId_buf), - alpaka::getPtrNative(d_clus_buf), - n)); + alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDivMaxNumModules, + gpuClustering::clusterChargeCut(), + alpaka::getPtrNative(d_id_buf), + alpaka::getPtrNative(d_adc_buf), + alpaka::getPtrNative(d_moduleStart_buf), + alpaka::getPtrNative(d_clusInModule_buf), + alpaka::getPtrNative(d_moduleId_buf), + alpaka::getPtrNative(d_clus_buf), + n)); alpaka::memcpy(queue, h_id_buf, d_id_buf, n); alpaka::memcpy(queue, h_clus_buf, d_clus_buf, n); alpaka::memcpy(queue, h_nclus_buf, d_clusInModule_buf, gpuClustering::MaxNumModules); diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 77172d5c5..592884845 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -26,17 +26,18 @@ struct testPrefixScan { auto& c = alpaka::declareSharedVar(acc); auto& co = alpaka::declareSharedVar(acc); - cms::alpakatools::for_each_element_in_block_strided(acc, size, [&](uint32_t i) { c[i] = 1; }); + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided( + acc, size, [&](uint32_t i) { c[i] = 1; }); alpaka::syncBlockThreads(acc); - cms::alpakatools::blockPrefixScan(acc, c, co, size, ws); - cms::alpakatools::blockPrefixScan(acc, c, size, ws); + ::cms::alpakatools::blockPrefixScan(acc, c, co, size, ws); + ::cms::alpakatools::blockPrefixScan(acc, c, size, ws); assert(1 == c[0]); assert(1 == co[0]); - cms::alpakatools::for_each_element_in_block_strided(acc, size, 1u, [&](uint32_t i) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_block_strided(acc, size, 1u, [&](uint32_t i) { assert(c[i] == c[i - 1] + 1); assert(c[i] == i + 1); assert(c[i] = co[i]); @@ -82,7 +83,7 @@ struct testWarpPrefixScan { struct init { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t* v, uint32_t val, uint32_t n) const { - cms::alpakatools::for_each_element_in_grid(acc, n, [&](uint32_t index) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(acc, n, [&](uint32_t index) { v[index] = val; if (index == 0) @@ -94,7 +95,7 @@ struct init { struct verify { template ALPAKA_FN_ACC void operator()(const T_Acc& acc, uint32_t const* v, 
uint32_t n) const { - cms::alpakatools::for_each_element_in_grid(acc, n, [&](uint32_t index) { + ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::for_each_element_in_grid(acc, n, [&](uint32_t index) { assert(v[index] == index + 1); if (index == 0) @@ -116,7 +117,8 @@ int main() { const Vec1D threadsPerBlockOrElementsPerThread1(Vec1D::all(32)); const Vec1D blocksPerGrid1(Vec1D::all(1)); - const WorkDiv1D& workDivWarp = cms::alpakatools::make_workdiv(blocksPerGrid1, threadsPerBlockOrElementsPerThread1); + const WorkDiv1D& workDivWarp = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid1, threadsPerBlockOrElementsPerThread1); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32)); @@ -133,8 +135,8 @@ int main() { for (int bs = 32; bs <= 1024; bs += 32) { const Vec1D threadsPerBlockOrElementsPerThread2(Vec1D::all(bs)); const Vec1D blocksPerGrid2(Vec1D::all(1)); - const WorkDiv1D& workDivSingleBlock = - cms::alpakatools::make_workdiv(blocksPerGrid2, threadsPerBlockOrElementsPerThread2); + const WorkDiv1D& workDivSingleBlock = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid2, threadsPerBlockOrElementsPerThread2); std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << threadsPerBlockOrElementsPerThread2 << std::endl; @@ -163,8 +165,8 @@ int main() { const Vec1D threadsPerBlockOrElementsPerThread3(Vec1D::all(nThreadsInit)); const auto nBlocksInit = (num_items + nThreadsInit - 1) / nThreadsInit; const Vec1D blocksPerGrid3(Vec1D::all(nBlocksInit)); - const WorkDiv1D& workDivMultiBlockInit = - cms::alpakatools::make_workdiv(blocksPerGrid3, threadsPerBlockOrElementsPerThread3); + const WorkDiv1D& workDivMultiBlockInit = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid3, threadsPerBlockOrElementsPerThread3); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlockInit, init(), input_d, 1, num_items)); @@ -172,28 +174,18 @@ int main() { const Vec1D threadsPerBlockOrElementsPerThread4(Vec1D::all(nThreads)); const auto nBlocks = (num_items + nThreads - 1) / nThreads; const Vec1D blocksPerGrid4(Vec1D::all(nBlocks)); - const WorkDiv1D& workDivMultiBlock = - cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4); + const WorkDiv1D& workDivMultiBlock = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv( + blocksPerGrid4, threadsPerBlockOrElementsPerThread4); std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlock, - cms::alpakatools::multiBlockPrefixScanFirstStep(), - input_d, - output1_d, - num_items)); - - const Vec1D blocksPerGridSecondStep(Vec1D::all(1)); - const WorkDiv1D& workDivMultiBlockSecondStep = - cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlockSecondStep, - cms::alpakatools::multiBlockPrefixScanSecondStep(), - input_d, - output1_d, - num_items, - nBlocks)); - + auto d_pc(alpaka::allocBuf(device, size)); + int32_t* pc = alpaka::getPtrNative(d_pc); + + alpaka::memset(queue, d_pc, 0, size); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDivMultiBlock, ::cms::alpakatools::multiBlockPrefixScan(), input_d, output1_d, num_items, pc)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d, num_items)); alpaka::wait(queue); 
// input_dBuf and output1_dBuf end of scope