diff --git a/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
new file mode 100644
index 0000000000000..fd91459f3343b
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h
@@ -0,0 +1,53 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h
+#define HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h
+
+#include <cstdint>
+
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
+  class AtomicPairCounter {
+  public:
+    using c_type = unsigned long long int;
+
+    ALPAKA_FN_HOST_ACC AtomicPairCounter() {}
+    ALPAKA_FN_HOST_ACC AtomicPairCounter(c_type i) { counter.ac = i; }
+
+    ALPAKA_FN_HOST_ACC AtomicPairCounter& operator=(c_type i) {
+      counter.ac = i;
+      return *this;
+    }
+
+    struct Counters {
+      uint32_t n;  // in a "One to Many" association is the number of "One"
+      uint32_t m;  // in a "One to Many" association is the total number of associations
+    };
+
+    union Atomic2 {
+      Counters counters;
+      c_type ac;
+    };
+
+    static constexpr c_type incr = 1UL << 32;
+
+    ALPAKA_FN_ACC Counters get() const { return counter.counters; }
+
+    // increment n by 1 and m by i. return previous value
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE Counters add(const TAcc& acc, uint32_t i) {
+      c_type c = i;
+      c += incr;
+
+      Atomic2 ret;
+      ret.ac = alpaka::atomicAdd(acc, &counter.ac, c, alpaka::hierarchy::Blocks{});
+      return ret.counters;
+    }
+
+  private:
+    Atomic2 counter;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_AtomicPairCounter_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h b/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h
new file mode 100644
index 0000000000000..b2e0e94af5236
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h
@@ -0,0 +1,303 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
+#define HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
+
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+namespace cms {
+  namespace alpakatools {
+
+    struct countFromVector {
+      template <typename TAcc, typename Histo, typename T>
+      ALPAKA_FN_ACC void operator()(const TAcc &acc,
+                                    Histo *__restrict__ h,
+                                    uint32_t nh,
+                                    T const *__restrict__ v,
+                                    uint32_t const *__restrict__ offsets) const {
+        const uint32_t nt = offsets[nh];
+        for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) {
+          auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
+          ALPAKA_ASSERT_OFFLOAD((*off) > 0);
+          int32_t ih = off - offsets - 1;
+          ALPAKA_ASSERT_OFFLOAD(ih >= 0);
+          ALPAKA_ASSERT_OFFLOAD(ih < int(nh));
+          h->count(acc, v[i], ih);
+        });
+      }
+    };
+
+    struct fillFromVector {
+      template <typename TAcc, typename Histo, typename T>
+      ALPAKA_FN_ACC void operator()(const TAcc &acc,
+                                    Histo *__restrict__ h,
+                                    uint32_t nh,
+                                    T const *__restrict__ v,
+                                    uint32_t const *__restrict__ offsets) const {
+        const uint32_t nt = offsets[nh];
+        for_each_element_in_grid_strided(acc, nt, [&](uint32_t i) {
+          auto off = alpaka_std::upper_bound(offsets, offsets + nh + 1, i);
+          ALPAKA_ASSERT_OFFLOAD((*off) > 0);
+          int32_t ih = off - offsets - 1;
+          ALPAKA_ASSERT_OFFLOAD(ih >= 0);
+          ALPAKA_ASSERT_OFFLOAD(ih < int(nh));
+          h->fill(acc, v[i], i, ih);
+        });
+      }
+    };
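+
+    // The histogram is filled in three phases: count entries per bin
+    // (countFromVector), turn the per-bin counts into cumulative offsets with
+    // a prefix scan (launchFinalize), then store the indices while walking the
+    // offsets back down (fillFromVector). The helpers below launch the phases.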
+    template <typename Histo, typename TQueue>
+    inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h, TQueue &queue) {
+      auto histoOffView = make_device_view(alpaka::getDev(queue), h->off, Histo::totbins());
+      alpaka::memset(queue, histoOffView, 0);
+    }
+
+    template <typename TAcc, typename Histo, typename TQueue>
+    inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h, TQueue &queue) {
+      uint32_t *poff = h->off;
+
+      const int num_items = Histo::totbins();
+
+      const auto threadsPerBlockOrElementsPerThread = 1024u;
+      const auto blocksPerGrid = divide_up_by(num_items, threadsPerBlockOrElementsPerThread);
+      const auto workDiv = make_workdiv<TAcc>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+      alpaka::exec<TAcc>(queue, workDiv, multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items);
+
+      const auto workDivWith1Block = make_workdiv<TAcc>(1, threadsPerBlockOrElementsPerThread);
+      alpaka::exec<TAcc>(
+          queue, workDivWith1Block, multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, blocksPerGrid);
+    }
+
+    template <typename TAcc, typename Histo, typename T, typename TQueue>
+    inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
+                                                                  uint32_t nh,
+                                                                  T const *v,
+                                                                  uint32_t const *offsets,
+                                                                  uint32_t totSize,
+                                                                  uint32_t nthreads,
+                                                                  TQueue &queue) {
+      launchZero(h, queue);
+
+      const auto threadsPerBlockOrElementsPerThread = nthreads;
+      const auto blocksPerGrid = divide_up_by(totSize, nthreads);
+      const auto workDiv = make_workdiv<TAcc>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+
+      alpaka::exec<TAcc>(queue, workDiv, countFromVector(), h, nh, v, offsets);
+      launchFinalize<TAcc>(h, queue);
+
+      alpaka::exec<TAcc>(queue, workDiv, fillFromVector(), h, nh, v, offsets);
+    }
+
+    struct finalizeBulk {
+      template <typename TAcc, typename Assoc>
+      ALPAKA_FN_ACC void operator()(const TAcc &acc, AtomicPairCounter const *apc, Assoc *__restrict__ assoc) const {
+        assoc->bulkFinalizeFill(acc, *apc);
+      }
+    };
+
+    // iterate over N bins left and right of the one containing "v"
+    template <typename Hist, typename V, typename Func>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func) {
+      int bs = Hist::bin(value);
+      int be = std::min(int(Hist::nbins() - 1), bs + n);
+      bs = std::max(0, bs - n);
+      ALPAKA_ASSERT_OFFLOAD(be >= bs);
+      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+        func(*pj);
+      }
+    }
+
+    // iterate over bins containing all values in window wmin, wmax
+    template <typename Hist, typename V, typename Func>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
+      auto bs = Hist::bin(wmin);
+      auto be = Hist::bin(wmax);
+      ALPAKA_ASSERT_OFFLOAD(be >= bs);
+      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
+        func(*pj);
+      }
+    }
+
+    template <typename T,                  // the type of the discretized input values
+              uint32_t NBINS,              // number of bins
+              uint32_t SIZE,               // max number of element
+              uint32_t S = sizeof(T) * 8,  // number of significant bits in T
+              typename I = uint32_t,       // type stored in the container
+              uint32_t NHISTS = 1          // number of histos stored
+              >
+    class HistoContainer {
+    public:
+      using Counter = uint32_t;
+
+      using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;
+
+      using index_type = I;
+      using UT = typename std::make_unsigned<T>::type;
+
+      static constexpr uint32_t ilog2(uint32_t v) {
+        constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+        constexpr uint32_t s[] = {1, 2, 4, 8, 16};
+
+        uint32_t r = 0;  // result of log2(v) will go here
+        for (auto i = 4; i >= 0; i--)
+          if (v & b[i]) {
+            v >>= s[i];
+            r |= s[i];
+          }
+        return r;
+      }
+
+      static constexpr uint32_t sizeT() { return S; }
+      static constexpr uint32_t nbins() { return NBINS; }
+      static constexpr int32_t nhists() { return NHISTS; }
+      static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
+      static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
+      static constexpr int32_t capacity() { return SIZE; }
+
+      static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
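+
+      // map a value t to its bin index using the nbits() most significant bits of t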
+      static constexpr UT bin(T t) {
+        constexpr uint32_t shift = sizeT() - nbits();
+        constexpr uint32_t mask = (1 << nbits()) - 1;
+        return (t >> shift) & mask;
+      }
+
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void zero() {
+        for (auto &i : off)
+          i = 0;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void add(const TAcc &acc, CountersOnly const &co) {
+        for (uint32_t i = 0; i < totbins(); ++i) {
+          alpaka::atomicAdd(acc, off + i, co.off[i], alpaka::hierarchy::Blocks{});
+        }
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicIncrement(const TAcc &acc, Counter &x) {
+        return alpaka::atomicAdd(acc, &x, 1u, alpaka::hierarchy::Blocks{});
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE static uint32_t atomicDecrement(const TAcc &acc, Counter &x) {
+        return alpaka::atomicSub(acc, &x, 1u, alpaka::hierarchy::Blocks{});
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void countDirect(const TAcc &acc, T b) {
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        atomicIncrement(acc, off[b]);
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void fillDirect(const TAcc &acc, T b, index_type j) {
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        auto w = atomicDecrement(acc, off[b]);
+        ALPAKA_ASSERT_OFFLOAD(w > 0);
+        bins[w - 1] = j;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t
+      bulkFill(const TAcc &acc, AtomicPairCounter &apc, index_type const *v, uint32_t n) {
+        auto c = apc.add(acc, n);
+        if (c.m >= nbins())
+          return -int32_t(c.m);
+        off[c.m] = c.n;
+        for (uint32_t j = 0; j < n; ++j)
+          bins[c.n + j] = v[j];
+        return c.m;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalize(const TAcc &acc, AtomicPairCounter const &apc) {
+        off[apc.get().m] = apc.get().n;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void bulkFinalizeFill(const TAcc &acc, AtomicPairCounter const &apc) {
+        auto m = apc.get().m;
+        auto n = apc.get().n;
+
+        if (m >= nbins()) {  // overflow!
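+          // the counter moved past the last bin: freeze the terminating offset
+          // at the last valid value and leave the container truncated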
+          off[nbins()] = uint32_t(off[nbins() - 1]);
+          return;
+        }
+
+        for_each_element_in_grid_strided(acc, totbins(), m, [&](uint32_t i) { off[i] = n; });
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t) {
+        uint32_t b = bin(t);
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        atomicIncrement(acc, off[b]);
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j) {
+        uint32_t b = bin(t);
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        auto w = atomicDecrement(acc, off[b]);
+        ALPAKA_ASSERT_OFFLOAD(w > 0);
+        bins[w - 1] = j;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void count(const TAcc &acc, T t, uint32_t nh) {
+        uint32_t b = bin(t);
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        b += histOff(nh);
+        ALPAKA_ASSERT_OFFLOAD(b < totbins());
+        atomicIncrement(acc, off[b]);
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void fill(const TAcc &acc, T t, index_type j, uint32_t nh) {
+        uint32_t b = bin(t);
+        ALPAKA_ASSERT_OFFLOAD(b < nbins());
+        b += histOff(nh);
+        ALPAKA_ASSERT_OFFLOAD(b < totbins());
+        auto w = atomicDecrement(acc, off[b]);
+        ALPAKA_ASSERT_OFFLOAD(w > 0);
+        bins[w - 1] = j;
+      }
+
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void finalize(const TAcc &acc, Counter *ws = nullptr) {
+        ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == 0);
+        blockPrefixScan(acc, off, totbins(), ws);
+        ALPAKA_ASSERT_OFFLOAD(off[totbins() - 1] == off[totbins() - 2]);
+      }
+
+      constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
+      constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
+
+      constexpr index_type const *begin() const { return bins; }
+      constexpr index_type const *end() const { return begin() + size(); }
+
+      constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
+      constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }
+
+      Counter off[totbins()];
+      index_type bins[capacity()];
+    };
+
+    template <typename I,        // type stored in the container
+              uint32_t MAXONES,  // max number of "ones"
+              uint32_t MAXMANYS  // max number of "manys"
+              >
+    using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
+
+  }  // namespace alpakatools
+}  // namespace cms
+#endif  // HeterogeneousCore_AlpakaInterface_interface_HistoContainer_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h
new file mode 100644
index 0000000000000..ee5a1f3b280a7
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h
@@ -0,0 +1,142 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h
+#define HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h
+
+// author: Felice Pantaleo, CERN, 2018
+// alpaka integration in cmssw: Adriano Di Florio, 2022
+
+#include <type_traits>
+#include <utility>
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
+  template <class T>
+  struct SimpleVector {
+    constexpr SimpleVector() = default;
+
+    // ownership of m_data stays within the caller
+    constexpr void construct(int capacity, T *data) {
+      m_size = 0;
+      m_capacity = capacity;
+      m_data = data;
+    }
+
+    inline constexpr int push_back_unsafe(const T &element) {
+      auto previousSize = m_size;
+      m_size++;
+      if (previousSize < m_capacity) {
+        m_data[previousSize] = element;
+        return previousSize;
+      } else {
+        --m_size;
+        return -1;
+      }
+    }
+
+    template <class... Ts>
+    constexpr int emplace_back_unsafe(Ts &&...args) {
+      auto previousSize = m_size;
+      m_size++;
+      if (previousSize < m_capacity) {
+        (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
+        return previousSize;
+      } else {
+        --m_size;
+        return -1;
+      }
+    }
+
+    ALPAKA_FN_ACC inline T &back() { return m_data[m_size - 1]; }
+
+    ALPAKA_FN_ACC inline const T &back() const {
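+      // callers are expected to check empty() first: the fallback below
+      // returns a reference to a temporary, as the original note flags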
+      if (m_size > 0) {
+        return m_data[m_size - 1];
+      } else
+        return T();  // undefined behaviour
+    }
+
+    // thread-safe version of the vector, when used in a CUDA kernel
+    template <typename TAcc>
+    ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) {
+      auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+      if (previousSize < m_capacity) {
+        m_data[previousSize] = element;
+        return previousSize;
+      } else {
+        alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    template <typename TAcc, class... Ts>
+    ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) {
+      auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+      if (previousSize < m_capacity) {
+        (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
+        return previousSize;
+      } else {
+        alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    // thread-safe version of resize
+    template <typename TAcc>
+    ALPAKA_FN_ACC int extend(const TAcc &acc, int size = 1) {
+      auto previousSize = alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{});
+      if (previousSize < m_capacity) {
+        return previousSize;
+      } else {
+        alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC int shrink(const TAcc &acc, int size = 1) {
+      auto previousSize = alpaka::atomicSub(acc, &m_size, size, alpaka::hierarchy::Blocks{});
+      if (previousSize >= size) {
+        return previousSize - size;
+      } else {
+        alpaka::atomicAdd(acc, &m_size, size, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    inline constexpr bool empty() const { return m_size <= 0; }
+    inline constexpr bool full() const { return m_size >= m_capacity; }
+    inline constexpr T &operator[](int i) { return m_data[i]; }
+    inline constexpr const T &operator[](int i) const { return m_data[i]; }
+    inline constexpr void reset() { m_size = 0; }
+    inline constexpr int size() const { return m_size; }
+    inline constexpr int capacity() const { return m_capacity; }
+    inline constexpr T const *data() const { return m_data; }
+    inline constexpr void resize(int size) { m_size = size; }
+    inline constexpr void set_data(T *data) { m_data = data; }
+
+  private:
+    int m_size;
+    int m_capacity;
+
+    T *m_data;
+  };
+
+  // ownership of m_data stays within the caller
+  template <class T>
+  SimpleVector<T> make_SimpleVector(int capacity, T *data) {
+    SimpleVector<T> ret;
+    ret.construct(capacity, data);
+    return ret;
+  }
+
+  // ownership of m_data stays within the caller
+  template <class T>
+  SimpleVector<T> *make_SimpleVector(SimpleVector<T> *mem, int capacity, T *data) {
+    auto ret = new (mem) SimpleVector<T>();
+    ret->construct(capacity, data);
+    return ret;
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_SimpleVector_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/VecArray.h b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h
new file mode 100644
index 0000000000000..b708193dc1d02
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/VecArray.h
@@ -0,0 +1,107 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_VecArray_h
+#define HeterogeneousCore_AlpakaInterface_interface_VecArray_h
+
+//
+// Author: Felice Pantaleo, CERN
+//
+
+#include <utility>
+
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
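+  // vector-like interface over a fixed-capacity array; unlike SimpleVector,
+  // the storage is embedded in the object itself rather than borrowed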
+  template <class T, int maxSize>
+  class VecArray {
+  public:
+    using self = VecArray<T, maxSize>;
+    using value_t = T;
+
+    inline constexpr int push_back_unsafe(const T &element) {
+      auto previousSize = m_size;
+      m_size++;
+      if (previousSize < maxSize) {
+        m_data[previousSize] = element;
+        return previousSize;
+      } else {
+        --m_size;
+        return -1;
+      }
+    }
+
+    template <class... Ts>
+    constexpr int emplace_back_unsafe(Ts &&...args) {
+      auto previousSize = m_size;
+      m_size++;
+      if (previousSize < maxSize) {
+        (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
+        return previousSize;
+      } else {
+        --m_size;
+        return -1;
+      }
+    }
+
+    inline constexpr T &back() const {
+      if (m_size > 0) {
+        return m_data[m_size - 1];
+      } else
+        return T();  // undefined behaviour
+    }
+
+    // thread-safe version of the vector, when used in a CUDA kernel
+    template <typename TAcc>
+    ALPAKA_FN_ACC int push_back(const TAcc &acc, const T &element) {
+      auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+      if (previousSize < maxSize) {
+        m_data[previousSize] = element;
+        return previousSize;
+      } else {
+        alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    template <typename TAcc, class... Ts>
+    ALPAKA_FN_ACC int emplace_back(const TAcc &acc, Ts &&...args) {
+      auto previousSize = alpaka::atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+      if (previousSize < maxSize) {
+        (new (&m_data[previousSize]) T(std::forward<Ts>(args)...));
+        return previousSize;
+      } else {
+        alpaka::atomicSub(acc, &m_size, 1, alpaka::hierarchy::Blocks{});
+        return -1;
+      }
+    }
+
+    inline constexpr T pop_back() {
+      if (m_size > 0) {
+        auto previousSize = m_size--;
+        return m_data[previousSize - 1];
+      } else
+        return T();
+    }
+
+    inline constexpr T const *begin() const { return m_data; }
+    inline constexpr T const *end() const { return m_data + m_size; }
+    inline constexpr T *begin() { return m_data; }
+    inline constexpr T *end() { return m_data + m_size; }
+    inline constexpr int size() const { return m_size; }
+    inline constexpr T &operator[](int i) { return m_data[i]; }
+    inline constexpr const T &operator[](int i) const { return m_data[i]; }
+    inline constexpr void reset() { m_size = 0; }
+    inline static constexpr int capacity() { return maxSize; }
+    inline constexpr T const *data() const { return m_data; }
+    inline constexpr void resize(int size) { m_size = size; }
+    inline constexpr bool empty() const { return 0 == m_size; }
+    inline constexpr bool full() const { return maxSize == m_size; }
+
+  private:
+    T m_data[maxSize];
+
+    int m_size;
+  };
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_VecArray_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h
new file mode 100644
index 0000000000000..ca3a6e5997766
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/alpakastdAlgorithm.h
@@ -0,0 +1,35 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h
+#define HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h
+
+#include <algorithm>
+#include <functional>
+#include <utility>
+
+#include <alpaka/alpaka.hpp>
+
+// reimplementation of std algorithms able to compile with Alpaka,
+// mostly by declaring them constexpr
+
+namespace alpaka_std {
+
+  template <typename RandomIt, typename T, typename Compare = std::less<T>>
+  ALPAKA_FN_HOST_ACC constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp = {}) {
+    auto count = last - first;
+
+    while (count > 0) {
+      auto it = first;
+      auto step = count / 2;
+      it += step;
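+      // classic binary search step: if *it <= value the first element greater
+      // than value lies strictly to the right of it, otherwise in the left half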
+      if (!comp(value, *it)) {
+        first = ++it;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    return first;
+  }
+
+}  // namespace alpaka_std
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_alpakastdAlgorithm_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h
new file mode 100644
index 0000000000000..a1b0199de1ca7
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/prefixScan.h
@@ -0,0 +1,252 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
+#define HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
+
+#include <cstdint>
+#include <cstddef>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
+
+/*
+FIXME: this version of prefix scan is not fully updated
+w.r.t. the CUDA one, which uses a single kernel.
+See: https://github.com/cms-sw/cmssw/blob/dca2f3286ff7aa67b6e4021c17698ec31a400e47/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
+*/
+
+namespace cms {
+  namespace alpakatools {
+
+    // FIXME warpSize should be device-dependent
+    constexpr uint32_t warpSize = 32;
+    constexpr uint64_t warpMask = ~(~0ull << warpSize);
+
+#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \
+    (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+
+    template <typename T>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) {
+#if defined(__HIP_DEVICE_COMPILE__)
+      ALPAKA_ASSERT_OFFLOAD(mask == warpMask);
+#endif
+      // ci and co may be the same
+      auto x = ci[i];
+      CMS_UNROLL_LOOP
+      for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+#if defined(__CUDA_ARCH__)
+        auto y = __shfl_up_sync(mask, x, offset);
+#elif defined(__HIP_DEVICE_COMPILE__)
+        auto y = __shfl_up(x, offset);
+#endif
+        if (laneId >= offset)
+          x += y;
+      }
+      co[i] = x;
+    }
+
+    template <typename T>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, uint32_t i, uint32_t mask) {
+#if defined(__HIP_DEVICE_COMPILE__)
+      ALPAKA_ASSERT_OFFLOAD(mask == warpMask);
+#endif
+      auto x = c[i];
+      CMS_UNROLL_LOOP
+      for (uint32_t offset = 1; offset < warpSize; offset <<= 1) {
+#if defined(__CUDA_ARCH__)
+        auto y = __shfl_up_sync(mask, x, offset);
+#elif defined(__HIP_DEVICE_COMPILE__)
+        auto y = __shfl_up(x, offset);
+#endif
+        if (laneId >= offset)
+          x += y;
+      }
+      c[i] = x;
+    }
+
+#endif  // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+
+    // limited to warpSize² elements
+    template <typename TAcc, typename T>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan(
+        const TAcc& acc, T const* ci, T* co, uint32_t size, T* ws = nullptr) {
+#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \
+    (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+      uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      uint32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      ALPAKA_ASSERT_OFFLOAD(ws);
+      ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize);
+      ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize);
+      auto first = blockThreadIdx;
+#if defined(__CUDA_ARCH__)
+      auto mask = __ballot_sync(warpMask, first < size);
+#elif defined(__HIP_DEVICE_COMPILE__)
+      auto mask = warpMask;
+#endif
+      auto laneId = blockThreadIdx & (warpSize - 1);
+
+      for (auto i = first; i < size; i += blockDimension) {
+        warpPrefixScan(laneId, ci, co, i, mask);
+        auto warpId = i / warpSize;
+        // FIXME test ?
+        ALPAKA_ASSERT_OFFLOAD(warpId < warpSize);
+        if ((warpSize - 1) == laneId)
+          ws[warpId] = co[i];
+#if defined(__CUDA_ARCH__)
+        mask = __ballot_sync(mask, i + blockDimension < size);
+#endif
+      }
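+      // each warp has produced the inclusive scan of its own chunk; the last
+      // lane parked the warp total in ws[warpId], so scanning ws with a single
+      // warp below yields the offset to add to every later chunk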
+      alpaka::syncBlockThreads(acc);
+      if (size <= warpSize)
+        return;
+      if (blockThreadIdx < warpSize) {
+        warpPrefixScan(laneId, ws, blockThreadIdx, warpMask);
+      }
+      alpaka::syncBlockThreads(acc);
+      for (auto i = first + warpSize; i < size; i += blockDimension) {
+        uint32_t warpId = i / warpSize;
+        co[i] += ws[warpId - 1];
+      }
+      alpaka::syncBlockThreads(acc);
+#else
+      co[0] = ci[0];
+      for (uint32_t i = 1; i < size; ++i)
+        co[i] = ci[i] + co[i - 1];
+#endif  // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+    }
+
+    template <typename TAcc, typename T>
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc,
+                                                             T* __restrict__ c,
+                                                             uint32_t size,
+                                                             T* __restrict__ ws = nullptr) {
+#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \
+    (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+      uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      uint32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      ALPAKA_ASSERT_OFFLOAD(ws);
+      ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize);
+      ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize);
+      auto first = blockThreadIdx;
+#if defined(__CUDA_ARCH__)
+      auto mask = __ballot_sync(warpMask, first < size);
+#elif defined(__HIP_DEVICE_COMPILE__)
+      auto mask = warpMask;
+#endif
+      auto laneId = blockThreadIdx & (warpSize - 1);
+
+      for (auto i = first; i < size; i += blockDimension) {
+        warpPrefixScan(laneId, c, i, mask);
+        auto warpId = i / warpSize;
+        ALPAKA_ASSERT_OFFLOAD(warpId < warpSize);
+        if ((warpSize - 1) == laneId)
+          ws[warpId] = c[i];
+#if defined(__CUDA_ARCH__)
+        mask = __ballot_sync(mask, i + blockDimension < size);
+#endif
+      }
+      alpaka::syncBlockThreads(acc);
+      if (size <= warpSize)
+        return;
+      if (blockThreadIdx < warpSize) {
+        warpPrefixScan(laneId, ws, blockThreadIdx, warpMask);
+      }
+      alpaka::syncBlockThreads(acc);
+      for (auto i = first + warpSize; i < size; i += blockDimension) {
+        auto warpId = i / warpSize;
+        c[i] += ws[warpId - 1];
+      }
+      alpaka::syncBlockThreads(acc);
+#else
+      for (uint32_t i = 1; i < size; ++i)
+        c[i] += c[i - 1];
+#endif  // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+    }
+
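+    // the multi-block scan runs in two kernels: first every block scans its
+    // own chunk of the input, then a single block scans the per-block sums
+    // and adds them back to the partial results (the second step below)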
+    // limited to warpSize⁴ elements
+    template <typename T>
+    struct multiBlockPrefixScanFirstStep {
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* ci, T* co, int32_t size) const {
+        uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        uint32_t const threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+        uint32_t const blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+        auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
+        // first each block does a scan of size warpSize² (better be enough blocks)
+#ifndef NDEBUG
+        [[maybe_unused]] uint32_t const gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+        ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= (warpSize * warpSize));
+#endif
+#if 0
+        // this is not yet available in alpaka, see
+        // https://github.com/alpaka-group/alpaka/issues/1648
+        ALPAKA_ASSERT_OFFLOAD(sizeof(T) * gridDimension <= dynamic_smem_size());  // size of psum below
+#endif
+        int off = blockDimension * blockIdx * threadDimension;
+        if (size - off > 0)
+          blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * threadDimension), size - off), ws);
+      }
+    };
+
+    // limited to warpSize⁴ elements
+    template <typename T>
+    struct multiBlockPrefixScanSecondStep {
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const {
+        uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        uint32_t const threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+        uint32_t const threadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+
+        T* const psum = alpaka::getDynSharedMem<T>(acc);
+
+        // first each block does a scan of size warpSize² (better be enough blocks)
+        ALPAKA_ASSERT_OFFLOAD(static_cast<int32_t>(blockDimension * threadDimension) >= numBlocks);
+        for (int elemId = 0; elemId < static_cast<int>(threadDimension); ++elemId) {
+          int index = threadIdx * threadDimension + elemId;
+
+          if (index < numBlocks) {
+            int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1;
+            psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0)
+                              ? co[lastElementOfPreviousBlockId]
+                              : T(0);
+          }
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+        auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
+        blockPrefixScan(acc, psum, psum, numBlocks, ws);
+
+        for (int elemId = 0; elemId < static_cast<int>(threadDimension); ++elemId) {
+          int first = threadIdx * threadDimension + elemId;
+          for (int i = first + blockDimension * threadDimension; i < size; i += blockDimension * threadDimension) {
+            auto k = i / (blockDimension * threadDimension);
+            co[i] += psum[k];
+          }
+        }
+      }
+    };
+
+  }  // namespace alpakatools
+}  // namespace cms
+
+// declare the amount of block shared memory used by the multiBlockPrefixScanSecondStep kernel
+namespace alpaka::trait {
+  template <typename TAcc, typename T>
+  struct BlockSharedMemDynSizeBytes<cms::alpakatools::multiBlockPrefixScanSecondStep<T>, TAcc> {
+    template <typename TVec>
+    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
+        cms::alpakatools::multiBlockPrefixScanSecondStep<T> const& /* kernel */,
+        TVec const& /* blockThreadExtent */,
+        TVec const& /* threadElemExtent */,
+        T const* /* ci */,
+        T* /* co */,
+        int32_t /* size */,
+        int32_t numBlocks) {
+      return sizeof(T) * numBlocks;
+    }
+  };
+}  // namespace alpaka::trait
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/radixSort.h b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
new file mode 100644
index 0000000000000..33af8deeb43aa
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/radixSort.h
@@ -0,0 +1,250 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_radixSort_h
+#define HeterogeneousCore_AlpakaInterface_interface_radixSort_h
+
+#include <cstdint>
+#include <type_traits>
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void dummyReorder(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {}
+
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderSigned(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+    //move negative first...
+
+    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
+    firstNeg = a[ind[0]] < 0 ? 0 : size;
+    alpaka::syncBlockThreads(acc);
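+
+    // after sorting the bit patterns as unsigned, two's complement negatives
+    // (sign bit set) end up after the positives; the first sign flip between
+    // neighbours marks where the negative block starts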
+    // find first negative
+    for_each_element_in_block_strided(acc, size - 1, [&](uint32_t idx) {
+      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
+        firstNeg = idx + 1;
+    });
+
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, size, firstNeg, [&](uint32_t idx) { ind2[idx - firstNeg] = ind[idx]; });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, firstNeg, [&](uint32_t idx) { ind2[idx + size - firstNeg] = ind[idx]; });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; });
+  }
+
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE void reorderFloat(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+    //move negative first...
+
+    auto& firstNeg = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
+    firstNeg = a[ind[0]] < 0 ? 0 : size;
+    alpaka::syncBlockThreads(acc);
+
+    // find first negative
+    for_each_element_in_block_strided(acc, size - 1, [&](uint32_t idx) {
+      if ((a[ind[idx]] ^ a[ind[idx + 1]]) < 0)
+        firstNeg = idx + 1;
+    });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, size, firstNeg, [&](uint32_t idx) { ind2[size - idx - 1] = ind[idx]; });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, firstNeg, [&](uint32_t idx) { ind2[idx + size - firstNeg] = ind[idx]; });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; });
+  }
+
+  template <typename TAcc, typename T, int NS, typename RF>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortImpl(
+      const TAcc& acc, T const* __restrict__ a, uint16_t* ind, uint16_t* ind2, uint32_t size, RF reorder) {
+#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \
+    (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__))
+    const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    const uint32_t blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+
+    constexpr int d = 8, w = 8 * sizeof(T);
+    constexpr int sb = 1 << d;
+    constexpr int ps = int(sizeof(T)) - NS;
+
+    auto& c = alpaka::declareSharedVar<int32_t[sb], __COUNTER__>(acc);
+    auto& ct = alpaka::declareSharedVar<int32_t[sb], __COUNTER__>(acc);
+    auto& cu = alpaka::declareSharedVar<int32_t[sb], __COUNTER__>(acc);
+    auto& ibs = alpaka::declareSharedVar<int, __COUNTER__>(acc);
+    auto& p = alpaka::declareSharedVar<int, __COUNTER__>(acc);
+
+    ALPAKA_ASSERT_OFFLOAD(size > 0);
+    ALPAKA_ASSERT_OFFLOAD(blockDimension >= sb);
+
+    p = ps;
+
+    auto j = ind;
+    auto k = ind2;
+
+    for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { j[idx] = idx; });
+    alpaka::syncBlockThreads(acc);
+
+    while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, (p < w / d))) {
+      for_each_element_in_block_strided(acc, sb, [&](uint32_t idx) { c[idx] = 0; });
+      alpaka::syncBlockThreads(acc);
+
+      // fill bins
+      for_each_element_in_block_strided(acc, size, [&](uint32_t idx) {
+        auto bin = (a[j[idx]] >> d * p) & (sb - 1);
+        alpaka::atomicAdd(acc, &c[bin], 1, alpaka::hierarchy::Threads{});
+      });
+      alpaka::syncBlockThreads(acc);
+
+      // prefix scan "optimized"???...
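+      // warp-level inclusive scan of the bin counters via shuffles; the
+      // per-warp partial sums are then accumulated serially into c[] below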
+      for_each_element_in_block(acc, sb, [&](uint32_t idx) {
+        auto x = c[idx];
+        auto laneId = idx & 0x1f;
+
+        for (int offset = 1; offset < 32; offset <<= 1) {
+#if defined(__CUDA_ARCH__)
+          auto y = __shfl_up_sync(0xffffffff, x, offset);
+#elif defined(__HIP_DEVICE_COMPILE__)
+          auto y = __shfl_up(x, offset);
+#endif
+          if (laneId >= (uint32_t)offset)
+            x += y;
+        }
+        ct[idx] = x;
+      });
+      alpaka::syncBlockThreads(acc);
+
+      for_each_element_in_block(acc, sb, [&](uint32_t idx) {
+        auto ss = (idx / 32) * 32 - 1;
+        c[idx] = ct[idx];
+        for (int i = ss; i > 0; i -= 32)
+          c[idx] += ct[i];
+      });
+
+      /*
+      //prefix scan for the nulls (for documentation)
+      if (threadIdxLocal==0)
+        for (int i = 1; i < sb; ++i) c[i] += c[i-1];
+      */
+
+      // broadcast
+      ibs = size - 1;
+      alpaka::syncBlockThreads(acc);
+
+      while (alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, ibs > 0)) {
+        for_each_element_in_block(acc, sb, [&](uint32_t idx) {
+          cu[idx] = -1;
+          ct[idx] = -1;
+        });
+        alpaka::syncBlockThreads(acc);
+
+        for_each_element_in_block(acc, sb, [&](uint32_t idx) {
+          int i = ibs - idx;
+          int32_t bin = -1;
+          if (i >= 0) {
+            bin = (a[j[i]] >> d * p) & (sb - 1);
+            ct[idx] = bin;
+            alpaka::atomicMax(acc, &cu[bin], int(i), alpaka::hierarchy::Threads{});
+          }
+        });
+        alpaka::syncBlockThreads(acc);
+
+        for_each_element_in_block(acc, sb, [&](uint32_t idx) {
+          int i = ibs - idx;
+          int32_t bin = (i >= 0 ? ((a[j[i]] >> d * p) & (sb - 1)) : -1);
+          if (i >= 0 && i == cu[bin])  // ensure to keep them in order
+            for (int ii = idx; ii < sb; ++ii)
+              if (ct[ii] == bin) {
+                auto oi = ii - idx;
+                // assert(i>=oi);if(i>=oi)
+                k[--c[bin]] = j[i - oi];
+              }
+        });
+        alpaka::syncBlockThreads(acc);
+
+        if (threadIdxLocal == 0) {
+          ibs -= sb;
+          // cms-patatrack/pixeltrack-standalone#210
+          alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});
+        }
+        alpaka::syncBlockThreads(acc);
+      }
+
+      /*
+      // broadcast for the nulls (for documentation)
+      if (threadIdxLocal==0)
+        for (int i=size-first-1; i>=0; i--) { // =blockDim.x) {
+          auto bin = (a[j[i]] >> d*p)&(sb-1);
+          auto ik = atomicSub(&c[bin],1);
+          k[ik-1] = j[i];
+        }
+      */
+
+      alpaka::syncBlockThreads(acc);
+      ALPAKA_ASSERT_OFFLOAD(c[0] == 0);
+
+      // swap (local, ok)
+      auto t = j;
+      j = k;
+      k = t;
+
+      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      if (threadIdxLocal == 0)
+        ++p;
+      alpaka::syncBlockThreads(acc);
+    }
+
+    if ((w != 8) && (0 == (NS & 1)))
+      ALPAKA_ASSERT_OFFLOAD(j == ind);  // w/d is even so ind is correct
+
+    if (j != ind)  // odd...
+      for_each_element_in_block_strided(acc, size, [&](uint32_t idx) { ind[idx] = ind2[idx]; });
+
+    alpaka::syncBlockThreads(acc);
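+
+    // the key bit patterns were sorted as unsigned integers, so signed and
+    // floating point inputs still need their negative half moved in front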
+    // now move negative first... (if signed)
+    reorder(acc, a, ind, ind2, size);
+#endif
+  }
+
+  template <typename TAcc,
+            typename T,
+            int NS = sizeof(T),
+            typename std::enable_if<std::is_unsigned<T>::value, T>::type* = nullptr>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, dummyReorder<TAcc, T>);
+  }
+
+  template <typename TAcc,
+            typename T,
+            int NS = sizeof(T),
+            typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+    radixSortImpl<TAcc, T, NS>(acc, a, ind, ind2, size, reorderSigned<TAcc, T>);
+  }
+
+  template <typename TAcc,
+            typename T,
+            int NS = sizeof(T),
+            typename std::enable_if<std::is_floating_point<T>::value, T>::type* = nullptr>
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSort(
+      const TAcc& acc, T const* a, uint16_t* ind, uint16_t* ind2, uint32_t size) {
+    static_assert(sizeof(T) == sizeof(int), "radixSort with the wrong type size");
+    using I = int;
+    radixSortImpl<TAcc, I, NS>(acc, (I const*)(a), ind, ind2, size, reorderFloat<TAcc, I>);
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // HeterogeneousCore_AlpakaInterface_interface_radixSort_h
diff --git a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
index 0d295855976da..ecee1ff031daa 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -302,6 +302,210 @@ namespace cms::alpakatools {
     const Vec extent_;
   };
 
+  /*********************************************
+   *           RANGE COMPUTATION
+   ********************************************/
+
+  /*
+   * Computes the range of the elements indexes, local to the block.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
+  template <typename TAcc>
+  ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block(const TAcc& acc,
+                                                                 const Idx elementIdxShift,
+                                                                 const unsigned int dimIndex = 0u) {
+    // Take into account the thread index in block.
+    const Idx threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndex]);
+    const Idx threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[dimIndex]);
+
+    // Compute the elements indexes in block.
+    // Obviously relevant for CPU only.
+    // For GPU, threadDimension == 1, and elementIdx == firstElementIdx == threadIdx + elementIdxShift.
+    const Idx firstElementIdxLocal = threadIdxLocal * threadDimension;
+    const Idx firstElementIdx = firstElementIdxLocal + elementIdxShift;  // Add the shift!
+    const Idx endElementIdxUncut = firstElementIdx + threadDimension;
+
+    // Return element indexes, shifted by elementIdxShift.
+    return {firstElementIdx, endElementIdxUncut};
+  }
+
+  /*
+   * Computes the range of the elements indexes, local to the block.
+   * Truncated by the max number of elements of interest.
+   */
+  template <typename TAcc>
+  ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_block_truncated(const TAcc& acc,
+                                                                           const Idx maxNumberOfElements,
+                                                                           const Idx elementIdxShift,
+                                                                           const unsigned int dimIndex = 0u) {
+    // Check dimension
+    //static_assert(alpaka::Dim<TAcc>::value == Dim1::value,
+    //              "Accelerator and maxNumberOfElements need to have same dimension.");
+    auto [firstElementIdxLocal, endElementIdxLocal] = element_index_range_in_block(acc, elementIdxShift, dimIndex);
+
+    // Truncate
+    endElementIdxLocal = std::min(endElementIdxLocal, maxNumberOfElements);
+
+    // Return element indexes, shifted by elementIdxShift, and truncated by maxNumberOfElements.
+    return {firstElementIdxLocal, endElementIdxLocal};
+  }
+
+  /*
+   * Computes the range of the elements indexes in grid.
+   * Warning: the max index is not truncated by the max number of elements of interest.
+   */
+  template <typename TAcc>
+  ALPAKA_FN_ACC std::pair<Idx, Idx> element_index_range_in_grid(const TAcc& acc,
+                                                                Idx elementIdxShift,
+                                                                const unsigned int dimIndex = 0u) {
+    // Take into account the block index in grid.
+    const Idx blockIdxInGrid(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[dimIndex]);
+    const Idx blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex]);
+
+    // Shift to get global indices in grid (instead of local to the block)
+    elementIdxShift += blockIdxInGrid * blockDimension;
+
+    // Return element indexes, shifted by elementIdxShift.
+    return element_index_range_in_block(acc, elementIdxShift, dimIndex);
+  }
+
+  /*
+   * Loop on all (CPU) elements.
+   * Elements loop makes sense in CPU case only. In GPU case, elementIdx = firstElementIdx = threadIdx + shift.
+   * Indexes are local to the BLOCK.
+   */
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc,
+                                               const Idx maxNumberOfElements,
+                                               const Idx elementIdxShift,
+                                               const Func func,
+                                               const unsigned int dimIndex = 0) {
+    const auto& [firstElementIdx, endElementIdx] =
+        element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex);
+
+    for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) {
+      func(elementIdx);
+    }
+  }
+
+  /*
+   * Overload for elementIdxShift = 0
+   */
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_block(const TAcc& acc,
+                                               const Idx maxNumberOfElements,
+                                               const Func func,
+                                               const unsigned int dimIndex = 0) {
+    const Idx elementIdxShift = 0;
+    for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex);
+  }
+
+  /**************************************************************
+   *          LOOP ON ALL ELEMENTS WITH ONE LOOP
+   **************************************************************/
+
+  /*
+   * Case where the input index i has reached the end of threadDimension: strides the input index.
+   * Otherwise: do nothing.
+   * NB 1: This helper function is used as a trick to only have one loop (like in legacy), instead of 2 loops
+   * (like in all the other Alpaka helpers, 'for_each_element_in_block_strided' for example,
+   * because of the additional loop over elements in the Alpaka model).
+   * This allows keeping the 'continue' and 'break' statements as-is from legacy code,
+   * and hence avoids a lot of legacy code reshuffling.
+   * NB 2: Modifies i, firstElementIdx and endElementIdx.
+   */
+  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided(
+      Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) {
+    bool isNextStrideElementValid = true;
+    if (i == endElementIdx) {
+      firstElementIdx += stride;
+      endElementIdx += stride;
+      i = firstElementIdx;
+      if (i >= maxNumberOfElements) {
+        isNextStrideElementValid = false;
+      }
+    }
+    return isNextStrideElementValid;
+  }
+
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc,
+                                                       const Idx maxNumberOfElements,
+                                                       const Idx elementIdxShift,
+                                                       const Func func,
+                                                       const unsigned int dimIndex = 0) {
+    // Get thread / element indices in block.
+    const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
+        element_index_range_in_block(acc, elementIdxShift, dimIndex);
+
+    // Stride = block size.
+    const Idx blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndex]);
+
+    // Strided access.
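+    // each thread owns a chunk of threadDimension consecutive elements and
+    // advances by the whole block size until it runs past maxNumberOfElements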
+    for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride;
+         threadIdx < maxNumberOfElements;
+         threadIdx += blockDimension, endElementIdx += blockDimension) {
+      // (CPU) Loop on all elements.
+      if (endElementIdx > maxNumberOfElements) {
+        endElementIdx = maxNumberOfElements;
+      }
+      for (Idx i = threadIdx; i < endElementIdx; ++i) {
+        func(i);
+      }
+    }
+  }
+
+  /*
+   * Overload for elementIdxShift = 0
+   */
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_block_strided(const TAcc& acc,
+                                                       const Idx maxNumberOfElements,
+                                                       const Func func,
+                                                       const unsigned int dimIndex = 0) {
+    const Idx elementIdxShift = 0;
+    for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex);
+  }
+
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc,
+                                                      const Idx maxNumberOfElements,
+                                                      const Idx elementIdxShift,
+                                                      const Func func,
+                                                      const unsigned int dimIndex = 0) {
+    // Get thread / element indices in block.
+    const auto& [firstElementIdxNoStride, endElementIdxNoStride] =
+        element_index_range_in_grid(acc, elementIdxShift, dimIndex);
+
+    // Stride = grid size.
+    const Idx gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[dimIndex]);
+
+    // Strided access.
+    for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride;
+         threadIdx < maxNumberOfElements;
+         threadIdx += gridDimension, endElementIdx += gridDimension) {
+      // (CPU) Loop on all elements.
+      if (endElementIdx > maxNumberOfElements) {
+        endElementIdx = maxNumberOfElements;
+      }
+      for (Idx i = threadIdx; i < endElementIdx; ++i) {
+        func(i);
+      }
+    }
+  }
+
+  /*
+   * Overload for elementIdxShift = 0
+   */
+  template <typename TAcc, typename Func>
+  ALPAKA_FN_ACC void for_each_element_in_grid_strided(const TAcc& acc,
+                                                      const Idx maxNumberOfElements,
+                                                      const Func func,
+                                                      const unsigned int dimIndex = 0) {
+    const Idx elementIdxShift = 0;
+    for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex);
+  }
+
 }  // namespace cms::alpakatools
 
 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index a9bb5a65b3987..4e53313f248c8 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -11,3 +11,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc
new file mode 100644
index 0000000000000..27734cac8ed49
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testAtomicPairCounter.dev.cc
@@ -0,0 +1,110 @@
+#include <cstdint>
+#include <iostream>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
+
+using namespace cms::alpakatools;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair) "]";
+
+struct update {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(
+      const TAcc &acc, AtomicPairCounter *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t i) {
+      auto m = i % 11;
+      m = m % 6 + 1;  // max 6, no 0
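+      // reserve m slots in one atomic operation: c.n is the first free slot
+      // in cont, c.m is the sequential index of this insertion in ind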
+      auto c = dc->add(acc, m);
+      assert(c.m < n);
+      ind[c.m] = c.n;
+      for (uint32_t j = c.n; j < c.n + m; ++j)
+        cont[j] = i;
+    });
+  }
+};
+
+struct finalize {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(
+      const TAcc &acc, AtomicPairCounter const *dc, uint32_t *ind, uint32_t *cont, uint32_t n) const {
+    assert(dc->get().m == n);
+    ind[n] = dc->get().n;
+  }
+};
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestAtomicPair), s_tag) {
+  SECTION("AtomicPairCounter") {
+    auto const &devices = cms::alpakatools::devices<Platform>();
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      return;
+    }
+    // run the test on each device
+    for (auto const &device : devices) {
+      std::cout << "Test AtomicPairCounter on " << alpaka::getName(device) << '\n';
+      auto queue = Queue(device);
+
+      auto c_d = make_device_buffer<AtomicPairCounter>(queue);
+      alpaka::memset(queue, c_d, 0);
+
+      std::cout << "- size " << sizeof(AtomicPairCounter) << std::endl;
+
+      constexpr uint32_t N = 20000;
+      constexpr uint32_t M = N * 6;
+      auto n_d = make_device_buffer<uint32_t[]>(queue, N);
+      auto m_d = make_device_buffer<uint32_t[]>(queue, M);
+
+      constexpr uint32_t NUM_VALUES = 10000;
+
+      // Update
+      const auto blocksPerGrid = 2000u;
+      const auto threadsPerBlockOrElementsPerThread = 512u;
+      const auto workDiv = make_workdiv<Acc1D>(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+      alpaka::enqueue(
+          queue, alpaka::createTaskKernel<Acc1D>(workDiv, update(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES));
+
+      // Finalize
+      const auto blocksPerGridFinalize = 1u;
+      const auto threadsPerBlockOrElementsPerThreadFinalize = 1u;
+      const auto workDivFinalize =
+          make_workdiv<Acc1D>(blocksPerGridFinalize, threadsPerBlockOrElementsPerThreadFinalize);
+      alpaka::enqueue(
+          queue,
+          alpaka::createTaskKernel<Acc1D>(workDivFinalize, finalize(), c_d.data(), n_d.data(), m_d.data(), NUM_VALUES));
+
+      auto c_h = make_host_buffer<AtomicPairCounter>(queue);
+      auto n_h = make_host_buffer<uint32_t[]>(queue, N);
+      auto m_h = make_host_buffer<uint32_t[]>(queue, M);
+
+      // copy the results from the device to the host
+      alpaka::memcpy(queue, c_h, c_d);
+      alpaka::memcpy(queue, n_h, n_d);
+      alpaka::memcpy(queue, m_h, m_d);
+
+      // wait for all the operations to complete
+      alpaka::wait(queue);
+
+      REQUIRE(c_h.data()->get().m == NUM_VALUES);
+      REQUIRE(n_h[NUM_VALUES] == c_h.data()->get().n);
+      REQUIRE(n_h[0] == 0);
+
+      for (size_t i = 0; i < NUM_VALUES; ++i) {
+        auto ib = n_h.data()[i];
+        auto ie = n_h.data()[i + 1];
+        auto k = m_h.data()[ib++];
+        REQUIRE(k < NUM_VALUES);
+
+        for (; ib < ie; ++ib)
+          REQUIRE(m_h.data()[ib] == k);
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc
new file mode 100644
index 0000000000000..0df2be53d92c7
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testHistoContainer.dev.cc
@@ -0,0 +1,195 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+using namespace cms::alpakatools;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer) "]";
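+
+// fill a multi-histogram (nParts sub-histograms) from one vector on the device
+// and cross-check every bin's contents on the host against the original values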
+template <typename T>
+int go(const DevHost& host, const Device& device, Queue& queue) {
+  std::mt19937 eng(2708);
+  std::uniform_int_distribution<T> rgen(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+
+  constexpr unsigned int N = 12000;
+  auto v = make_host_buffer<T[]>(queue, N);
+  auto v_d = make_device_buffer<T[]>(queue, N);
+  alpaka::memcpy(queue, v_d, v);
+
+  constexpr uint32_t nParts = 10;
+  constexpr uint32_t partSize = N / nParts;
+
+  using Hist = HistoContainer<T, 128, N, 8 * sizeof(T), uint32_t, nParts>;
+  std::cout << "HistoContainer " << (int)(offsetof(Hist, off)) << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' '
+            << Hist::capacity() << ' ' << offsetof(Hist, bins) - offsetof(Hist, off) << ' '
+            << (std::numeric_limits<T>::max() - std::numeric_limits<T>::min()) / Hist::nbins() << std::endl;
+
+  auto offsets = make_host_buffer<uint32_t[]>(queue, nParts + 1);
+  auto offsets_d = make_device_buffer<uint32_t[]>(queue, nParts + 1);
+
+  auto h = make_host_buffer<Hist>(queue);
+  auto h_d = make_device_buffer<Hist>(queue);
+
+  for (int it = 0; it < 5; ++it) {
+    offsets[0] = 0;
+    for (uint32_t j = 1; j < nParts + 1; ++j) {
+      offsets[j] = offsets[j - 1] + partSize - 3 * j;
+      assert(offsets[j] <= N);
+    }
+
+    if (it == 1) {  // special cases...
+      offsets[0] = 0;
+      offsets[1] = 0;
+      offsets[2] = 19;
+      offsets[3] = 32 + offsets[2];
+      offsets[4] = 123 + offsets[3];
+      offsets[5] = 256 + offsets[4];
+      offsets[6] = 311 + offsets[5];
+      offsets[7] = 2111 + offsets[6];
+      offsets[8] = 256 * 11 + offsets[7];
+      offsets[9] = 44 + offsets[8];
+      offsets[10] = 3297 + offsets[9];
+    }
+
+    alpaka::memcpy(queue, offsets_d, offsets);
+
+    for (long long j = 0; j < N; j++)
+      v[j] = rgen(eng);
+
+    if (it == 2) {  // big bin
+      for (long long j = 1000; j < 2000; j++)
+        v[j] = sizeof(T) == 1 ? 22 : 3456;
+    }
+
+    // for (unsigned int i = 0; i < N; i++) {
+    //   std::cout << i << "->" << v[i] << std::endl;
+    // }
+    alpaka::memcpy(queue, v_d, v);
+
+    alpaka::memset(queue, h_d, 0);
+
+    alpaka::wait(queue);
+
+    std::cout << "Calling fillManyFromVector - " << h->size() << std::endl;
+    fillManyFromVector<Acc1D>(h_d.data(), nParts, v_d.data(), offsets_d.data(), offsets[10], 256, queue);
+
+    alpaka::memcpy(queue, h, h_d);
+    // std::cout << "Calling fillManyFromVector - " << h->size() << std::endl;
+    alpaka::wait(queue);
+    std::cout << "Copied results" << std::endl;
+    // for (int i = 0; i <= 10; i++) {
+    //   std::cout << offsets[i] << " - " << h->size() << std::endl;
+    // }
+
+    assert(0 == h->off[0]);
+    assert(offsets[10] == h->size());
+
+    auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) {
+      assert(t1 < N);
+      assert(t2 < N);
+      if (T(v[t1] - v[t2]) <= 0)
+        std::cout << "for " << i << ':' << v[k] << " failed " << v[t1] << ' ' << v[t2] << std::endl;
+    };
+
+    auto incr = [](auto& k) { return k = (k + 1) % Hist::nbins(); };
+
+    // make sure it spans 3 bins...
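+    // with 128 bins the bin width is (max - min) / 128, i.e. 512 for int16_t,
+    // so a +-1300 window around a value spans a few bins on each side (for
+    // 8-bit T the T(1300) cast wraps around to a much narrower window)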
+    auto window = T(1300);
+
+    for (uint32_t j = 0; j < nParts; ++j) {
+      auto off = Hist::histOff(j);
+      for (uint32_t i = 0; i < Hist::nbins(); ++i) {
+        auto ii = i + off;
+        if (0 == h->size(ii))
+          continue;
+        auto k = *h->begin(ii);
+        if (j % 2)
+          k = *(h->begin(ii) + (h->end(ii) - h->begin(ii)) / 2);
+#ifndef NDEBUG
+        auto bk = h->bin(v[k]);
+#endif
+        assert(bk == i);
+        assert(k < offsets[j + 1]);
+        auto kl = h->bin(v[k] - window);
+        auto kh = h->bin(v[k] + window);
+        assert(kl != i);
+        assert(kh != i);
+        // std::cout << kl << ' ' << kh << std::endl;
+
+        auto me = v[k];
+        auto tot = 0;
+        auto nm = 0;
+        bool l = true;
+        auto khh = kh;
+        incr(khh);
+        for (auto kk = kl; kk != khh; incr(kk)) {
+          if (kk != kl && kk != kh)
+            nm += h->size(kk + off);
+          for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p) {
+            if (std::min(std::abs(T(v[*p] - me)), std::abs(T(me - v[*p]))) > window) {
+            } else {
+              ++tot;
+            }
+          }
+          if (kk == i) {
+            l = false;
+            continue;
+          }
+          if (l)
+            for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p)
+              verify(i, k, k, (*p));
+          else
+            for (auto p = h->begin(kk + off); p < h->end(kk + off); ++p)
+              verify(i, k, (*p), k);
+        }
+        if (!(tot >= nm)) {
+          std::cout << "too bad " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/'
+                    << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm
+                    << std::endl;
+        }
+        if (l)
+          std::cout << "what? " << j << ' ' << i << ' ' << int(me) << '/' << (int)T(me - window) << '/'
+                    << (int)T(me + window) << ": " << kl << '/' << kh << ' ' << khh << ' ' << tot << '/' << nm
+                    << std::endl;
+        assert(!l);
+      }
+    }
+  }
+
+  return 0;
+}
+
+TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestHistoContainer), s_tag) {
+  SECTION("HistoContainerKernel") {
+    // get the list of devices on the current platform
+    auto const& devices = cms::alpakatools::devices<Platform>();
+    auto const& host = cms::alpakatools::host();
+    if (devices.empty()) {
+      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+                << ", the test will be skipped.\n";
+      return;
+    }
+    // run the test on each device
+    for (auto const& device : devices) {
+      std::cout << "Test Histo Container on " << alpaka::getName(device) << '\n';
+      auto queue = Queue(device);
+
+      REQUIRE(go<int16_t>(host, device, queue) == 0);
+      REQUIRE(go<int8_t>(host, device, queue) == 0);
+    }
+  }
+}
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc
new file mode 100644
index 0000000000000..5df8b2ae01701
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneHistoContainer.dev.cc
@@ -0,0 +1,178 @@
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <random>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+using namespace cms::alpakatools;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename T, int NBINS, int S, int DELTA>
+struct mykernel {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, T const* __restrict__ v, uint32_t N) const {
+    assert(v);
+    assert(N == 12000);
+
+    const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+    if (threadIdxLocal == 0) {
+      printf("start kernel for %d data\n", N);
+    }
+
+    using Hist = HistoContainer<T, NBINS, 12000, S>;
+
+    auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
+    auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
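+    // the whole histogram and the prefix-scan scratch live in block shared
+    // memory: this kernel is meant to run as a single block (see go() below)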
+
+    // set off zero
+    for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.off[j] = 0; });
+    alpaka::syncBlockThreads(acc);
+
+    // set bins zero
+    for_each_element_in_block_strided(acc, Hist::totbins(), [&](uint32_t j) { hist.bins[j] = 0; });
+    alpaka::syncBlockThreads(acc);
+
+    // count
+    for_each_element_in_block_strided(acc, N, [&](uint32_t j) { hist.count(acc, v[j]); });
+    alpaka::syncBlockThreads(acc);
+
+    assert(0 == hist.size());
+    alpaka::syncBlockThreads(acc);
+
+    // finalize
+    hist.finalize(acc, ws);
+    alpaka::syncBlockThreads(acc);
+
+    assert(N == hist.size());
+
+    // verify
+    for_each_element_in_block_strided(acc, Hist::nbins(), [&](uint32_t j) { assert(hist.off[j] <= hist.off[j + 1]); });
+    alpaka::syncBlockThreads(acc);
+
+    for_each_element_in_block(acc, 32, [&](uint32_t i) {
+      ws[i] = 0;  // used by prefix scan...
+    });
+    alpaka::syncBlockThreads(acc);
+
+    // fill
+    for_each_element_in_block_strided(acc, N, [&](uint32_t j) { hist.fill(acc, v[j], j); });
+    alpaka::syncBlockThreads(acc);
+
+    assert(0 == hist.off[0]);
+    assert(N == hist.size());
+
+    // bin
+#ifndef NDEBUG
+    for_each_element_in_block_strided(acc, hist.size() - 1, [&](uint32_t j) {
+      auto p = hist.begin() + j;
+      assert((*p) < N);
+      auto k1 = Hist::bin(v[*p]);
+      auto k2 = Hist::bin(v[*(p + 1)]);
+      assert(k2 >= k1);
+    });
+#endif
+
+    // forEachInWindow
+    for_each_element_in_block_strided(acc, hist.size(), [&](uint32_t i) {
+      auto p = hist.begin() + i;
+      auto j = *p;
+#ifndef NDEBUG
+      auto b0 = Hist::bin(v[j]);
+#endif
+      int tot = 0;
+      auto ftest = [&](unsigned int k) {
+        assert(k < N);
+        ++tot;
+      };
+      forEachInWindow(hist, v[j], v[j], ftest);
+#ifndef NDEBUG
+      int rtot = hist.size(b0);
+      assert(tot == rtot);
+#endif
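+      // now count the entries that fall in a +-DELTA window around v[j] and
+      // cross-check against the difference of the surrounding bin offsets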
NBINS * 2 - 1 : std::numeric_limits<T>::max();
+      vm = std::max(vm, 0);
+      vm = std::min(vm, vmax);
+      vp = std::min(vp, vmax);
+      vp = std::max(vp, 0);
+      assert(vp >= vm);
+      forEachInWindow(hist, vm, vp, ftest);
+#ifndef NDEBUG
+      int bp = Hist::bin(vp);
+      int bm = Hist::bin(vm);
+      rtot = hist.end(bp) - hist.begin(bm);
+      assert(tot == rtot);
+#endif
+    });
+  }
+};
+
+template <typename T, int NBINS, int S, int DELTA>
+void go(const DevHost& host, const Device& device, Queue& queue) {
+  std::mt19937 eng;
+
+  int rmin = std::numeric_limits<T>::min();
+  int rmax = std::numeric_limits<T>::max();
+  if (NBINS != 128) {
+    rmin = 0;
+    rmax = NBINS * 2 - 1;
+  }
+
+  std::uniform_int_distribution rgen(rmin, rmax);
+  constexpr unsigned int N = 12000;
+
+  using Hist = HistoContainer;
+  std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' '
+            << (rmax - rmin) / Hist::nbins() << std::endl;
+  std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl;
+
+  auto v = make_host_buffer<T[]>(queue, N);
+  auto v_d = make_device_buffer<T[]>(queue, N);
+
+  for (int it = 0; it < 5; ++it) {
+    for (long long j = 0; j < N; j++)
+      v[j] = rgen(eng);
+    if (it == 2)
+      for (long long j = N / 2; j < N / 2 + N / 4; j++)
+        v[j] = 4;
+
+    alpaka::memcpy(queue, v_d, v);
+
+    const auto threadsPerBlockOrElementsPerThread = 256u;
+    const auto blocksPerGrid = 1u;
+    const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, mykernel(), v_d.data(), N));
+  }
+  alpaka::wait(queue);
+}
+
+int main() {
+  auto const& devices = cms::alpakatools::devices();
+  if (devices.empty()) {
+    std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+              << ", the test will be skipped.\n";
+    return 0;
+  }
+
+  auto const& host = cms::alpakatools::host();
+
+  // run the test on each device
+  for (auto const& device : devices) {
+    std::cout << "Test One Histo Container on " << alpaka::getName(device) << '\n';
+
+    auto queue = Queue(device);
+
+    go(host, device, queue);
+    go(host, device, queue);
+    go(host, device, queue);
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc
new file mode 100644
index 0000000000000..41b6f8a477b66
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testOneToManyAssoc.dev.cc
@@ -0,0 +1,310 @@
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <random>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+constexpr uint32_t MaxElem = 64000;
+constexpr uint32_t MaxTk = 8000;
+constexpr uint32_t MaxAssocs = 4 * MaxTk;
+
+using namespace cms::alpakatools;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+using Assoc = OneToManyAssoc;
+using SmallAssoc = OneToManyAssoc;
+using Multiplicity = OneToManyAssoc;
+using TK = std::array<uint16_t, 4>;
+
+struct countMultiLocal {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                TK const* __restrict__ tk,
+                                Multiplicity* __restrict__ assoc,
+                                uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t i) {
+      auto& local = alpaka::declareSharedVar(acc);
+      const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]);
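+      // Block-local accumulation pattern: one thread per block zeroes the
+      // shared-memory Multiplicity, every thread of the block counts into it,
+      // and one thread merges the block's partial histogram into the global one
+      // via assoc->add(acc, local). Compared with countMulti below, which issues
+      // one global atomic per value, this costs two block-wide barriers but
+      // touches global memory only once per (block, bin) pair.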
const bool oncePerSharedMemoryAccess = (threadIdxLocal == 0);
+      if (oncePerSharedMemoryAccess) {
+        local.zero();
+      }
+      alpaka::syncBlockThreads(acc);
+      local.countDirect(acc, 2 + i % 4);
+      alpaka::syncBlockThreads(acc);
+      if (oncePerSharedMemoryAccess) {
+        assoc->add(acc, local);
+      }
+    });
+  }
+};
+
+struct countMulti {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                TK const* __restrict__ tk,
+                                Multiplicity* __restrict__ assoc,
+                                uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t i) { assoc->countDirect(acc, 2 + i % 4); });
+  }
+};
+
+struct verifyMulti {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, Multiplicity* __restrict__ m1, Multiplicity* __restrict__ m2) const {
+    for_each_element_in_grid_strided(
+        acc, Multiplicity::totbins(), [&](uint32_t i) { assert(m1->off[i] == m2->off[i]); });
+  }
+};
+
+struct count {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                TK const* __restrict__ tk,
+                                Assoc* __restrict__ assoc,
+                                uint32_t n) const {
+    for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) {
+      auto k = i / 4;
+      auto j = i - 4 * k;
+      assert(j < 4);
+      if (k >= n) {
+        return;
+      }
+      if (tk[k][j] < MaxElem) {
+        assoc->countDirect(acc, tk[k][j]);
+      }
+    });
+  }
+};
+
+struct fill {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                TK const* __restrict__ tk,
+                                Assoc* __restrict__ assoc,
+                                uint32_t n) const {
+    for_each_element_in_grid_strided(acc, 4 * n, [&](uint32_t i) {
+      auto k = i / 4;
+      auto j = i - 4 * k;
+      assert(j < 4);
+      if (k >= n) {
+        return;
+      }
+      if (tk[k][j] < MaxElem) {
+        assoc->fillDirect(acc, tk[k][j], k);
+      }
+    });
+  }
+};
+
+struct verify {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc* __restrict__ assoc) const {
+    assert(assoc->size() < Assoc::capacity());
+  }
+};
+
+struct fillBulk {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(
+      const TAcc& acc, AtomicPairCounter* apc, TK const* __restrict__ tk, Assoc* __restrict__ assoc, uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t k) {
+      auto m = tk[k][3] < MaxElem ? 4 : 3;
+      assoc->bulkFill(acc, *apc, &tk[k][0], m);
+    });
+  }
+};
+
+struct verifyBulk {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, Assoc const* __restrict__ assoc, AtomicPairCounter const* apc) const {
+    if (apc->get().m >= Assoc::nbins()) {
+      printf("Overflow %u %u\n", apc->get().m, Assoc::nbins());
+    }
+    assert(assoc->size() < Assoc::capacity());
+  }
+};
+
+int main() {
+  // get the list of devices on the current platform
+  auto const& devices = cms::alpakatools::devices();
+  if (devices.empty()) {
+    std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+              << ", the test will be skipped.\n";
+    return 0;
+  }
+
+  // run the test on each device
+  for (auto const& device : devices) {
+    Queue queue(device);
+
+    std::cout << "OneToManyAssoc " << sizeof(Assoc) << ' ' << Assoc::nbins() << ' ' << Assoc::capacity() << std::endl;
+    std::cout << "OneToManyAssoc (small) " << sizeof(SmallAssoc) << ' ' << SmallAssoc::nbins() << ' '
+              << SmallAssoc::capacity() << std::endl;
+
+    std::mt19937 eng;
+    std::geometric_distribution rdm(0.8);
+
+    constexpr uint32_t N = 4000;
+
+    auto tr = make_host_buffer<TK[]>(queue, N);
+    // fill with "index" to element
+    long long ave = 0;
+    int imax = 0;
+    auto n = 0U;
+    auto z = 0U;
+    auto nz = 0U;
+    for (auto i = 0U; i < 4U; ++i) {
+      auto j = 0U;
+      while (j < N && n < MaxElem) {
+        if (z == 11) {
+          ++n;
+          z = 0;
+          ++nz;
+          continue;
+        }  // a bit of not assoc
+        auto x = rdm(eng);
+        auto k = std::min(j + x + 1, N);
+        if (i == 3 && z == 3) {  // some triplets time to time
+          for (; j < k; ++j)
+            tr[j][i] = MaxElem + 1;
+        } else {
+          ave += x + 1;
+          imax = std::max(imax, x);
+          for (; j < k; ++j)
+            tr[j][i] = n;
+          ++n;
+        }
+        ++z;
+      }
+      assert(n <= MaxElem);
+      assert(j <= N);
+    }
+    std::cout << "filled with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << nz << std::endl;
+
+    auto v_d = make_device_buffer<TK[]>(queue, N);
+    alpaka::memcpy(queue, v_d, tr);
+
+    auto a_d = make_device_buffer<Assoc>(queue);
+    alpaka::memset(queue, a_d, 0);
+
+    const auto threadsPerBlockOrElementsPerThread = 256u;
+    const auto blocksPerGrid4N = divide_up_by(4 * N, threadsPerBlockOrElementsPerThread);
+    const auto workDiv4N = make_workdiv(blocksPerGrid4N, threadsPerBlockOrElementsPerThread);
+
+    launchZero(a_d.data(), queue);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, count(), v_d.data(), a_d.data(), N));
+
+    launchFinalize(a_d.data(), queue);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verify(), a_d.data()));
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, fill(), v_d.data(), a_d.data(), N));
+
+    auto la = make_host_buffer<Assoc>(queue);
+    alpaka::memcpy(queue, la, a_d);
+    alpaka::wait(queue);
+
+    std::cout << la->size() << std::endl;
+    imax = 0;
+    ave = 0;
+    z = 0;
+    for (auto i = 0U; i < n; ++i) {
+      auto x = la->size(i);
+      if (x == 0) {
+        z++;
+        continue;
+      }
+      ave += x;
+      imax = std::max(imax, int(x));
+    }
+    assert(0 == la->size(n));
+    std::cout << "found with " << n << " elements " << double(ave) / n << ' ' << imax << ' ' << z << std::endl;
+
+    // now the inverse map (actually this is the direct....)
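+    // How bulk filling works, roughly: each fillBulk() call performs a single
+    // 64-bit atomic add on the AtomicPairCounter, whose two 32-bit halves hand
+    // out one fresh row together with a contiguous range of m content slots in
+    // one operation (see AtomicPairCounter::add); finalizeBulk() then turns the
+    // per-row sizes into cumulative offsets.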
+    auto dc_d = make_device_buffer<AtomicPairCounter>(queue);
+    alpaka::memset(queue, dc_d, 0);
+
+    const auto blocksPerGrid = divide_up_by(N, threadsPerBlockOrElementsPerThread);
+    const auto workDiv = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+
+    alpaka::enqueue(queue,
+                    alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), a_d.data(), N));
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, finalizeBulk(), dc_d.data(), a_d.data()));
+
+    alpaka::enqueue(queue,
+                    alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), a_d.data(), dc_d.data()));
+
+    alpaka::memcpy(queue, la, a_d);
+
+    auto dc = make_host_buffer<AtomicPairCounter>(queue);
+    alpaka::memcpy(queue, dc, dc_d);
+    alpaka::wait(queue);
+
+    alpaka::memset(queue, dc_d, 0);
+    auto sa_d = make_device_buffer<SmallAssoc>(queue);
+    alpaka::memset(queue, sa_d, 0);
+
+    alpaka::enqueue(queue,
+                    alpaka::createTaskKernel(workDiv, fillBulk(), dc_d.data(), v_d.data(), sa_d.data(), N));
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, finalizeBulk(), dc_d.data(), sa_d.data()));
+
+    alpaka::enqueue(queue,
+                    alpaka::createTaskKernel(WorkDiv1D{1u, 1u, 1u}, verifyBulk(), sa_d.data(), dc_d.data()));
+
+    std::cout << "final counter value " << dc->get().n << ' ' << dc->get().m << std::endl;
+
+    std::cout << la->size() << std::endl;
+    imax = 0;
+    ave = 0;
+    for (auto i = 0U; i < N; ++i) {
+      auto x = la->size(i);
+      if (!(x == 4 || x == 3)) {
+        std::cout << i << ' ' << x << std::endl;
+      }
+      assert(x == 4 || x == 3);
+      ave += x;
+      imax = std::max(imax, int(x));
+    }
+    assert(0 == la->size(N));
+    std::cout << "found with ave occupancy " << double(ave) / N << ' ' << imax << std::endl;
+
+    // here verify use of block local counters
+    auto m1_d = make_device_buffer<Multiplicity>(queue);
+    alpaka::memset(queue, m1_d, 0);
+    auto m2_d = make_device_buffer<Multiplicity>(queue);
+    alpaka::memset(queue, m2_d, 0);
+
+    launchZero(m1_d.data(), queue);
+    launchZero(m2_d.data(), queue);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMulti(), v_d.data(), m1_d.data(), N));
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv4N, countMultiLocal(), v_d.data(), m2_d.data(), N));
+
+    const auto blocksPerGridTotBins = 1u;
+    const auto threadsPerBlockOrElementsPerThreadTotBins = Multiplicity::totbins();
+    const auto workDivTotBins = make_workdiv(blocksPerGridTotBins, threadsPerBlockOrElementsPerThreadTotBins);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
+
+    launchFinalize(m1_d.data(), queue);
+    launchFinalize(m2_d.data(), queue);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDivTotBins, verifyMulti(), m1_d.data(), m2_d.data()));
+
+    alpaka::wait(queue);
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc
new file mode 100644
index 0000000000000..b5431d19a6645
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc
@@ -0,0 +1,209 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
+
+using namespace cms::alpakatools;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+// static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestPrefixScan) "]";
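+
+// The kernels below exercise the three prefix-scan flavours provided by
+// prefixScan.h: a warp-level scan (GPU backends only), a single-block scan,
+// and the two-step multi-block scan, in which a first kernel scans each
+// block's chunk and a second, single-block kernel folds in the per-block sums.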
+
+template <typename T>
+struct format_traits {
+public:
+  static const constexpr char* failed_msg = "failed %d %d %d: %d %d\n";
+};
+
+template <>
+struct format_traits<float> {
+public:
+  static const constexpr char* failed_msg = "failed %d %d %d: %f %f\n";
+};
+
+template <typename T>
+struct testPrefixScan {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, unsigned int size) const {
+    auto& ws = alpaka::declareSharedVar(acc);
+    auto& c = alpaka::declareSharedVar(acc);
+    auto& co = alpaka::declareSharedVar(acc);
+
+    for_each_element_in_block_strided(acc, size, [&](uint32_t i) { c[i] = 1; });
+
+    alpaka::syncBlockThreads(acc);
+
+    blockPrefixScan(acc, c, co, size, ws);
+    blockPrefixScan(acc, c, size, ws);
+
+    assert(1 == c[0]);
+    assert(1 == co[0]);
+
+    for_each_element_in_block_strided(acc, size, 1u, [&](uint32_t i) {
+      assert(c[i] == c[i - 1] + 1);
+      assert(c[i] == i + 1);
+      assert(c[i] == co[i]);
+    });
+  }
+};
+
+/*
+ * NB: GPU-only, so do not care about elements here.
+ */
+template <typename T>
+struct testWarpPrefixScan {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const {
+#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) && defined(__CUDA_ARCH__) || \
+    defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) && defined(__HIP_DEVICE_COMPILE__)
+    assert(size <= 32);
+    auto& c = alpaka::declareSharedVar(acc);
+    auto& co = alpaka::declareSharedVar(acc);
+
+    uint32_t const blockDimension = alpaka::getWorkDiv(acc)[0u];
+    uint32_t const blockThreadIdx = alpaka::getIdx(acc)[0u];
+    auto i = blockThreadIdx;
+    c[i] = 1;
+    alpaka::syncBlockThreads(acc);
+    auto laneId = blockThreadIdx & 0x1f;
+
+    warpPrefixScan(laneId, c, co, i, 0xffffffff);
+    warpPrefixScan(laneId, c, i, 0xffffffff);
+
+    alpaka::syncBlockThreads(acc);
+
+    assert(1 == c[0]);
+    assert(1 == co[0]);
+    if (i != 0) {
+      if (c[i] != c[i - 1] + 1)
+        printf(format_traits<T>::failed_msg, size, i, blockDimension, c[i], c[i - 1]);
+      assert(c[i] == c[i - 1] + 1);
+      assert(c[i] == static_cast<T>(i + 1));
+      assert(c[i] == co[i]);
+    }
+#endif
+  }
+};
+
+struct init {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t* v, uint32_t val, uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t index) {
+      v[index] = val;
+
+      if (index == 0)
+        printf("init\n");
+    });
+  }
+};
+
+struct verify {
+  template <typename TAcc>
+  ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t const* v, uint32_t n) const {
+    for_each_element_in_grid_strided(acc, n, [&](uint32_t index) {
+      assert(v[index] == index + 1);
+
+      if (index == 0)
+        printf("verify\n");
+    });
+  }
+};
+
+int main() {
+  // get the list of devices on the current platform
+  auto const& devices = cms::alpakatools::devices();
+  // auto const& host = cms::alpakatools::host();
+
+  if (devices.empty()) {
+    std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
+              << ", the test will be skipped.\n";
+    return 0;
+  }
+
+  for (auto const& device : devices) {
+    std::cout << "Test prefix scan on " << alpaka::getName(device) << '\n';
+    auto queue = Queue(device);
+
+    // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY)
+#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND)
+    std::cout << "warp level" << std::endl;
+
+    const auto threadsPerBlockOrElementsPerThread = 32;
+    const auto blocksPerGrid = 1;
+    const auto workDivWarp = make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
+
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 32));
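+    // 32 covers a full warp; the smaller sizes below (16, 5) probe the scan
+    // when only part of the warp holds valid data. warpPrefixScan is expected
+    // to be a shuffle-based intra-warp scan, hence the full-warp 0xffffffff
+    // lane mask passed in the kernel above.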
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 16));
+    alpaka::enqueue(queue, alpaka::createTaskKernel(workDivWarp, testWarpPrefixScan(), 5));
+#endif
+
+    // PORTABLE BLOCK PREFIXSCAN
+    std::cout << "block level" << std::endl;
+
+    // Running kernel with 1 block, and bs threads per block or elements per thread.
+    // NB: for tests only; for performance the GPU version would need bs = 1024.
+    for (int bs = 32; bs <= 1024; bs += 32) {
+      const auto blocksPerGrid2 = 1;
+      const auto workDivSingleBlock = make_workdiv(blocksPerGrid2, bs);
+
+      std::cout << "blocks per grid: " << blocksPerGrid2 << ", threads per block or elements per thread: " << bs
+                << std::endl;
+
+      // Problem size
+      for (int j = 1; j <= 1024; ++j) {
+        alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan<uint16_t>(), j));
+        alpaka::enqueue(queue, alpaka::createTaskKernel(workDivSingleBlock, testPrefixScan<float>(), j));
+      }
+    }
+
+    // PORTABLE MULTI-BLOCK PREFIXSCAN
+    int num_items = 200;
+    for (int ksize = 1; ksize < 4; ++ksize) {
+      std::cout << "multiblock" << std::endl;
+      num_items *= 10;
+
+      auto input_d = make_device_buffer<uint32_t[]>(queue, num_items);
+      auto output1_d = make_device_buffer<uint32_t[]>(queue, num_items);
+
+      const auto nThreadsInit = 256;  // NB: 1024 would be better
+      const auto nBlocksInit = divide_up_by(num_items, nThreadsInit);
+      const auto workDivMultiBlockInit = make_workdiv(nBlocksInit, nThreadsInit);
+
+      alpaka::enqueue(queue,
+                      alpaka::createTaskKernel(workDivMultiBlockInit, init(), input_d.data(), 1, num_items));
+
+      const auto nThreads = 1024;
+      const auto nBlocks = divide_up_by(num_items, nThreads);
+      const auto workDivMultiBlock = make_workdiv(nBlocks, nThreads);
+
+      std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
+      alpaka::enqueue(queue,
+                      alpaka::createTaskKernel(workDivMultiBlock,
+                                               multiBlockPrefixScanFirstStep(),
+                                               input_d.data(),
+                                               output1_d.data(),
+                                               num_items));
+
+      const auto blocksPerGridSecondStep = 1;
+      const auto workDivMultiBlockSecondStep = make_workdiv(blocksPerGridSecondStep, nThreads);
+      alpaka::enqueue(queue,
+                      alpaka::createTaskKernel(workDivMultiBlockSecondStep,
+                                               multiBlockPrefixScanSecondStep(),
+                                               input_d.data(),
+                                               output1_d.data(),
+                                               num_items,
+                                               nBlocks));
+
+      alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d.data(), num_items));
+
+      alpaka::wait(queue);  // input_d and output1_d end of scope
+    }  // ksize
+  }
+
+  return 0;
+}
\ No newline at end of file