@@ -34,6 +34,7 @@ class TrackingRecHit2DHeterogeneous {
auto hitsModuleStart() const { return m_hitsModuleStart; }
auto hitsLayerStart() { return m_hitsLayerStart; }
auto phiBinner() { return m_hist; }
auto binnerStorage() { return m_histStorage; }
auto iphi() { return m_iphi; }

// only the local coord and detector index
@@ -42,8 +43,9 @@ class TrackingRecHit2DHeterogeneous {
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

private:
// number of elements of size 16 and 32 respectively
static constexpr uint32_t n16 = 4;
static constexpr uint32_t n32 = 9;
static constexpr uint32_t n32 = 10;
Comment on lines 47 to +48

As suggested by @slava77 here, could you add a comment about what 4 and 9 (now 10) stand for?

@VinInn (Author), Dec 15, 2020:

should be pretty obvious from the code where it is used.
They are of course the number of elements with size 16 and size 32 respectively.
I will add a comment.
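
A minimal sketch of the bookkeeping under discussion, not part of the diff: n16 and n32 count the 16-bit and 32-bit SoA columns packed into the two pooled buffers, and column i of a buffer starts at offset i * nHits, which is exactly what the get16/get32 helpers in the constructor below compute. The function and variable names here are hypothetical.

// illustrative only, not part of the PR
#include <cstdint>
#include <vector>

void sliceExample(uint32_t nHits) {
  constexpr uint32_t n16 = 4;   // number of 16-bit SoA columns
  constexpr uint32_t n32 = 10;  // number of 32-bit SoA columns (was 9; +1 for the phi-binner index storage)
  std::vector<uint16_t> store16(nHits * n16);  // stand-ins for the pooled device allocations
  std::vector<float> store32(nHits * n32);
  auto get16 = [&](int i) { return store16.data() + i * nHits; };  // start of 16-bit column i
  auto get32 = [&](int i) { return store32.data() + i * nHits; };  // start of 32-bit column i
  float* xLocal = get32(0);                                        // column 0: local x
  auto* binnerStorage = reinterpret_cast<uint32_t*>(get32(9));     // column 9: phi-binner index storage
  (void)get16;
  (void)xLocal;
  (void)binnerStorage;
}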

static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious

unique_ptr<uint16_t[]> m_store16; //!
@@ -60,6 +62,7 @@ class TrackingRecHit2DHeterogeneous {

// needed as kernel params...
Hist* m_hist;
Hist::index_type* m_histStorage;
uint32_t* m_hitsLayerStart;
int16_t* m_iphi;
};
@@ -98,14 +101,19 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_device_unique<float[]>(nHits * n32 + 11, stream);
m_store32 =
Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_HistStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::Hist>(stream);

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::Hist::index_type));

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };

// copy all the pointers
m_hist = view->m_hist = m_HistStore.get();
m_histStorage = view->m_histStorage = reinterpret_cast<TrackingRecHit2DSOAView::Hist::index_type*>(get32(9));

view->m_xl = get32(0);
view->m_yl = get32(1);
@@ -17,8 +17,7 @@ class TrackingRecHit2DSOAView {
static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; }
using hindex_type = uint32_t; // if above is <=2^32

using PhiBinner =
cms::cuda::HistoContainer<int16_t, 128, gpuClustering::MaxNumClusters, 8 * sizeof(int16_t), hindex_type, 10>;
using PhiBinner = cms::cuda::HistoContainer<int16_t, 128, -1, 8 * sizeof(int16_t), hindex_type, 10>;

using Hist = PhiBinner; // FIXME

@@ -96,6 +95,7 @@ class TrackingRecHit2DSOAView {
uint32_t* m_hitsLayerStart;

PhiBinner* m_hist; // FIXME use a more descriptive name consistently
PhiBinner::index_type* m_histStorage;

uint32_t m_nHits;
};
49 changes: 49 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h
@@ -0,0 +1,49 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h
#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h

#include <cstdint>

namespace cms {
namespace cuda {

template <typename I, int S>
class FlexiStorage {
public:
constexpr int capacity() const { return S; }

constexpr I& operator[](int i) { return m_v[i]; }
constexpr const I& operator[](int i) const { return m_v[i]; }

constexpr I* data() { return m_v; }
constexpr I const* data() const { return m_v; }

private:
I m_v[S];
};

template <typename I>
class FlexiStorage<I, -1> {
public:
constexpr void init(I* v, int s) {
m_v = v;
m_capacity = s;
}

constexpr int capacity() const { return m_capacity; }

constexpr I& operator[](int i) { return m_v[i]; }
constexpr const I& operator[](int i) const { return m_v[i]; }

constexpr I* data() { return m_v; }
constexpr I const* data() const { return m_v; }

private:
I* m_v;

Review comment:

would it make sense to use

Suggested change:
-    I* m_v;
+    std::unique_ptr<I[]> m_v;

to handle the ownership of the memory?

@VinInn (Author):

it is external storage. This class is allocated on the GPU!

int m_capacity;
};

} // namespace cuda

} // namespace cms

#endif
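
Regarding the ownership question in the review thread above: FlexiStorage<I, -1> only views memory that is owned elsewhere (here, a slice of the m_store32 pool), and the object itself is placed in device memory, so a plain non-owning pointer is the natural choice. A minimal usage sketch, not part of the PR; flexiStorageExample and externalBuffer are hypothetical names, and the std::vector merely stands in for a device allocation.

// illustrative only, not part of the PR
#include <cstdint>
#include <vector>
#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"

void flexiStorageExample() {
  // compile-time capacity: the element array is a data member of the object
  cms::cuda::FlexiStorage<uint32_t, 8> fixed;
  fixed[0] = 1;

  // runtime capacity: the object only points at a buffer owned by someone else;
  // init() adopts the external storage without taking ownership
  std::vector<uint32_t> externalBuffer(1024);
  cms::cuda::FlexiStorage<uint32_t, -1> dyn;
  dyn.init(externalBuffer.data(), static_cast<int>(externalBuffer.size()));
  dyn[0] = fixed[0];
}
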
195 changes: 23 additions & 172 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -1,19 +1,7 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h

#include <algorithm>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif // __CUDA_ARCH__
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

namespace cms {
namespace cuda {
@@ -50,61 +38,27 @@ namespace cms {
}
}

template <typename Histo>
inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
#ifdef __CUDACC__
cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
#else
::memset(poff, 0, size);
#endif
}

template <typename Histo>
inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
#ifdef __CUDACC__
uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
auto nthreads = 1024;
auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
poff, poff, Histo::totbins(), ppsws);
cudaCheck(cudaGetLastError());
#else
h->finalize();
#endif
}

template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
uint32_t totSize,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
launchZero(h, stream);
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(h, stream);
launchFinalize(view, stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
@@ -114,11 +68,6 @@ namespace cms {
#endif
}
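
With SIZE == -1 the index content no longer lives inside the histogram object, so fillManyFromVector now takes the total size as a signed int32_t plus an external buffer for the indices, and launchZero/launchFinalize receive a Histo::View describing both instead of the bare histogram pointer. A minimal call sketch, not part of the PR: fillPhiBins and the d_* buffer names are hypothetical, and the device allocations are assumed to be made by the caller.

// illustrative only, not part of the PR
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

using Binner = cms::cuda::HistoContainer<int16_t, 128, -1, 16, uint32_t, 10>;

void fillPhiBins(Binner* d_hist,                      // device-resident histogram object
                 int16_t const* d_values,             // device array of discretized values
                 uint32_t const* d_offsets,           // device offsets delimiting the sub-histograms (assumed nh + 1 entries)
                 int32_t nValues,
                 Binner::index_type* d_indexStorage,  // external device storage, at least nValues entries
                 cudaStream_t stream) {
  cms::cuda::fillManyFromVector(d_hist, 10, d_values, d_offsets, nValues, 256, d_indexStorage, stream);
}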

template <typename Assoc>
__global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
}

// iterate over N bins left and right of the one containing "v"
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
@@ -142,20 +91,19 @@
}
}

template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
uint32_t SIZE, // max number of element
template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
int32_t SIZE, // max number of elements. If -1, initialized at runtime using external storage
uint32_t S = sizeof(T) * 8, // number of significant bits in T
typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS = 1 // number of histos stored
>
class HistoContainer {
class HistoContainer : public OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE> {
public:
using Counter = uint32_t;

using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;

using index_type = I;
using Base = OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE>;
using View = typename Base::View;
using Counter = typename Base::Counter;
using index_type = typename Base::index_type;
using UT = typename std::make_unsigned<T>::type;

static constexpr uint32_t ilog2(uint32_t v) {
@@ -176,7 +124,8 @@
static constexpr uint32_t nhists() { return NHISTS; }
static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
static constexpr uint32_t capacity() { return SIZE; }

// static_assert(int32_t(totbins())==Base::ctNOnes());

static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }

@@ -186,137 +135,39 @@
return (t >> shift) & mask;
}

__host__ __device__ void zero() {
for (auto &i : off)
i = 0;
}

__host__ __device__ __forceinline__ void add(CountersOnly const &co) {
for (uint32_t i = 0; i < totbins(); ++i) {
#ifdef __CUDA_ARCH__
atomicAdd(off + i, co.off[i]);
#else
auto &a = (std::atomic<Counter> &)(off[i]);
a += co.off[i];
#endif
}
}

static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
#ifdef __CUDA_ARCH__
return atomicAdd(&x, 1);
#else
auto &a = (std::atomic<Counter> &)(x);
return a++;
#endif
}

static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
#ifdef __CUDA_ARCH__
return atomicSub(&x, 1);
#else
auto &a = (std::atomic<Counter> &)(x);
return a--;
#endif
}

__host__ __device__ __forceinline__ void countDirect(T b) {
assert(b < nbins());
atomicIncrement(off[b]);
}

__host__ __device__ __forceinline__ void fillDirect(T b, index_type j) {
assert(b < nbins());
auto w = atomicDecrement(off[b]);
assert(w > 0);
bins[w - 1] = j;
}

__host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
auto c = apc.add(n);
if (c.m >= nbins())
return -int32_t(c.m);
off[c.m] = c.n;
for (uint32_t j = 0; j < n; ++j)
bins[c.n + j] = v[j];
return c.m;
}

__host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
off[apc.get().m] = apc.get().n;
}

__host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
auto m = apc.get().m;
auto n = apc.get().n;
if (m >= nbins()) { // overflow!
off[nbins()] = uint32_t(off[nbins() - 1]);
return;
}
auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) {
off[i] = n;
}
}

__host__ __device__ __forceinline__ void count(T t) {
uint32_t b = bin(t);
assert(b < nbins());
atomicIncrement(off[b]);
Base::atomicIncrement(this->off[b]);
}

__host__ __device__ __forceinline__ void fill(T t, index_type j) {
uint32_t b = bin(t);
assert(b < nbins());
auto w = atomicDecrement(off[b]);
auto w = Base::atomicDecrement(this->off[b]);
assert(w > 0);
bins[w - 1] = j;
this->content[w - 1] = j;
}

__host__ __device__ __forceinline__ void count(T t, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
atomicIncrement(off[b]);
Base::atomicIncrement(this->off[b]);
}

__host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) {
uint32_t b = bin(t);
assert(b < nbins());
b += histOff(nh);
assert(b < totbins());
auto w = atomicDecrement(off[b]);
auto w = Base::atomicDecrement(this->off[b]);
assert(w > 0);
bins[w - 1] = j;
}

__host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
assert(off[totbins() - 1] == 0);
blockPrefixScan(off, totbins(), ws);
assert(off[totbins() - 1] == off[totbins() - 2]);
this->content[w - 1] = j;
}

constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

constexpr index_type const *begin() const { return bins; }
constexpr index_type const *end() const { return begin() + size(); }

constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }

Counter off[totbins()];
int32_t psws; // prefix-scan working space
index_type bins[capacity()];
};

template <typename I, // type stored in the container (usually an index in a vector of the input values)
uint32_t MAXONES, // max number of "ones"
uint32_t MAXMANYS // max number of "manys"
>
using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;

} // namespace cuda
} // namespace cms
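
Net effect of this rewrite: HistoContainer keeps only the binning logic (bin, count, fill) and takes its counters (off), content storage (content) and atomic helpers from the OneToManyAssoc base, which now lives in its own header (hence the new include at the top of the file and the removal of the old alias above); the SIZE parameter selects between an embedded content array and external storage. A small compile-time sketch, not part of the PR; the FixedHist and FlexHist names are hypothetical.

// illustrative only, not part of the PR
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

// content array embedded in the object, as before
using FixedHist = cms::cuda::HistoContainer<int16_t, 128, 4000, 16, uint32_t, 1>;
// content array supplied at runtime through external storage (see FlexiStorage above)
using FlexHist = cms::cuda::HistoContainer<int16_t, 128, -1, 16, uint32_t, 1>;

static_assert(FixedHist::nbins() == 128);
static_assert(FixedHist::totbins() == 129);                  // NHISTS * NBINS + 1
static_assert(FlexHist::totbins() == FixedHist::totbins());  // -1 changes the storage flavour, not the binning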
