Merged
README.md (3 changes: 2 additions & 1 deletion)

@@ -209,7 +209,8 @@ make cuda ... USER_CXXFLAGS="-DCUDA_DISABLE_CACHING_ALLOCATOR -DCUDA_DISABLE_ASY

 #### `cudadev`
 
-This program is currently equivalent to `cuda`.
+This program corresponds to the updated version of the pixel tracking software integrated in
+[CMSSW_12_0_0_pre3](https://github.com/cms-sw/cmssw/tree/CMSSW_12_0_0_pre3).
 
 The use of caching allocator can be disabled at compile time setting the
 `CUDADEV_DISABLE_CACHING_ALLOCATOR` preprocessor symbol:
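Presumably the new symbol is passed the same way as for the `cuda` program quoted in the hunk header above; an illustrative invocation, not taken from the diff:

    make cudadev ... USER_CXXFLAGS="-DCUDADEV_DISABLE_CACHING_ALLOCATOR"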
src/cudadev/CUDACore/FlexiStorage.h (49 changes: 49 additions & 0 deletions)

@@ -0,0 +1,49 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h
+#define HeterogeneousCore_CUDAUtilities_interface_FlexiStorage_h
+
+#include <cstdint>
+
+namespace cms {
+  namespace cuda {
+
+    template <typename I, int S>
+    class FlexiStorage {
+    public:
+      constexpr int capacity() const { return S; }
+
+      constexpr I& operator[](int i) { return m_v[i]; }
+      constexpr const I& operator[](int i) const { return m_v[i]; }
+
+      constexpr I* data() { return m_v; }
+      constexpr I const* data() const { return m_v; }
+
+    private:
+      I m_v[S];
+    };
+
+    template <typename I>
+    class FlexiStorage<I, -1> {
+    public:
+      constexpr void init(I* v, int s) {
+        m_v = v;
+        m_capacity = s;
+      }
+
+      constexpr int capacity() const { return m_capacity; }
+
+      constexpr I& operator[](int i) { return m_v[i]; }
+      constexpr const I& operator[](int i) const { return m_v[i]; }
+
+      constexpr I* data() { return m_v; }
+      constexpr I const* data() const { return m_v; }
+
+    private:
+      I* m_v;
+      int m_capacity;
+    };
+
+  }  // namespace cuda
+
+}  // namespace cms
+
+#endif
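For orientation, a minimal usage sketch of the two specializations (not part of the diff; the buffer name and sizes are invented):

    #include <cstdint>
    #include "CUDACore/FlexiStorage.h"

    void flexiStorageDemo() {
      // S >= 0: the array is embedded in the object, capacity is a compile-time constant.
      cms::cuda::FlexiStorage<uint16_t, 128> fixed;
      fixed[0] = 42;  // element access via operator[]

      // S == -1: the object holds only a pointer; storage is attached at runtime via init().
      uint16_t buffer[256];  // invented external buffer
      cms::cuda::FlexiStorage<uint16_t, -1> flexi;
      flexi.init(buffer, 256);
      flexi[0] = fixed[0];  // flexi.capacity() now reports 256
    }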
src/cudadev/CUDACore/HistoContainer.h (195 changes: 23 additions & 172 deletions)
@@ -1,19 +1,7 @@
 #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
 #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
 
-#include <algorithm>
-#ifndef __CUDA_ARCH__
-#include <atomic>
-#endif  // __CUDA_ARCH__
-#include <cstddef>
-#include <cstdint>
-#include <type_traits>
-
-#include "CUDACore/AtomicPairCounter.h"
-#include "CUDACore/cudaCheck.h"
-#include "CUDACore/cuda_assert.h"
-#include "CUDACore/cudastdAlgorithm.h"
-#include "CUDACore/prefixScan.h"
+#include "CUDACore/OneToManyAssoc.h"
 
 namespace cms {
   namespace cuda {
@@ -50,61 +38,27 @@ namespace cms {
       }
     }
 
-    template <typename Histo>
-    inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
-                                                          cudaStream_t stream
-#ifndef __CUDACC__
-                                                          = cudaStreamDefault
-#endif
-    ) {
-      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
-      int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
-      assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
-#ifdef __CUDACC__
-      cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
-#else
-      ::memset(poff, 0, size);
-#endif
-    }
-
-    template <typename Histo>
-    inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
-                                                              cudaStream_t stream
-#ifndef __CUDACC__
-                                                              = cudaStreamDefault
-#endif
-    ) {
-#ifdef __CUDACC__
-      uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
-      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
-      auto nthreads = 1024;
-      auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
-      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
-          poff, poff, Histo::totbins(), ppsws);
-      cudaCheck(cudaGetLastError());
-#else
-      h->finalize();
-#endif
-    }
-
     template <typename Histo, typename T>
     inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
                                                                    uint32_t nh,
                                                                    T const *__restrict__ v,
                                                                    uint32_t const *__restrict__ offsets,
-                                                                   uint32_t totSize,
+                                                                   int32_t totSize,
                                                                    int nthreads,
+                                                                   typename Histo::index_type *mem,
                                                                    cudaStream_t stream
 #ifndef __CUDACC__
                                                                    = cudaStreamDefault
 #endif
     ) {
-      launchZero(h, stream);
+      typename Histo::View view = {h, nullptr, mem, -1, totSize};
+      launchZero(view, stream);
 #ifdef __CUDACC__
       auto nblocks = (totSize + nthreads - 1) / nthreads;
       assert(nblocks > 0);
       countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
-      launchFinalize(h, stream);
+      launchFinalize(view, stream);
       fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
       cudaCheck(cudaGetLastError());
 #else
@@ -114,11 +68,6 @@ namespace cms {
 #endif
     }
 
-    template <typename Assoc>
-    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
-      assoc->bulkFinalizeFill(*apc);
-    }
-
     // iteratate over N bins left and right of the one containing "v"
     template <typename Hist, typename V, typename Func>
     __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
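To illustrate the new `fillManyFromVector` calling convention, a sketch (not from the diff: `Hist`, the buffer names and the 256-thread launch are invented, and the meaning of the `View` fields is inferred from the `{h, nullptr, mem, -1, totSize}` initializer above):

    // SIZE == -1, so the content storage is supplied externally through `mem`.
    using Hist = cms::cuda::HistoContainer<uint16_t, 128, -1, 16, uint32_t>;

    void fillOnGpu(Hist *d_hist,               // histogram in device memory
                   uint16_t const *d_values,   // discretized input values
                   uint32_t const *d_offsets,  // per-histogram value ranges
                   int32_t totSize,            // total number of values
                   Hist::index_type *d_mem,    // external content storage
                   cudaStream_t stream) {
      cms::cuda::fillManyFromVector(d_hist, 1, d_values, d_offsets, totSize, 256, d_mem, stream);
    }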
@@ -142,20 +91,19 @@ namespace cms {
       }
     }
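A usage sketch for `forEachInBins`, reusing the `Hist` alias from the sketch above (the value and the callback are invented; the helper visits the entries in the bins within ±n of the bin containing `value`):

    __device__ void visitNeighbourhood(Hist const &hist, uint16_t value) {
      int n = 1;  // also look one bin to the left and one to the right
      cms::cuda::forEachInBins(hist, value, n, [&](uint32_t index) {
        // `index` points back into the original value vector
      });
    }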

-    template <typename T,      // the type of the discretized input values
-              uint32_t NBINS,  // number of bins
-              uint32_t SIZE,   // max number of element
+    template <typename T,      // the type of the discretized input values
+              uint32_t NBINS,  // number of bins
+              int32_t SIZE,    // max number of element. If -1 is initialized at runtime using external storage
               uint32_t S = sizeof(T) * 8,  // number of significant bits in T
               typename I = uint32_t,  // type stored in the container (usually an index in a vector of the input values)
               uint32_t NHISTS = 1     // number of histos stored
               >
-    class HistoContainer {
+    class HistoContainer : public OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE> {
     public:
-      using Counter = uint32_t;
-
-      using CountersOnly = HistoContainer<T, NBINS, 0, S, I, NHISTS>;
-
-      using index_type = I;
+      using Base = OneToManyAssoc<I, NHISTS * NBINS + 1, SIZE>;
+      using View = typename Base::View;
+      using Counter = typename Base::Counter;
+      using index_type = typename Base::index_type;
       using UT = typename std::make_unsigned<T>::type;
 
       static constexpr uint32_t ilog2(uint32_t v) {
@@ -176,7 +124,8 @@ namespace cms {
       static constexpr uint32_t nhists() { return NHISTS; }
       static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
       static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
-      static constexpr uint32_t capacity() { return SIZE; }
+
+      // static_assert(int32_t(totbins())==Base::ctNOnes());
 
       static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
 
@@ -186,137 +135,39 @@ namespace cms {
       return (t >> shift) & mask;
     }
 
-    __host__ __device__ void zero() {
-      for (auto &i : off)
-        i = 0;
-    }
-
-    __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
-      for (uint32_t i = 0; i < totbins(); ++i) {
-#ifdef __CUDA_ARCH__
-        atomicAdd(off + i, co.off[i]);
-#else
-        auto &a = (std::atomic<Counter> &)(off[i]);
-        a += co.off[i];
-#endif
-      }
-    }
-
-    static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
-#ifdef __CUDA_ARCH__
-      return atomicAdd(&x, 1);
-#else
-      auto &a = (std::atomic<Counter> &)(x);
-      return a++;
-#endif
-    }
-
-    static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
-#ifdef __CUDA_ARCH__
-      return atomicSub(&x, 1);
-#else
-      auto &a = (std::atomic<Counter> &)(x);
-      return a--;
-#endif
-    }
-
-    __host__ __device__ __forceinline__ void countDirect(T b) {
-      assert(b < nbins());
-      atomicIncrement(off[b]);
-    }
-
-    __host__ __device__ __forceinline__ void fillDirect(T b, index_type j) {
-      assert(b < nbins());
-      auto w = atomicDecrement(off[b]);
-      assert(w > 0);
-      bins[w - 1] = j;
-    }
-
-    __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
-      auto c = apc.add(n);
-      if (c.m >= nbins())
-        return -int32_t(c.m);
-      off[c.m] = c.n;
-      for (uint32_t j = 0; j < n; ++j)
-        bins[c.n + j] = v[j];
-      return c.m;
-    }
-
-    __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
-      off[apc.get().m] = apc.get().n;
-    }
-
-    __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
-      auto m = apc.get().m;
-      auto n = apc.get().n;
-      if (m >= nbins()) {  // overflow!
-        off[nbins()] = uint32_t(off[nbins() - 1]);
-        return;
-      }
-      auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
-      for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) {
-        off[i] = n;
-      }
-    }
-
     __host__ __device__ __forceinline__ void count(T t) {
       uint32_t b = bin(t);
       assert(b < nbins());
-      atomicIncrement(off[b]);
+      Base::atomicIncrement(this->off[b]);
     }
 
     __host__ __device__ __forceinline__ void fill(T t, index_type j) {
       uint32_t b = bin(t);
       assert(b < nbins());
-      auto w = atomicDecrement(off[b]);
+      auto w = Base::atomicDecrement(this->off[b]);
       assert(w > 0);
-      bins[w - 1] = j;
+      this->content[w - 1] = j;
     }
 
     __host__ __device__ __forceinline__ void count(T t, uint32_t nh) {
       uint32_t b = bin(t);
       assert(b < nbins());
       b += histOff(nh);
       assert(b < totbins());
-      atomicIncrement(off[b]);
+      Base::atomicIncrement(this->off[b]);
     }
 
     __host__ __device__ __forceinline__ void fill(T t, index_type j, uint32_t nh) {
       uint32_t b = bin(t);
       assert(b < nbins());
       b += histOff(nh);
       assert(b < totbins());
-      auto w = atomicDecrement(off[b]);
+      auto w = Base::atomicDecrement(this->off[b]);
       assert(w > 0);
-      bins[w - 1] = j;
-    }
-
-    __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
-      assert(off[totbins() - 1] == 0);
-      blockPrefixScan(off, totbins(), ws);
-      assert(off[totbins() - 1] == off[totbins() - 2]);
+      this->content[w - 1] = j;
     }
 
-    constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
-    constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
-
-    constexpr index_type const *begin() const { return bins; }
-    constexpr index_type const *end() const { return begin() + size(); }
-
-    constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
-    constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }
-
-    Counter off[totbins()];
-    int32_t psws;  // prefix-scan working space
-    index_type bins[capacity()];
   };
 
-  template <typename I,        // type stored in the container (usually an index in a vector of the input values)
-            uint32_t MAXONES,  // max number of "ones"
-            uint32_t MAXMANYS  // max number of "manys"
-            >
-  using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
-
   }  // namespace cuda
 }  // namespace cms
 
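In summary: the binning logic (`bin()`, `count()`, `fill()`) stays in `HistoContainer`, while storage, the atomics, and the prefix-scan finalization now come from `OneToManyAssoc`. A serial host-side sketch of the resulting two-pass fill pattern, reusing the `Hist` alias from the earlier sketch (illustrative only; on the device the same member calls run concurrently from many threads, which is what the base-class atomics are for):

    // `view` is assumed to have been initialized as in fillManyFromVector above.
    void twoPassFill(Hist &hist, Hist::View view, uint16_t const *values, uint32_t n) {
      cms::cuda::launchZero(view, cudaStreamDefault);      // zero the offsets
      for (uint32_t i = 0; i < n; ++i)
        hist.count(values[i]);                             // pass 1: bin occupancies
      cms::cuda::launchFinalize(view, cudaStreamDefault);  // prefix scan -> offsets
      for (uint32_t i = 0; i < n; ++i)
        hist.fill(values[i], i);                           // pass 2: store indices
    }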
src/cudadev/CUDACore/HostAllocator.h (1 change: 1 addition & 0 deletions)

@@ -3,6 +3,7 @@

 #include <memory>
 #include <new>
+
 #include <cuda_runtime.h>
 
 namespace cms {