Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions src/alpaka/AlpakaCore/ContextState.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,28 @@

#include <memory>

#include "AlpakaCore/alpakaConfig.h"
#include <alpaka/alpaka.hpp>

namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
namespace cms::alpakatools {

template <typename TQueue>
class ScopedContextAcquire;

template <typename TQueue>
class ScopedContextProduce;

template <typename TQueue>
class ScopedContextTask;

/**
* The purpose of this class is to deliver the device and CUDA stream
* information from ExternalWork's acquire() to produce() via a
* member/StreamCache variable.
*/
template <typename TQueue>
class ContextState {
public:
using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;
using Queue = TQueue;
using Device = alpaka::Dev<Queue>;

ContextState() = default;
Expand All @@ -26,18 +36,26 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
ContextState& operator=(ContextState&& other) = delete;

private:
friend class ScopedContextAcquire;
friend class ScopedContextProduce;
friend class ScopedContextTask;
friend class ScopedContextAcquire<TQueue>;
friend class ScopedContextProduce<TQueue>;
friend class ScopedContextTask<TQueue>;

// Store the queue that later accessors will hand out.
// throwIfStream() runs first, so calling set() while a queue is already
// held throws std::runtime_error instead of silently overwriting it.
void set(std::shared_ptr<Queue> stream) {
throwIfStream();
// The parameter is taken by value, so moving it avoids an extra
// reference-count increment.
stream_ = std::move(stream);
}

Device device() const { return alpaka::getDev(*stream_); }
// Return the device obtained from the stored queue via alpaka::getDev().
// Throws std::runtime_error (through throwIfNoStream()) if no queue has
// been set, rather than dereferencing a null pointer.
Device device() const {
throwIfNoStream();
return alpaka::getDev(*stream_);
}

const std::shared_ptr<Queue>& streamPtr() const {
// Return the stored queue by value (a copy of *stream_).
// Throws std::runtime_error (through throwIfNoStream()) if no queue has
// been set.
Queue stream() const {
throwIfNoStream();
return *stream_;
}

// Return a const reference to the shared pointer holding the queue, so
// the caller can decide whether to copy it (and share ownership) or not.
// Throws std::runtime_error (through throwIfNoStream()) if no queue has
// been set.
std::shared_ptr<Queue> const& streamPtr() const {
throwIfNoStream();
return stream_;
}
Expand All @@ -51,12 +69,21 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
return std::move(stream_);
}

void throwIfStream() const;
void throwIfNoStream() const;
// Precondition check for set(): the state must still be empty.
// Throws std::runtime_error when a queue is already stored.
void throwIfStream() const {
  const bool alreadySet = static_cast<bool>(stream_);
  if (alreadySet) {
    throw std::runtime_error("Trying to set ContextState, but it already had a valid state");
  }
}

// Precondition check for the accessors: a queue must have been stored.
// Throws std::runtime_error when the state is still empty.
void throwIfNoStream() const {
  if (stream_) {
    return;  // a queue is present, nothing to report
  }
  throw std::runtime_error("Trying to get ContextState, but it did not have a valid state");
}

std::shared_ptr<Queue> stream_;
};

} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE
} // namespace cms::alpakatools

#endif // HeterogeneousCore_AlpakaCore_ContextState_h
12 changes: 5 additions & 7 deletions src/alpaka/AlpakaCore/ESProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,19 @@

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/EventCache.h"
#include "AlpakaCore/currentDevice.h"
#include "AlpakaCore/deviceCount.h"
#include "AlpakaCore/eventWorkHasCompleted.h"

namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {

template <typename T>
class ESProduct {
public:
using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;
using Event = ::ALPAKA_ACCELERATOR_NAMESPACE::Event;

template <typename T_Acc>
ESProduct(T_Acc acc) : gpuDataPerDevice_(::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::deviceCount()) {
ESProduct() : gpuDataPerDevice_(::ALPAKA_ACCELERATOR_NAMESPACE::devices.size()) {
for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache<Event>().get(acc);
gpuDataPerDevice_[i].m_event = ::cms::alpakatools::getEventCache<Event>().get(::ALPAKA_ACCELERATOR_NAMESPACE::devices[i]);
}
}

Expand All @@ -32,8 +30,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
// which enqueues asynchronous transfers (possibly kernels as well)
// to the CUDA stream
template <typename F>
const T& dataForCurrentDeviceAsync(::ALPAKA_ACCELERATOR_NAMESPACE::Queue queue, F transferAsync) const {
auto device = currentDevice();
const T& dataForDeviceAsync(Queue queue, F transferAsync) const {
auto device = cms::alpakatools::getDevIndex(alpaka::getDev(queue));
auto& data = gpuDataPerDevice_[device];

// If GPU data has already been filled, we can return it
Expand Down
35 changes: 20 additions & 15 deletions src/alpaka/AlpakaCore/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ namespace cms {
ALPAKA_FN_HOST ALPAKA_FN_INLINE __attribute__((always_inline)) void launchZero(
Histo *__restrict__ h, ::ALPAKA_ACCELERATOR_NAMESPACE::Queue &queue) {
uint32_t *poff = (uint32_t *)(char *)(&(h->off));
auto histoOffView = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::createDeviceView<typename Histo::Counter>(
poff, Histo::totbins());
auto histoOffView =
cms::alpakatools::createDeviceView<typename Histo::Counter>(alpaka::getDev(queue), poff, Histo::totbins());

alpaka::memset(queue, histoOffView, 0, Histo::totbins());
alpaka::wait(queue);
Expand All @@ -77,16 +77,21 @@ namespace cms {

const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
blocksPerGrid, threadsPerBlockOrElementsPerThread);
alpaka::enqueue(queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(
workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
Vec1D::all(1), threadsPerBlockOrElementsPerThread);
alpaka::enqueue(
queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(
workDivWith1Block, ::cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, nblocks));
workDiv, ::cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

const WorkDiv1D &workDivWith1Block = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
Vec1D::all(1), threadsPerBlockOrElementsPerThread);
alpaka::enqueue(queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(
workDivWith1Block,
::cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(),
poff,
poff,
num_items,
nblocks));
}

template <typename Histo, typename T>
Expand All @@ -106,14 +111,14 @@ namespace cms {
const WorkDiv1D &workDiv = ::cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE::make_workdiv(
blocksPerGrid, threadsPerBlockOrElementsPerThread);

alpaka::enqueue(
queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, countFromVector(), h, nh, v, offsets));
alpaka::enqueue(queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(
workDiv, countFromVector(), h, nh, v, offsets));
launchFinalize(h, queue);

alpaka::enqueue(
queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(workDiv, fillFromVector(), h, nh, v, offsets));
alpaka::enqueue(queue,
alpaka::createTaskKernel<::ALPAKA_ACCELERATOR_NAMESPACE::Acc1D>(
workDiv, fillFromVector(), h, nh, v, offsets));
}

struct finalizeBulk {
Expand Down
24 changes: 14 additions & 10 deletions src/alpaka/AlpakaCore/Product.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@ namespace edm {
class Wrapper;
}

namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
namespace cms::alpakatools {

namespace impl {
template <typename TQueue>
class ScopedContextGetterBase;
}

/**
* The purpose of this class is to wrap CUDA data to edm::Event in a
* way which forces correct use of various utilities.
*
* The non-default construction has to be done with ::cms::alpakatools::ScopedContext
* The non-default construction has to be done with ScopedContext
* (in order to properly register the CUDA event).
*
* The default constructor is needed only for the ROOT dictionary generation.
Expand All @@ -31,9 +32,12 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
* it. Here is a somewhat natural place. If overhead is too much, we
* can use them only where synchronization between streams is needed.
*/
template <typename T>
class Product : public ProductBase {
template <typename TQueue, typename T>
class Product : public ProductBase<TQueue> {
public:
using Queue = TQueue;
using Event = alpaka::Event<Queue>;

Product() = default; // Needed only for ROOT dictionary generation

Product(const Product&) = delete;
Expand All @@ -42,20 +46,20 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
Product& operator=(Product&&) = default;

private:
friend class impl::ScopedContextGetterBase;
friend class ScopedContextProduce;
friend class edm::Wrapper<Product<T>>;
friend class impl::ScopedContextGetterBase<Queue>;
friend class ScopedContextProduce<Queue>;
friend class edm::Wrapper<Product<Queue, T>>;

explicit Product(std::shared_ptr<Queue> stream, std::shared_ptr<Event> event, T data)
: ProductBase(std::move(stream), std::move(event)), data_(std::move(data)) {}
: ProductBase<Queue>(std::move(stream), std::move(event)), data_(std::move(data)) {}

template <typename... Args>
explicit Product(std::shared_ptr<Queue> stream, std::shared_ptr<Event> event, Args&&... args)
: ProductBase(std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
: ProductBase<Queue>(std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}

T data_; //!
};

} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE
} // namespace cms::alpakatools

#endif // AlpakaDataFormats_Common_Product_h
53 changes: 39 additions & 14 deletions src/alpaka/AlpakaCore/ProductBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,43 @@

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/alpakaConfigAcc.h"

namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
namespace cms::alpakatools {

namespace impl {
template <typename TQueue>
class ScopedContextBase;
}

template <typename TQueue>
class ScopedContextProduce;

/**
* Base class for all instantiations of CUDA<T> to hold the
* Base class for all instantiations of Product<TQueue, T> to hold the
* non-T-dependent members.
*/
template <typename TQueue>
class ProductBase {
public:
using Queue = ::ALPAKA_ACCELERATOR_NAMESPACE::Queue;
using Queue = TQueue;
using Event = alpaka::Event<Queue>;
using Device = alpaka::Dev<Queue>;

ProductBase() = default; // Needed only for ROOT dictionary generation
~ProductBase();

~ProductBase() {
// Make sure that the production of the product on the device is
// complete before destructing the product. This is to make sure
// that the EDM stream does not move to the next event before all
// asynchronous processing of the current one is complete.

// TODO: a callback notifying a WaitingTaskHolder (or similar)
// would avoid blocking the CPU, but would also require more work.

// FIXME: this may throw an exception if the underlying call fails.
// NOTE(review): destructors are implicitly noexcept unless declared
// otherwise, so an exception escaping from here would presumably end in
// std::terminate -- confirm the intended handling.

// event_ is null for a default-constructed product; in that case there
// is nothing to wait for.
if (event_) {
alpaka::wait(*event_);
}
}

ProductBase(const ProductBase&) = delete;
ProductBase& operator=(const ProductBase&) = delete;
Expand All @@ -40,18 +58,25 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
}

bool isValid() const { return stream_.get() != nullptr; }
bool isAvailable() const;

alpaka::Dev<Queue> device() const { return alpaka::getDev(stream()); }
// Report whether the work recorded in the event has finished.
// A product without an event (e.g. a default-constructed one) is never
// considered available.
bool isAvailable() const {
  if (event_) {
    return eventWorkHasCompleted(*event_);
  }
  return false;
}

Device device() const { return alpaka::getDev(stream()); }

// cudaStream_t is a pointer to a thread-safe object, for which a
// mutable access is needed even if the ::cms::alpakatools::ScopedContext itself
// mutable access is needed even if the ScopedContext itself
// would be const. Therefore it is ok to return a non-const
// pointer from a const method here.
Queue& stream() const { return *(stream_.get()); }

// cudaEvent_t is a pointer to a thread-safe object, for which a
// mutable access is needed even if the ::cms::alpakatools::ScopedContext itself
// mutable access is needed even if the ScopedContext itself
// would be const. Therefore it is ok to return a non-const
// pointer from a const method here.
Event& event() const { return *(event_.get()); }
Expand All @@ -61,8 +86,8 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
: stream_{std::move(stream)}, event_{std::move(event)} {}

private:
friend class impl::ScopedContextBase;
friend class ScopedContextProduce;
friend class impl::ScopedContextBase<Queue>;
friend class ScopedContextProduce<Queue>;

// The following function is intended to be used only from ScopedContext
const std::shared_ptr<Queue>& streamPtr() const { return stream_; }
Expand All @@ -78,7 +103,7 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
// The cudaStream_t is really shared among edm::Event products, so
// using shared_ptr also here
std::shared_ptr<Queue> stream_; //!
// shared_ptr because of caching in ::cms::alpakatools::EventCache
// shared_ptr because of caching in EventCache
std::shared_ptr<Event> event_; //!

// This flag tells whether the CUDA stream may be reused by a
Expand All @@ -87,6 +112,6 @@ namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE {
mutable std::atomic<bool> mayReuseStream_ = true; //!
};

} // namespace cms::alpakatools::ALPAKA_ACCELERATOR_NAMESPACE
} // namespace cms::alpakatools

#endif // AlpakaDataFormats_Common_ProductBase_h
Loading