Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/cudadev/CUDACore/AcquireContext.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#include "CUDACore/AcquireContext.h"

namespace cms::cuda {
void AcquireContext::commit() { holderHelper_.enqueueCallback(stream()); }
}
45 changes: 45 additions & 0 deletions src/cudadev/CUDACore/AcquireContext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_CUDACore_AcquireContext_h
#define HeterogeneousCore_CUDACore_AcquireContext_h

#include "CUDACore/EDGetterContextBase.h"
#include "CUDACore/TaskContext.h"

namespace cms::cuda {
/**
* The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
* - setting the current device
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* - synchronizing between CUDA streams if necessary
* Users should not, however, construct it explicitly.
*/
class AcquireContext : public impl::EDGetterContextBase {
public:
explicit AcquireContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
: EDGetterContextBase(streamID), holderHelper_{std::move(waitingTaskHolder), device()} {}
~AcquireContext() = default;

template <typename F>
void pushNextTask(F&& f) {
holderHelper_.pushNextTask(std::forward<F>(f));
}

void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
}

// internal API
void commit();

private:
impl::FwkContextHolderHelper holderHelper_;
};

template <typename F>
void runAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder holder, F func) {
AcquireContext context(streamID, std::move(holder));
func(context);
context.commit();
}
}

#endif
20 changes: 20 additions & 0 deletions src/cudadev/CUDACore/AnalyzeContext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef HeterogeneousCore_CUDACore_AnalyzeContext_h
#define HeterogeneousCore_CUDACore_AnalyzeContext_h

#include "CUDACore/EDGetterContextBase.h"

namespace cms::cuda {
/**
* The aim of this class is to do necessary per-event "initialization" in analyze()
* - setting the current device
* - synchronizing between CUDA streams if necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class AnalyzeContext : public impl::EDGetterContextBase {
public:
/// Constructor to (possibly) re-use a CUDA stream
explicit AnalyzeContext(edm::StreamID streamID) : EDGetterContextBase(streamID) {}
};
}

#endif
46 changes: 46 additions & 0 deletions src/cudadev/CUDACore/Context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef HeterogeneousCore_CUDAUtilities_Context_h
#define HeterogeneousCore_CUDAUtilities_Context_h

#include "CUDACore/allocate_device.h"
#include "CUDACore/allocate_host.h"

namespace cms::cuda {
class HostAllocatorContext {
public:
explicit HostAllocatorContext(cudaStream_t stream) : stream_(stream) {}

void *allocate_host(size_t nbytes) const { return cms::cuda::allocate_host(nbytes, stream_); }

void free_host(void *ptr) const { cms::cuda::free_host(ptr); }

private:
cudaStream_t stream_;
};

class DeviceAllocatorContext {
public:
explicit DeviceAllocatorContext(cudaStream_t stream) : stream_(stream) {}

void *allocate_device(size_t nbytes) const { return cms::cuda::allocate_device(nbytes, stream_); }

void free_device(void *ptr) const { cms::cuda::free_device(ptr, stream_); }

private:
cudaStream_t stream_;
};

class Context {
public:
explicit Context(cudaStream_t stream) : stream_(stream) {}

cudaStream_t stream() const { return stream_; }

operator HostAllocatorContext() const { return HostAllocatorContext(stream()); }
operator DeviceAllocatorContext() const { return DeviceAllocatorContext(stream()); }

private:
cudaStream_t stream_;
};
} // namespace cms::cuda

#endif
3 changes: 3 additions & 0 deletions src/cudadev/CUDACore/ContextState.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ namespace cms {
friend class ScopedContextAcquire;
friend class ScopedContextProduce;
friend class ScopedContextTask;
friend class AcquireContext;
friend class ProduceContext;
friend class TaskContext;

void set(int device, SharedStreamPtr stream) {
throwIfStream();
Expand Down
27 changes: 27 additions & 0 deletions src/cudadev/CUDACore/EDGetterContextBase.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "CUDACore/EDGetterContextBase.h"
#include "CUDACore/cudaCheck.h"

namespace cms::cuda::impl {
void EDGetterContextBase::synchronizeStreams(int dataDevice,
cudaStream_t dataStream,
bool available,
cudaEvent_t dataEvent) {
if (dataDevice != device()) {
// Eventually replace with prefetch to current device (assuming unified memory works)
// If we won't go to unified memory, need to figure out something else...
throw std::runtime_error("Handling data from multiple devices is not yet supported");
}

if (dataStream != stream()) {
// Different streams, need to synchronize
if (not available) {
// Event not yet occurred, so need to add synchronization
// here. Sychronization is done by making the CUDA stream to
// wait for an event, so all subsequent work in the stream
// will run only after the event has "occurred" (i.e. data
// product became available).
cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
}
}
}
}
45 changes: 45 additions & 0 deletions src/cudadev/CUDACore/EDGetterContextBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_CUDACore_EDGetterContextBase_h
#define HeterogeneousCore_CUDACore_EDGetterContextBase_h

#include "CUDACore/ESProductNew.h"
#include "CUDACore/FwkContextBase.h"
#include "CUDACore/Product.h"
#include "Framework/EDGetToken.h"
#include "Framework/EventSetup.h"

namespace cms::cuda::impl {
/**
* This class is a base class for Context classes that should be
* able to read Event Data products
*/
class EDGetterContextBase : public FwkContextBase {
public:
template <typename T>
const T& get(const Product<T>& data) {
if (not isInitialized()) {
initialize(data);
}
synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
return data.data_;
}

template <typename T>
const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
return get(iEvent.get(token));
}

template <typename T>
const T& getData(const edm::EventSetup& iSetup) {
return iSetup.get<cms::cudaNew::ESProduct<T>>().get(device(), stream());
}

protected:
template <typename... Args>
EDGetterContextBase(Args&&... args) : FwkContextBase(std::forward<Args>(args)...) {}

private:
void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
};
}

#endif
17 changes: 17 additions & 0 deletions src/cudadev/CUDACore/EDProducer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "CUDACore/EDProducer.h"

namespace cms::cuda {
void EDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
}

void SynchronizingEDProducer::acquire(edm::Event const& event,
edm::EventSetup const& eventSetup,
edm::WaitingTaskWithArenaHolder holder) {
runAcquire(event.streamID(), std::move(holder), [&](auto& ctx) { acquire(event, eventSetup, ctx); });
}

void SynchronizingEDProducer::produce(edm::Event& event, edm::EventSetup const& eventSetup) {
runProduce(event.streamID(), [&](auto& ctx) { produce(event, eventSetup, ctx); });
}
} // namespace cms::cuda
27 changes: 27 additions & 0 deletions src/cudadev/CUDACore/EDProducer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef HeterogeneousCore_CUDACore_stream_EDProducer_h
#define HeterogeneousCore_CUDACore_stream_EDProducer_h

#include "Framework/EDProducer.h"
#include "CUDACore/AcquireContext.h"
#include "CUDACore/ProduceContext.h"

namespace cms::cuda {
class EDProducer : public edm::EDProducer {
public:
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
};

class SynchronizingEDProducer : public edm::EDProducerExternalWork {
public:
void acquire(edm::Event const& event,
edm::EventSetup const& eventSetup,
edm::WaitingTaskWithArenaHolder holder) override;
void produce(edm::Event& event, edm::EventSetup const& eventSetup) override;

virtual void acquire(edm::Event const& event, edm::EventSetup const& eventSetup, AcquireContext& context) = 0;
virtual void produce(edm::Event& event, edm::EventSetup const& eventSetup, ProduceContext& context) = 0;
};
} // namespace cms::cuda

#endif
11 changes: 11 additions & 0 deletions src/cudadev/CUDACore/ESContext.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "CUDACore/ESContext.h"
#include "CUDACore/StreamCache.h"
#include "CUDACore/cudaCheck.h"

namespace cms::cuda {
ESContext::ESContext(int device) : currentDevice_(device) {
cudaCheck(cudaSetDevice(currentDevice_));
stream_ = getStreamCache().get();
}

} // namespace cms::cuda
85 changes: 85 additions & 0 deletions src/cudadev/CUDACore/ESContext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#ifndef HeterogeneousCore_CUDACore_ESContext_h
#define HeterogeneousCore_CUDACore_ESContext_h

#include "CUDACore/Context.h"
#include "CUDACore/ESProductNew.h"
#include "CUDACore/SharedStreamPtr.h"
#include "CUDACore/StreamCache.h"

namespace cms::cuda {
class ESContext {
public:
explicit ESContext(int device);

int device() const { return currentDevice_; }

cudaStream_t stream() const { return stream_.get(); }

operator HostAllocatorContext() { return HostAllocatorContext(stream()); }
operator DeviceAllocatorContext() { return DeviceAllocatorContext(stream()); }
operator Context() { return Context(stream()); }

private:
int currentDevice_;
SharedStreamPtr stream_;
};

namespace impl {
template <typename T>
class RunForEachDevice {
public:
RunForEachDevice(T&& data) : data_(std::forward<T>(data)) {}

template <typename F>
[[nodiscard]] auto forEachDevice(F const& func) {
using RetType = decltype(func(std::declval<const T&>(), std::declval<ESContext&>()));
static_assert(not std::is_same_v<RetType, void>,
"Function must return a value corresponding the ESProduct for one device");
auto product = std::make_unique<cms::cudaNew::ESProduct<RetType>>();
auto const& cref = data_;
for (std::size_t i = 0; i < product->size(); ++i) {
ESContext ctx(i);
product->emplace(i, func(cref, ctx), ctx.stream());
}
product->setHostData(std::move(data_));
return product;
}

private:
T data_;
};
} // namespace impl

template <typename F>
[[nodiscard]] auto runForHost(F&& func) {
using RetType = decltype(func(std::declval<HostAllocatorContext&>()));
static_assert(not std::is_same_v<RetType, void>,
"Function must return an intermediate value passed to the functor argument of runOnEachDevice()");
// TODO: temporarily use a "random stream" for the pinned host
// memory allocation until we figure out a way to use the caching
// allocator without requiring a stream. The
// cms::cudaNew::ESProduct will take care of releasing any
// allocated host memory only after all asynchronous work issued
// in RunForEachDevice::forEachDevice() has completed.
cudaCheck(cudaSetDevice(0));
auto stream = getStreamCache().get();
HostAllocatorContext ctx(stream.get());
return impl::RunForEachDevice<RetType>(func(ctx));
}

template <typename F>
[[nodiscard]] auto runForEachDevice(F&& func) {
using RetType = decltype(func(std::declval<ESContext&>()));
static_assert(not std::is_same_v<RetType, void>,
"Function must return a value corresponding the ESProduct for noe device");
auto product = std::make_unique<cms::cudaNew::ESProduct<RetType>>();
for (int i = 0; i < product.size(); ++i) {
ESContext ctx(i);
product->emplace(i, func(ctx), ctx.stream());
}
return product;
}

} // namespace cms::cuda

#endif
Loading