Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 22 additions & 16 deletions src/alpaka/AlpakaCore/AllocatorPolicy.h
Original file line number Diff line number Diff line change
@@ -1,45 +1,51 @@
#ifndef AlpakaCore_AllocatorPolicy_h
#define AlpakaCore_AllocatorPolicy_h

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#include <cuda_runtime.h>
#endif

#include <alpaka/alpaka.hpp>

namespace cms::alpakatools {

// Which memory allocator to use
// - Synchronous: (device and host) cudaMalloc and cudaMallocHost
// - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost
// - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2)
// - Caching: (device and host) caching allocator
enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };

template <typename TDev>
constexpr AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;
constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;

#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
template <>
constexpr AllocatorPolicy allocator_policy<alpaka::DevCpu> =
#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCpu> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#else
AllocatorPolicy::Synchronous;
AllocatorPolicy::Synchronous;
#endif
#endif // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

#if defined ALPAKA_ACC_GPU_CUDA_ENABLED
template <>
constexpr AllocatorPolicy allocator_policy<alpaka::DevUniformCudaHipRt> =
#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#elif CUDA_VERSION >= 11020 && ! defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
AllocatorPolicy::Asynchronous;
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCudaRt> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
AllocatorPolicy::Asynchronous;
#else
AllocatorPolicy::Synchronous;
AllocatorPolicy::Synchronous;
#endif
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED

#if defined ALPAKA_ACC_GPU_HIP_ENABLED
template <>
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#else
AllocatorPolicy::Synchronous;
#endif
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

} // namespace cms::alpakatools

#endif // AlpakaCore_AllocatorPolicy_h
74 changes: 63 additions & 11 deletions src/alpaka/AlpakaCore/CachedBufAlloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define AlpakaCore_CachedBufAlloc_h

#include <alpaka/alpaka.hpp>
#include <alpaka/alpakaExtra.hpp>

#include "AlpakaCore/getDeviceCachingAllocator.h"
#include "AlpakaCore/getHostCachingAllocator.h"
Expand Down Expand Up @@ -32,14 +33,14 @@ namespace cms::alpakatools {

//! The caching memory allocator implementation for the pinned host memory
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueUniformCudaHipRtNonBlocking, void> {
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
template <typename TExtent>
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
alpaka::QueueUniformCudaHipRtNonBlocking queue,
alpaka::QueueCudaRtNonBlocking queue,
TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

auto& allocator = getHostCachingAllocator<alpaka::QueueUniformCudaHipRtNonBlocking>();
auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();

// FIXME the BufCpu does not support a pitch ?
size_t size = alpaka::getExtentProduct(extent);
Expand All @@ -53,17 +54,15 @@ namespace cms::alpakatools {
}
};

//! The caching memory allocator implementation for the CUDA/HIP device
//! The caching memory allocator implementation for the CUDA device
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevUniformCudaHipRt, TQueue, void> {
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
template <typename TExtent>
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevUniformCudaHipRt const& dev,
TQueue queue,
TExtent const& extent)
-> alpaka::BufUniformCudaHipRt<TElem, TDim, TIdx> {
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent)
-> alpaka::BufCudaRt<TElem, TDim, TIdx> {
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

auto& allocator = getDeviceCachingAllocator<alpaka::DevUniformCudaHipRt, TQueue>(dev);
auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);

size_t width = alpaka::getWidth(extent);
size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
Expand All @@ -76,13 +75,66 @@ namespace cms::alpakatools {
// use a custom deleter to return the buffer to the CachingAllocator
auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };

return alpaka::BufUniformCudaHipRt<TElem, TDim, TIdx>(
return alpaka::BufCudaRt<TElem, TDim, TIdx>(
dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent);
}
};

#endif // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED

//! Caching allocator specialization: pinned host buffers associated with a HIP non-blocking queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueHipRtNonBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& hostAllocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    size_t const elementCount = alpaka::getExtentProduct(extent);
    size_t const totalBytes = elementCount * sizeof(TElem);
    auto* ptr = reinterpret_cast<TElem*>(hostAllocator.allocate(totalBytes, queue));

    // the custom deleter hands the block back to the caching allocator instead of freeing it
    auto returnToCache = [cache = &hostAllocator](TElem* p) { cache->free(p); };

    return alpaka::BufCpu<TElem, TDim, TIdx>(dev, ptr, std::move(returnToCache), extent);
  }
};

//! Caching allocator specialization: device buffers on a ROCm/HIP GPU.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
      -> alpaka::BufHipRt<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& deviceAllocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);

    // TODO implement pitch for TDim > 1
    size_t const rowBytes = alpaka::getWidth(extent) * static_cast<TIdx>(sizeof(TElem));
    size_t const rowPitchBytes = rowBytes;
    size_t const totalBytes = alpaka::getExtentProduct(extent) * sizeof(TElem);
    auto* ptr = reinterpret_cast<TElem*>(deviceAllocator.allocate(totalBytes, queue));

    // the custom deleter hands the block back to the caching allocator instead of freeing it
    auto returnToCache = [cache = &deviceAllocator](TElem* p) { cache->free(p); };

    return alpaka::BufHipRt<TElem, TDim, TIdx>(dev, ptr, std::move(returnToCache), rowPitchBytes, extent);
  }
};

#endif // ALPAKA_ACC_GPU_HIP_ENABLED

} // namespace traits

template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev>
Expand Down
10 changes: 9 additions & 1 deletion src/alpaka/AlpakaCore/CachingAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <boost/core/demangle.hpp>

#include <alpaka/alpaka.hpp>
#include <alpaka/alpakaExtra.hpp>

#include "AlpakaCore/alpakaDevices.h"

Expand Down Expand Up @@ -335,12 +336,19 @@ namespace cms::alpakatools {

// for host memory, pin the newly allocated block
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfUniformCudaHipRt>.empty()) {
if (not cms::alpakatools::devices<alpaka::PltfCudaRt>.empty()) {
// it is possible to initialise the CUDA runtime and call cudaHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
}
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfHipRt>.empty()) {
// it is possible to initialise the ROCm runtime and call hipHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
}
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

// create a new event associated to the "synchronisation device"
block.event = Event{block.device()};
Expand Down
47 changes: 28 additions & 19 deletions src/alpaka/AlpakaCore/HostOnlyTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <memory>

#include <alpaka/alpaka.hpp>
#include <alpaka/alpakaExtra.hpp>

namespace alpaka {

Expand All @@ -18,37 +19,45 @@ namespace alpaka {
std::function<void()> task_;
};

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

namespace traits {
//! The CUDA/HIP RT async queue enqueue trait specialization for "safe tasks"

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
//! The CUDA async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueUniformCudaHipRtNonBlocking, HostOnlyTask> {
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
static void CUDART_CB
#else
static void HIPRT_CB
#endif
uniformCudaHipRtCallback(ALPAKA_API_PREFIX(Stream_t) /*queue*/,
ALPAKA_API_PREFIX(Error_t) /*status*/,
void* arg) {
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
}

ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtNonBlocking& queue, HostOnlyTask task) -> void {
ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
auto pTask = std::make_unique<HostOnlyTask>(std::move(task));
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
ALPAKA_API_PREFIX(StreamAddCallback)(alpaka::getNativeHandle(queue),
uniformCudaHipRtCallback,
static_cast<void*>(pTask.release()),
0u));
cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
}
};
} // namespace traits
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED

#endif // defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
//! The HIP async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
}

ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
auto pTask = std::make_unique<HostOnlyTask>(std::move(task));
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
}
};
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

} // namespace traits

} // namespace alpaka

Expand Down
17 changes: 8 additions & 9 deletions src/alpaka/AlpakaCore/ScopedContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,10 @@ namespace cms::alpakatools {

template <typename F, typename TQueue>
void pushNextTask(F&& f, ContextState<TQueue> const* state) {
replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
edm::make_waiting_task_with_holder(std::move(waitingTaskHolder_),
[state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
func(ScopedContextTask{state, std::move(h)});
})});
replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder(
std::move(waitingTaskHolder_), [state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
func(ScopedContextTask{state, std::move(h)});
})});
}

void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
Expand All @@ -124,10 +123,10 @@ namespace cms::alpakatools {
template <typename TQueue>
void enqueueCallback(TQueue& stream) {
alpaka::enqueue(stream, alpaka::HostOnlyTask([holder = std::move(waitingTaskHolder_)]() {
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
}));
// The functor is required to be const, but the original waitingTaskHolder_
// needs to be notified...
const_cast<edm::WaitingTaskWithArenaHolder&>(holder).doneWaiting(nullptr);
}));
}

private:
Expand Down
28 changes: 26 additions & 2 deletions src/alpaka/AlpakaCore/alpakaConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <type_traits>

#include <alpaka/alpaka.hpp>
#include <alpaka/alpakaExtra.hpp>

namespace alpaka_common {

Expand Down Expand Up @@ -48,10 +49,10 @@ namespace alpaka_common {
namespace alpaka_cuda_async {
using namespace alpaka_common;

using Platform = alpaka::PltfUniformCudaHipRt;
using Platform = alpaka::PltfCudaRt;
using Device = alpaka::DevCudaRt;
using Queue = alpaka::QueueCudaRtNonBlocking;
using Event = alpaka::EventUniformCudaHipRt;
using Event = alpaka::EventCudaRt;

template <typename TDim>
using Acc = alpaka::AccGpuCudaRt<TDim, Idx>;
Expand All @@ -67,6 +68,29 @@ namespace alpaka_cuda_async {
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async
#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
// Type aliases for the asynchronous ROCm/HIP GPU back-end, mirroring the
// alpaka_cuda_async namespace above.
namespace alpaka_rocm_async {
using namespace alpaka_common;

// platform, device, non-blocking queue and event types of the HIP runtime
using Platform = alpaka::PltfHipRt;
using Device = alpaka::DevHipRt;
using Queue = alpaka::QueueHipRtNonBlocking;
using Event = alpaka::EventHipRt;

// HIP GPU accelerator parametrised on the dimensionality, plus fixed-rank shorthands
template <typename TDim>
using Acc = alpaka::AccGpuHipRt<TDim, Idx>;
using Acc1D = Acc<Dim1D>;
using Acc2D = Acc<Dim2D>;
using Acc3D = Acc<Dim3D>;

} // namespace alpaka_rocm_async

#endif // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async
#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
namespace alpaka_serial_sync {
using namespace alpaka_common;
Expand Down
12 changes: 11 additions & 1 deletion src/alpaka/AlpakaCore/alpakaMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ namespace cms {
inline constexpr bool is_unbounded_array_v = is_unbounded_array<T>::value;
} // namespace cms

#include <alpaka/alpaka.hpp>
#include <alpaka/alpakaExtra.hpp>

#include "AlpakaCore/AllocatorPolicy.h"
#include "AlpakaCore/CachedBufAlloc.h"
#include "AlpakaCore/alpakaConfig.h"
Expand Down Expand Up @@ -75,12 +78,19 @@ namespace cms::alpakatools {
template <typename TBuf>
void pin_buffer(TBuf& buffer) {
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if (not devices<alpaka::PltfUniformCudaHipRt>.empty()) {
if (not cms::alpakatools::devices<alpaka::PltfCudaRt>.empty()) {
// it is possible to initialise the CUDA runtime and call cudaHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(buffer);
}
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfHipRt>.empty()) {
// it is possible to initialise the ROCm runtime and call hipHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(buffer);
}
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
}

// scalar and 1-dimensional host buffers
Expand Down
Loading