From 7cbfa3dcf5aeffb3f390192a6c53102e11e1a690 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 14 Mar 2022 23:22:58 +0100 Subject: [PATCH 1/2] [alpakatest] Support CUDA or ROCm/HIP Allow building the "alpakatest" application with support for either of CUDA or ROCm/HIP. --- src/alpakatest/AlpakaCore/AllocatorPolicy.h | 38 ++++++---- src/alpakatest/AlpakaCore/CachedBufAlloc.h | 74 ++++++++++++++++--- src/alpakatest/AlpakaCore/CachingAllocator.h | 10 ++- src/alpakatest/AlpakaCore/HostOnlyTask.h | 47 +++++++----- src/alpakatest/AlpakaCore/ScopedContext.h | 17 ++--- src/alpakatest/AlpakaCore/alpakaConfig.h | 28 ++++++- src/alpakatest/AlpakaCore/alpakaMemory.h | 12 ++- src/alpakatest/AlpakaCore/alpakaWorkDiv.h | 18 +++++ src/alpakatest/AlpakaCore/backend.h | 2 +- .../AlpakaCore/getDeviceCachingAllocator.h | 2 +- src/alpakatest/AlpakaCore/getDeviceIndex.h | 5 ++ src/alpakatest/Makefile | 67 +++++++++++++++-- src/alpakatest/Makefile.deps | 3 + src/alpakatest/alpaka/alpakaExtra.hpp | 26 +++++++ src/alpakatest/bin/main.cc | 54 +++++++++++--- src/alpakatest/plugins.txt | 3 + 16 files changed, 328 insertions(+), 78 deletions(-) create mode 100644 src/alpakatest/alpaka/alpakaExtra.hpp diff --git a/src/alpakatest/AlpakaCore/AllocatorPolicy.h b/src/alpakatest/AlpakaCore/AllocatorPolicy.h index 162222349..d43478e01 100644 --- a/src/alpakatest/AlpakaCore/AllocatorPolicy.h +++ b/src/alpakatest/AlpakaCore/AllocatorPolicy.h @@ -1,45 +1,51 @@ #ifndef AlpakaCore_AllocatorPolicy_h #define AlpakaCore_AllocatorPolicy_h -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -#include -#endif - #include namespace cms::alpakatools { // Which memory allocator to use - // - Synchronous: (device and host) cudaMalloc and cudaMallocHost + // - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost // - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2) // - Caching: (device and host) caching allocator enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; template - constexpr AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; + constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED template <> - constexpr AllocatorPolicy allocator_policy = -#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR - AllocatorPolicy::Caching; + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; #else - AllocatorPolicy::Synchronous; + AllocatorPolicy::Synchronous; #endif #endif // defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED #if defined ALPAKA_ACC_GPU_CUDA_ENABLED template <> - constexpr AllocatorPolicy allocator_policy = -#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR - AllocatorPolicy::Caching; -#elif CUDA_VERSION >= 11020 && ! 
defined ALPAKA_DISABLE_ASYNC_ALLOCATOR - AllocatorPolicy::Asynchronous; + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; #else - AllocatorPolicy::Synchronous; + AllocatorPolicy::Synchronous; #endif #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace cms::alpakatools #endif // AlpakaCore_AllocatorPolicy_h diff --git a/src/alpakatest/AlpakaCore/CachedBufAlloc.h b/src/alpakatest/AlpakaCore/CachedBufAlloc.h index 84f029848..a57c321da 100644 --- a/src/alpakatest/AlpakaCore/CachedBufAlloc.h +++ b/src/alpakatest/AlpakaCore/CachedBufAlloc.h @@ -2,6 +2,7 @@ #define AlpakaCore_CachedBufAlloc_h #include +#include #include "AlpakaCore/getDeviceCachingAllocator.h" #include "AlpakaCore/getHostCachingAllocator.h" @@ -32,14 +33,14 @@ namespace cms::alpakatools { //! The caching memory allocator implementation for the pinned host memory template - struct CachedBufAlloc { + struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, - alpaka::QueueUniformCudaHipRtNonBlocking queue, + alpaka::QueueCudaRtNonBlocking queue, TExtent const& extent) -> alpaka::BufCpu { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - auto& allocator = getHostCachingAllocator(); + auto& allocator = getHostCachingAllocator(); // FIXME the BufCpu does not support a pitch ? size_t size = alpaka::getExtentProduct(extent); @@ -53,17 +54,15 @@ namespace cms::alpakatools { } }; - //! The caching memory allocator implementation for the CUDA/HIP device + //! The caching memory allocator implementation for the CUDA device template - struct CachedBufAlloc { + struct CachedBufAlloc { template - ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevUniformCudaHipRt const& dev, - TQueue queue, - TExtent const& extent) - -> alpaka::BufUniformCudaHipRt { + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCudaRt { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - auto& allocator = getDeviceCachingAllocator(dev); + auto& allocator = getDeviceCachingAllocator(dev); size_t width = alpaka::getWidth(extent); size_t widthBytes = width * static_cast(sizeof(TElem)); @@ -76,13 +75,66 @@ namespace cms::alpakatools { // use a custom deleter to return the buffer to the CachingAllocator auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; - return alpaka::BufUniformCudaHipRt( + return alpaka::BufCudaRt( dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); } }; #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? 
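+      // compute the total size in bytes and request the allocation from the caching allocator;
+      // the custom deleter below hands the block back to the allocator instead of freeing it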
+ size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! The caching memory allocator implementation for the ROCm/HIP device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufHipRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufHipRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace traits template diff --git a/src/alpakatest/AlpakaCore/CachingAllocator.h b/src/alpakatest/AlpakaCore/CachingAllocator.h index 2de401240..733962d82 100644 --- a/src/alpakatest/AlpakaCore/CachingAllocator.h +++ b/src/alpakatest/AlpakaCore/CachingAllocator.h @@ -15,6 +15,7 @@ #include #include +#include #include "AlpakaCore/alpakaDevices.h" @@ -335,12 +336,19 @@ namespace cms::alpakatools { // for host memory, pin the newly allocated block #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { + if (not cms::alpakatools::devices.empty()) { // it is possible to initialise the CUDA runtime and call cudaHostRegister // only if the system has at least one supported GPU alpaka::prepareForAsyncCopy(*block.buffer); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (not cms::alpakatools::devices.empty()) { + // it is possible to initialise the ROCm runtime and call hipHostRegister + // only if the system has at least one supported GPU + alpaka::prepareForAsyncCopy(*block.buffer); + } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED // create a new event associated to the "synchronisation device" block.event = Event{block.device()}; diff --git a/src/alpakatest/AlpakaCore/HostOnlyTask.h b/src/alpakatest/AlpakaCore/HostOnlyTask.h index 1ea88314a..f0d3431f0 100644 --- a/src/alpakatest/AlpakaCore/HostOnlyTask.h +++ b/src/alpakatest/AlpakaCore/HostOnlyTask.h @@ -5,6 +5,7 @@ #include #include +#include namespace alpaka { @@ -18,37 +19,45 @@ namespace alpaka { std::function task_; }; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) - namespace traits { - //! The CUDA/HIP RT async queue enqueue trait specialization for "safe tasks" + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + //! 
The CUDA async queue enqueue trait specialization for "safe tasks" template <> - struct Enqueue { -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - static void CUDART_CB -#else - static void HIPRT_CB -#endif - uniformCudaHipRtCallback(ALPAKA_API_PREFIX(Stream_t) /*queue*/, - ALPAKA_API_PREFIX(Error_t) /*status*/, - void* arg) { + struct Enqueue { + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); (*pTask)(); } - ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtNonBlocking& queue, HostOnlyTask task) -> void { + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { auto pTask = std::make_unique(std::move(task)); ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamAddCallback)(alpaka::getNativeHandle(queue), - uniformCudaHipRtCallback, - static_cast(pTask.release()), - 0u)); + cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); } }; - } // namespace traits +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#endif // defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + //! The HIP async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace traits } // namespace alpaka diff --git a/src/alpakatest/AlpakaCore/ScopedContext.h b/src/alpakatest/AlpakaCore/ScopedContext.h index 7e1b65d00..4d53cf9e1 100644 --- a/src/alpakatest/AlpakaCore/ScopedContext.h +++ b/src/alpakatest/AlpakaCore/ScopedContext.h @@ -110,11 +110,10 @@ namespace cms::alpakatools { template void pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + std::move(waitingTaskHolder_), [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); } void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { @@ -124,10 +123,10 @@ namespace cms::alpakatools { template void enqueueCallback(TQueue& stream) { alpaka::enqueue(stream, alpaka::HostOnlyTask([holder = std::move(waitingTaskHolder_)]() { - // The functor is required to be const, but the original waitingTaskHolder_ - // needs to be notified... - const_cast(holder).doneWaiting(nullptr); - })); + // The functor is required to be const, but the original waitingTaskHolder_ + // needs to be notified... 
+ const_cast(holder).doneWaiting(nullptr); + })); } private: diff --git a/src/alpakatest/AlpakaCore/alpakaConfig.h b/src/alpakatest/AlpakaCore/alpakaConfig.h index 0ffa17e01..9562c8254 100644 --- a/src/alpakatest/AlpakaCore/alpakaConfig.h +++ b/src/alpakatest/AlpakaCore/alpakaConfig.h @@ -4,6 +4,7 @@ #include #include +#include namespace alpaka_common { @@ -48,10 +49,10 @@ namespace alpaka_common { namespace alpaka_cuda_async { using namespace alpaka_common; - using Platform = alpaka::PltfUniformCudaHipRt; + using Platform = alpaka::PltfCudaRt; using Device = alpaka::DevCudaRt; using Queue = alpaka::QueueCudaRtNonBlocking; - using Event = alpaka::EventUniformCudaHipRt; + using Event = alpaka::EventCudaRt; template using Acc = alpaka::AccGpuCudaRt; @@ -67,6 +68,29 @@ namespace alpaka_cuda_async { #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async #endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +namespace alpaka_rocm_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::QueueHipRtNonBlocking; + using Event = alpaka::EventHipRt; + + template + using Acc = alpaka::AccGpuHipRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_rocm_async + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED namespace alpaka_serial_sync { using namespace alpaka_common; diff --git a/src/alpakatest/AlpakaCore/alpakaMemory.h b/src/alpakatest/AlpakaCore/alpakaMemory.h index 9616497f6..e1ae91435 100644 --- a/src/alpakatest/AlpakaCore/alpakaMemory.h +++ b/src/alpakatest/AlpakaCore/alpakaMemory.h @@ -25,6 +25,9 @@ namespace cms { inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; } // namespace cms +#include +#include + #include "AlpakaCore/AllocatorPolicy.h" #include "AlpakaCore/CachedBufAlloc.h" #include "AlpakaCore/alpakaConfig.h" @@ -75,12 +78,19 @@ namespace cms::alpakatools { template void pin_buffer(TBuf& buffer) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not devices.empty()) { + if (not cms::alpakatools::devices.empty()) { // it is possible to initialise the CUDA runtime and call cudaHostRegister // only if the system has at least one supported GPU alpaka::prepareForAsyncCopy(buffer); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (not cms::alpakatools::devices.empty()) { + // it is possible to initialise the ROCm runtime and call hipHostRegister + // only if the system has at least one supported GPU + alpaka::prepareForAsyncCopy(buffer); + } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED } // scalar and 1-dimensional host buffers diff --git a/src/alpakatest/AlpakaCore/alpakaWorkDiv.h b/src/alpakatest/AlpakaCore/alpakaWorkDiv.h index f0867611a..838a8f720 100644 --- a/src/alpakatest/AlpakaCore/alpakaWorkDiv.h +++ b/src/alpakatest/AlpakaCore/alpakaWorkDiv.h @@ -38,6 +38,15 @@ namespace cms::alpakatools { return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. 
+ const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED { // On CPU backends, run serially with a single thread per block: // - threadsPerBlock is always 1; @@ -63,6 +72,15 @@ namespace cms::alpakatools { return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED { // On CPU backends, run serially with a single thread per block: // - threadsPerBlock is always 1; diff --git a/src/alpakatest/AlpakaCore/backend.h b/src/alpakatest/AlpakaCore/backend.h index c9f8717ae..387154013 100644 --- a/src/alpakatest/AlpakaCore/backend.h +++ b/src/alpakatest/AlpakaCore/backend.h @@ -1,6 +1,6 @@ #ifndef AlpakaCore_backend_h #define AlpakaCore_backend_h -enum class Backend { SERIAL, TBB, CUDA }; +enum class Backend { SERIAL, TBB, CUDA, HIP }; #endif // AlpakaCore_backend_h diff --git a/src/alpakatest/AlpakaCore/getDeviceCachingAllocator.h b/src/alpakatest/AlpakaCore/getDeviceCachingAllocator.h index bfdf4faaa..15369f397 100644 --- a/src/alpakatest/AlpakaCore/getDeviceCachingAllocator.h +++ b/src/alpakatest/AlpakaCore/getDeviceCachingAllocator.h @@ -53,7 +53,7 @@ namespace cms::alpakatools { // initialise all allocators, one per device static auto allocators = detail::allocate_device_allocators(); - auto const index = getDeviceIndex(device); + size_t const index = getDeviceIndex(device); assert(index < cms::alpakatools::devices>.size()); // the public interface is thread safe diff --git a/src/alpakatest/AlpakaCore/getDeviceIndex.h b/src/alpakatest/AlpakaCore/getDeviceIndex.h index 523d50ad1..5abbeaa02 100644 --- a/src/alpakatest/AlpakaCore/getDeviceIndex.h +++ b/src/alpakatest/AlpakaCore/getDeviceIndex.h @@ -19,6 +19,11 @@ namespace cms::alpakatools { inline int getDeviceIndex(alpaka::DevCudaRt const& device) { return alpaka::getNativeHandle(device); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + // overload for DevHipRt + inline int getDeviceIndex(alpaka::DevHipRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace cms::alpakatools #endif // AlpakaCore_getDeviceIndex_h diff --git a/src/alpakatest/Makefile b/src/alpakatest/Makefile index 1cd82abf8..281871c13 100644 --- a/src/alpakatest/Makefile +++ b/src/alpakatest/Makefile @@ -16,9 +16,14 @@ test_nvidiagpu: $(TARGET) @echo "Testing $(TARGET)" $(TARGET) --maxEvents 2 --cuda @echo "Succeeded" -test_intelagpu: +test_amdgpu: $(TARGET) + @echo + @echo "Testing $(TARGET)" + $(TARGET) --maxEvents 2 --hip + @echo "Succeeded" +test_intelgpu: test_auto: -.PHONY: test_cpu test_nvidiagpu test_intelgpu test_auto +.PHONY: test_cpu test_nvidiagpu test_amdgpu test_intelgpu test_auto EXE_SRC := $(wildcard $(TARGET_DIR)/bin/*.cc) EXE_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(EXE_SRC:%=%.o)) @@ -30,6 +35,9 @@ MY_CXXFLAGS := -I$(TARGET_DIR) -DSRC_DIR=$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TA ifdef CUDA_BASE MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_ENABLED 
-DALPAKA_ACC_GPU_CUDA_ONLY_MODE endif +ifdef ROCM_BASE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ONLY_MODE +endif MY_LDFLAGS := -ldl -Wl,-rpath,$(LIB_DIR)/$(TARGET_NAME) LIB_LDFLAGS := -L$(LIB_DIR)/$(TARGET_NAME) @@ -69,8 +77,16 @@ $(1)_CUDA_LDFLAGS := -l$(1)_cuda $(1)_CUOBJ := $$($(1)_CUDA_OBJ) $(1)_CUDADLINK := $$(if $$(strip $$($(1)_CUOBJ)),$(OBJ_DIR)/$(TARGET_NAME)/$(1)/lib$(1)_cudalink.o) endif +# ROCm backend +ifdef ROCM_BASE +$(1)_ROCM_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_PORTABLE_SRC:%=%.rocm.o)) +$(1)_ROCM_DEP := $$($(1)_ROCM_OBJ:$.o=$.d) +$(1)_ROCM_LIB := $(LIB_DIR)/$(TARGET_NAME)/lib$(1)_rocm.so +LIBS += $$($(1)_ROCM_LIB) +$(1)_ROCM_LDFLAGS := -l$(1)_rocm +endif endif # if PORTABLE_SRC is not empty -ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) +ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) $$($(1)_ROCM_DEP) endef $(foreach lib,$(LIBNAMES),$(eval $(call LIB_template,$(lib)))) @@ -108,8 +124,16 @@ PLUGINNAMES += $(1)_cuda $(1)_CUOBJ := $$($(1)_CUDA_OBJ) $(1)_CUDADLINK := $$(if $$(strip $$($(1)_CUOBJ)),$(OBJ_DIR)/$(TARGET_NAME)/plugin-$(1)/plugin$(1)_cudadlink.o,) endif +# ROCm backend +ifdef ROCM_BASE +$(1)_ROCM_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_PORTABLE_SRC:%=%.rocm.o)) +$(1)_ROCM_DEP := $$($(1)_ROCM_OBJ:$.o=$.d) +$(1)_ROCM_LIB := $(LIB_DIR)/$(TARGET_NAME)/plugin$(1)_rocm.so +PLUGINS += $$($(1)_ROCM_LIB) +PLUGINNAMES += $(1)_rocm +endif endif # if PORTABLE_SRC is not empty -ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) +ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) $$($(1)_ROCM_DEP) endef $(foreach lib,$(PLUGINNAMES),$(eval $(call PLUGIN_template,$(lib)))) @@ -130,11 +154,17 @@ TESTS_CUDA_DEP := $(TESTS_CUDA_OBJ:$.o=$.d) TESTS_CUDA_EXE := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc,$(TEST_DIR)/$(TARGET_NAME)/%.cuda,$(TESTS_PORTABLE_SRC)) TESTS_CUDADLINK := $(TESTS_CUDA_OBJ:$cu.o=$cudadlink.o) endif +# ROCm backend +ifdef ROCM_BASE +TESTS_ROCM_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(TESTS_PORTABLE_SRC:%=%.rocm.o)) +TESTS_ROCM_DEP := $(TESTS_ROCM_OBJ:$.o=$.d) +TESTS_ROCM_EXE := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc,$(TEST_DIR)/$(TARGET_NAME)/%.rocm,$(TESTS_PORTABLE_SRC)) +endif # -TESTS_EXE := $(TESTS_SERIAL_EXE) $(TESTS_TBB_EXE) $(TESTS_CUDA_EXE) -ALL_DEPENDS += $(TESTS_SERIAL_DEP) $(TESTS_TBB_DEP) $(TESTS_CUDA_DEP) +TESTS_EXE := $(TESTS_SERIAL_EXE) $(TESTS_TBB_EXE) $(TESTS_CUDA_EXE) $(TESTS_ROCM_EXE) +ALL_DEPENDS += $(TESTS_SERIAL_DEP) $(TESTS_TBB_DEP) $(TESTS_CUDA_DEP) $(TESTS_ROCM_DEP) # Needed to keep the unit test object files after building $(TARGET) -.SECONDARY: $(TESTS_SERIAL_OBJ) $(TESTS_TBB_OBJ) $(TESTS_CUDA_OBJ) $(TESTS_CUDADLINK) +.SECONDARY: $(TESTS_SERIAL_OBJ) $(TESTS_TBB_OBJ) $(TESTS_CUDA_OBJ) $(TESTS_CUDADLINK) $(TESTS_ROCM_OBJ) define RUNTEST_template run_$(1): $(1) @@ -147,6 +177,7 @@ endef $(foreach test,$(TESTS_SERIAL_EXE),$(eval $(call RUNTEST_template,$(test),cpu))) $(foreach test,$(TESTS_TBB_EXE),$(eval $(call RUNTEST_template,$(test),cpu))) $(foreach test,$(TESTS_CUDA_EXE),$(eval $(call RUNTEST_template,$(test),nvidiagpu))) +$(foreach test,$(TESTS_ROCM_EXE),$(eval $(call RUNTEST_template,$(test),amdgpu))) -include $(ALL_DEPENDS) @@ -180,6 +211,10 @@ $$($(1)_CUDA_LIB): $$($(1)_CUDA_OBJ) $$($(1)_CUDADLINK) $$(foreach dep,$(EXTERNA @[ -d $$(@D) ] || mkdir -p $$(@D) $(CXX) $$($(1)_CUDA_OBJ) $$($(1)_CUDADLINK) $(LDFLAGS) -shared 
$(SO_LDFLAGS) $(LIB_LDFLAGS) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LDFLAGS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_CUDA_LDFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_LDFLAGS)) -o $$@ +$$($(1)_ROCM_LIB): $$($(1)_ROCM_OBJ) $$(foreach dep,$(EXTERNAL_DEPENDS_H),$$($$(dep)_DEPS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LIB)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_ROCM_LIB)) + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(CXX) $$($(1)_ROCM_OBJ) $(LDFLAGS) -shared $(SO_LDFLAGS) $(LIB_LDFLAGS) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LDFLAGS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_ROCM_LDFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_LDFLAGS)) -o $$@ + # Anything depending on Alpaka # Portable code, for serial backend $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc @@ -211,6 +246,13 @@ $$($(1)_CUDADLINK): $$($(1)_CUOBJ) $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $$($(1)_CUOBJ) -o $$@ endif +# Portable code, for ROCm backend +ifdef ROCM_BASE +$(OBJ_DIR)/$(2)/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(2)/alpaka/%.cc + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD +endif + endef $(foreach lib,$(LIBNAMES),$(eval $(call BUILD_template,$(lib),$(TARGET_NAME)/$(lib)))) @@ -267,3 +309,14 @@ $(TEST_DIR)/$(TARGET_NAME)/%.cuda: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cu @[ -d $(@D) ] || mkdir -p $(@D) $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif + +# ROCm backend +ifdef ROCM_BASE +$(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc + @[ -d $(@D) ] || mkdir -p $(@D) + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + +$(TEST_DIR)/$(TARGET_NAME)/%.rocm: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o | $(LIBS) + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) +endif diff --git a/src/alpakatest/Makefile.deps b/src/alpakatest/Makefile.deps index f12aa3ab2..3082151c2 100644 --- a/src/alpakatest/Makefile.deps +++ b/src/alpakatest/Makefile.deps @@ -2,5 +2,8 @@ alpakatest_EXTERNAL_DEPENDS := TBB ALPAKA BOOST BACKTRACE ifdef CUDA_BASE alpakatest_EXTERNAL_DEPENDS += CUDA endif +ifdef ROCM_BASE +alpakatest_EXTERNAL_DEPENDS += ROCM +endif Test1_DEPENDS := Framework DataFormats AlpakaCore Test2_DEPENDS := Framework DataFormats AlpakaCore diff --git a/src/alpakatest/alpaka/alpakaExtra.hpp b/src/alpakatest/alpaka/alpakaExtra.hpp new file mode 100644 index 000000000..93d29a119 --- /dev/null +++ b/src/alpakatest/alpaka/alpakaExtra.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +// TODO move this into Alpaka +namespace alpaka { + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + using PltfCudaRt = PltfUniformCudaHipRt; + + using EventCudaRt = EventUniformCudaHipRt; + + template + using BufCudaRt = BufUniformCudaHipRt; +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + using PltfHipRt = PltfUniformCudaHipRt; + + using EventHipRt = EventUniformCudaHipRt; + + template + using BufHipRt = BufUniformCudaHipRt; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace alpaka 
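For reference, a minimal usage sketch of the ROCm/HIP aliases defined above, assuming the AlpakaCore helpers used elsewhere in this series (cms::alpakatools::initialise<>() and the devices<> list) and their header paths; it is illustrative only and not part of the patch:

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
#include <iostream>

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaDevices.h"
#include "AlpakaCore/initialise.h"

void listHipDevices() {
  using namespace alpaka_rocm_async;
  // enumerate the ROCm/HIP devices once and cache them in cms::alpakatools::devices<Platform>
  cms::alpakatools::initialise<Platform>();
  for (auto const& device : cms::alpakatools::devices<Platform>) {
    std::cout << alpaka::getName(device) << std::endl;
    Queue queue{device};  // alpaka::QueueHipRtNonBlocking
    Event event{device};  // alpaka::EventHipRt
    alpaka::enqueue(queue, event);
    alpaka::wait(event);
  }
}
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

The same pattern applies to the CUDA aliases through the alpaka_cuda_async namespace.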
diff --git a/src/alpakatest/bin/main.cc b/src/alpakatest/bin/main.cc index 140b333f9..ba18f372e 100644 --- a/src/alpakatest/bin/main.cc +++ b/src/alpakatest/bin/main.cc @@ -1,10 +1,14 @@ #include #include #include +#include #include #include +#include #include +#include #include +#include #include #include @@ -20,8 +24,20 @@ namespace { void print_help(std::string const& name) { std::cout - << name - << ": [--serial] [--tbb] [--cuda] [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " + << name << ": " +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + << "[--serial] " +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + << "[--tbb] " +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + << "[--cuda] " +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + << "[--hip] " +#endif + << "[--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " "[--transfer]\n\n" << "Options\n" #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED @@ -32,6 +48,9 @@ namespace { #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED << " --cuda Use CUDA backend\n" +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + << " --hip Use ROCm/HIP backend\n" #endif << " --numberOfThreads Number of threads to use (default 1, use 0 to use all CPU cores)\n" << " --numberOfStreams Number of concurrent events (default 0 = numberOfThreads)\n" @@ -55,6 +74,7 @@ int main(int argc, char** argv) { int runForMinutes = -1; std::filesystem::path datadir; bool transfer = false; + bool empty = false; for (auto i = args.begin() + 1, e = args.end(); i != e; ++i) { if (*i == "-h" or *i == "--help") { print_help(args.front()); @@ -70,6 +90,10 @@ int main(int argc, char** argv) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED } else if (*i == "--cuda") { backends.emplace_back(Backend::CUDA); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + } else if (*i == "--hip") { + backends.emplace_back(Backend::HIP); #endif } else if (*i == "--numberOfThreads") { ++i; @@ -88,6 +112,8 @@ int main(int argc, char** argv) { datadir = *i; } else if (*i == "--transfer") { transfer = true; + } else if (*i == "--empty") { + empty = true; } else { std::cout << "Invalid parameter " << *i << std::endl << std::endl; print_help(args.front()); @@ -128,27 +154,35 @@ int main(int argc, char** argv) { cms::alpakatools::initialise(); } #endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (std::find(backends.begin(), backends.end(), Backend::HIP) != backends.end()) { + cms::alpakatools::initialise(); + } +#endif // Initialize EventProcessor std::vector edmodules; std::vector esmodules; - if (not backends.empty()) { - auto addModules = [&](std::string const& prefix, Backend backend) { + if (not empty) { + auto addModules = [&](std::string const& accelerator_namespace, Backend backend) { if (std::find(backends.begin(), backends.end(), backend) != backends.end()) { - edmodules.emplace_back(prefix + "TestProducer"); - edmodules.emplace_back(prefix + "TestProducer3"); - edmodules.emplace_back(prefix + "TestProducer2"); + edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer"); + edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer3"); + edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer2"); } }; #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - addModules("alpaka_serial_sync::", Backend::SERIAL); + addModules("alpaka_serial_sync", Backend::SERIAL); #endif #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - addModules("alpaka_tbb_async::", Backend::TBB); + addModules("alpaka_tbb_async", Backend::TBB); #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - 
addModules("alpaka_cuda_async::", Backend::CUDA); + addModules("alpaka_cuda_async", Backend::CUDA); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + addModules("alpaka_rocm_async", Backend::HIP); #endif esmodules = {"IntESProducer"}; if (transfer) { diff --git a/src/alpakatest/plugins.txt b/src/alpakatest/plugins.txt index 4f454dba9..6c4d0c386 100644 --- a/src/alpakatest/plugins.txt +++ b/src/alpakatest/plugins.txt @@ -2,9 +2,12 @@ IntESProducer pluginTest1.so alpaka_serial_sync::TestProducer pluginTest1_serial.so alpaka_tbb_async::TestProducer pluginTest1_tbb.so alpaka_cuda_async::TestProducer pluginTest1_cuda.so +alpaka_rocm_async::TestProducer pluginTest1_rocm.so alpaka_serial_sync::TestProducer2 pluginTest2_serial.so alpaka_tbb_async::TestProducer2 pluginTest2_tbb.so alpaka_cuda_async::TestProducer2 pluginTest2_cuda.so +alpaka_rocm_async::TestProducer2 pluginTest2_rocm.so alpaka_serial_sync::TestProducer3 pluginTest2_serial.so alpaka_tbb_async::TestProducer3 pluginTest2_tbb.so alpaka_cuda_async::TestProducer3 pluginTest2_cuda.so +alpaka_rocm_async::TestProducer3 pluginTest2_rocm.so From 23428cc8d463a5be92753fcdd64a8700da37dfa1 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 14 Mar 2022 23:22:58 +0100 Subject: [PATCH 2/2] [alpaka] Support CUDA or ROCm/HIP Allow building the "alpaka" application with support for either of CUDA or ROCm/HIP. --- src/alpaka/AlpakaCore/AllocatorPolicy.h | 38 +++--- src/alpaka/AlpakaCore/CachedBufAlloc.h | 74 +++++++++-- src/alpaka/AlpakaCore/CachingAllocator.h | 10 +- src/alpaka/AlpakaCore/HostOnlyTask.h | 47 ++++--- src/alpaka/AlpakaCore/ScopedContext.h | 17 ++- src/alpaka/AlpakaCore/alpakaConfig.h | 28 +++- src/alpaka/AlpakaCore/alpakaMemory.h | 12 +- src/alpaka/AlpakaCore/alpakaWorkDiv.h | 18 +++ src/alpaka/AlpakaCore/backend.h | 2 +- src/alpaka/AlpakaCore/eigenSoA.h | 4 +- .../AlpakaCore/getDeviceCachingAllocator.h | 2 +- src/alpaka/AlpakaCore/getDeviceIndex.h | 5 + src/alpaka/AlpakaCore/initialise.h | 4 +- src/alpaka/AlpakaCore/prefixScan.h | 121 ++++++++++++------ src/alpaka/AlpakaCore/radixSort.h | 12 +- .../TrackingRecHit2DSoAView.h | 5 + .../AlpakaDataFormats/alpaka/BeamSpotAlpaka.h | 2 +- .../alpaka/SiPixelClustersAlpaka.h | 2 +- .../alpaka/SiPixelDigiErrorsAlpaka.h | 2 +- .../alpaka/SiPixelDigisAlpaka.h | 2 +- .../alpaka/TrackingRecHit2DAlpaka.h | 2 +- src/alpaka/DataFormats/approx_atan2.h | 2 +- src/alpaka/Makefile | 71 ++++++++-- src/alpaka/Makefile.deps | 3 + src/alpaka/alpaka/alpakaExtra.hpp | 26 ++++ src/alpaka/bin/main.cc | 31 ++++- .../alpaka/gpuSortByPt2.h | 2 +- .../alpaka/gpuSplitVertices.h | 2 +- .../alpaka/SiPixelRawToClusterGPUKernel.cc | 4 +- .../alpaka/gpuClustering.h | 2 +- .../alpaka/HistoValidator.cc | 26 ++-- src/alpaka/plugins.txt | 12 ++ src/alpaka/test/alpaka/clustering_t.cc | 4 +- src/alpaka/test/alpaka/prefixScan_t.cc | 11 +- 34 files changed, 458 insertions(+), 147 deletions(-) create mode 100644 src/alpaka/alpaka/alpakaExtra.hpp diff --git a/src/alpaka/AlpakaCore/AllocatorPolicy.h b/src/alpaka/AlpakaCore/AllocatorPolicy.h index 162222349..d43478e01 100644 --- a/src/alpaka/AlpakaCore/AllocatorPolicy.h +++ b/src/alpaka/AlpakaCore/AllocatorPolicy.h @@ -1,45 +1,51 @@ #ifndef AlpakaCore_AllocatorPolicy_h #define AlpakaCore_AllocatorPolicy_h -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -#include -#endif - #include namespace cms::alpakatools { // Which memory allocator to use - // - Synchronous: (device and host) cudaMalloc and cudaMallocHost + // - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost 
// - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2) // - Caching: (device and host) caching allocator enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; template - constexpr AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; + constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED template <> - constexpr AllocatorPolicy allocator_policy = -#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR - AllocatorPolicy::Caching; + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; #else - AllocatorPolicy::Synchronous; + AllocatorPolicy::Synchronous; #endif #endif // defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED #if defined ALPAKA_ACC_GPU_CUDA_ENABLED template <> - constexpr AllocatorPolicy allocator_policy = -#if ! defined ALPAKA_DISABLE_CACHING_ALLOCATOR - AllocatorPolicy::Caching; -#elif CUDA_VERSION >= 11020 && ! defined ALPAKA_DISABLE_ASYNC_ALLOCATOR - AllocatorPolicy::Asynchronous; + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; #else - AllocatorPolicy::Synchronous; + AllocatorPolicy::Synchronous; #endif #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + template <> + constexpr inline AllocatorPolicy allocator_policy = +#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR + AllocatorPolicy::Caching; +#else + AllocatorPolicy::Synchronous; +#endif +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace cms::alpakatools #endif // AlpakaCore_AllocatorPolicy_h diff --git a/src/alpaka/AlpakaCore/CachedBufAlloc.h b/src/alpaka/AlpakaCore/CachedBufAlloc.h index 84f029848..a57c321da 100644 --- a/src/alpaka/AlpakaCore/CachedBufAlloc.h +++ b/src/alpaka/AlpakaCore/CachedBufAlloc.h @@ -2,6 +2,7 @@ #define AlpakaCore_CachedBufAlloc_h #include +#include #include "AlpakaCore/getDeviceCachingAllocator.h" #include "AlpakaCore/getHostCachingAllocator.h" @@ -32,14 +33,14 @@ namespace cms::alpakatools { //! The caching memory allocator implementation for the pinned host memory template - struct CachedBufAlloc { + struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, - alpaka::QueueUniformCudaHipRtNonBlocking queue, + alpaka::QueueCudaRtNonBlocking queue, TExtent const& extent) -> alpaka::BufCpu { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - auto& allocator = getHostCachingAllocator(); + auto& allocator = getHostCachingAllocator(); // FIXME the BufCpu does not support a pitch ? size_t size = alpaka::getExtentProduct(extent); @@ -53,17 +54,15 @@ namespace cms::alpakatools { } }; - //! The caching memory allocator implementation for the CUDA/HIP device + //! 
The caching memory allocator implementation for the CUDA device template - struct CachedBufAlloc { + struct CachedBufAlloc { template - ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevUniformCudaHipRt const& dev, - TQueue queue, - TExtent const& extent) - -> alpaka::BufUniformCudaHipRt { + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufCudaRt { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - auto& allocator = getDeviceCachingAllocator(dev); + auto& allocator = getDeviceCachingAllocator(dev); size_t width = alpaka::getWidth(extent); size_t widthBytes = width * static_cast(sizeof(TElem)); @@ -76,13 +75,66 @@ namespace cms::alpakatools { // use a custom deleter to return the buffer to the CachingAllocator auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; - return alpaka::BufUniformCudaHipRt( + return alpaka::BufCudaRt( dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); } }; #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + + //! The caching memory allocator implementation for the pinned host memory + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, + alpaka::QueueHipRtNonBlocking queue, + TExtent const& extent) -> alpaka::BufCpu { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getHostCachingAllocator(); + + // FIXME the BufCpu does not support a pitch ? + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufCpu(dev, reinterpret_cast(memPtr), std::move(deleter), extent); + } + }; + + //! 
The caching memory allocator implementation for the ROCm/HIP device + template + struct CachedBufAlloc { + template + ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) + -> alpaka::BufHipRt { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + auto& allocator = getDeviceCachingAllocator(dev); + + size_t width = alpaka::getWidth(extent); + size_t widthBytes = width * static_cast(sizeof(TElem)); + // TODO implement pitch for TDim > 1 + size_t pitchBytes = widthBytes; + size_t size = alpaka::getExtentProduct(extent); + size_t sizeBytes = size * sizeof(TElem); + void* memPtr = allocator.allocate(sizeBytes, queue); + + // use a custom deleter to return the buffer to the CachingAllocator + auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; + + return alpaka::BufHipRt( + dev, reinterpret_cast(memPtr), std::move(deleter), pitchBytes, extent); + } + }; + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace traits template diff --git a/src/alpaka/AlpakaCore/CachingAllocator.h b/src/alpaka/AlpakaCore/CachingAllocator.h index 2de401240..733962d82 100644 --- a/src/alpaka/AlpakaCore/CachingAllocator.h +++ b/src/alpaka/AlpakaCore/CachingAllocator.h @@ -15,6 +15,7 @@ #include #include +#include #include "AlpakaCore/alpakaDevices.h" @@ -335,12 +336,19 @@ namespace cms::alpakatools { // for host memory, pin the newly allocated block #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { + if (not cms::alpakatools::devices.empty()) { // it is possible to initialise the CUDA runtime and call cudaHostRegister // only if the system has at least one supported GPU alpaka::prepareForAsyncCopy(*block.buffer); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (not cms::alpakatools::devices.empty()) { + // it is possible to initialise the ROCm runtime and call hipHostRegister + // only if the system has at least one supported GPU + alpaka::prepareForAsyncCopy(*block.buffer); + } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED // create a new event associated to the "synchronisation device" block.event = Event{block.device()}; diff --git a/src/alpaka/AlpakaCore/HostOnlyTask.h b/src/alpaka/AlpakaCore/HostOnlyTask.h index 1ea88314a..f0d3431f0 100644 --- a/src/alpaka/AlpakaCore/HostOnlyTask.h +++ b/src/alpaka/AlpakaCore/HostOnlyTask.h @@ -5,6 +5,7 @@ #include #include +#include namespace alpaka { @@ -18,37 +19,45 @@ namespace alpaka { std::function task_; }; -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) - namespace traits { - //! The CUDA/HIP RT async queue enqueue trait specialization for "safe tasks" + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + //! 
The CUDA async queue enqueue trait specialization for "safe tasks" template <> - struct Enqueue { -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - static void CUDART_CB -#else - static void HIPRT_CB -#endif - uniformCudaHipRtCallback(ALPAKA_API_PREFIX(Stream_t) /*queue*/, - ALPAKA_API_PREFIX(Error_t) /*status*/, - void* arg) { + struct Enqueue { + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); (*pTask)(); } - ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtNonBlocking& queue, HostOnlyTask task) -> void { + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { auto pTask = std::make_unique(std::move(task)); ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( - ALPAKA_API_PREFIX(StreamAddCallback)(alpaka::getNativeHandle(queue), - uniformCudaHipRtCallback, - static_cast(pTask.release()), - 0u)); + cudaStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); } }; - } // namespace traits +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#endif // defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + //! The HIP async queue enqueue trait specialization for "safe tasks" + template <> + struct Enqueue { + static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); + std::unique_ptr pTask(static_cast(arg)); + (*pTask)(); + } + + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + auto pTask = std::make_unique(std::move(task)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + } + }; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + } // namespace traits } // namespace alpaka diff --git a/src/alpaka/AlpakaCore/ScopedContext.h b/src/alpaka/AlpakaCore/ScopedContext.h index 7e1b65d00..4d53cf9e1 100644 --- a/src/alpaka/AlpakaCore/ScopedContext.h +++ b/src/alpaka/AlpakaCore/ScopedContext.h @@ -110,11 +110,10 @@ namespace cms::alpakatools { template void pushNextTask(F&& f, ContextState const* state) { - replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ - edm::make_waiting_task_with_holder(std::move(waitingTaskHolder_), - [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { - func(ScopedContextTask{state, std::move(h)}); - })}); + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{edm::make_waiting_task_with_holder( + std::move(waitingTaskHolder_), [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(ScopedContextTask{state, std::move(h)}); + })}); } void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { @@ -124,10 +123,10 @@ namespace cms::alpakatools { template void enqueueCallback(TQueue& stream) { alpaka::enqueue(stream, alpaka::HostOnlyTask([holder = std::move(waitingTaskHolder_)]() { - // The functor is required to be const, but the original waitingTaskHolder_ - // needs to be notified... - const_cast(holder).doneWaiting(nullptr); - })); + // The functor is required to be const, but the original waitingTaskHolder_ + // needs to be notified... 
+ const_cast(holder).doneWaiting(nullptr); + })); } private: diff --git a/src/alpaka/AlpakaCore/alpakaConfig.h b/src/alpaka/AlpakaCore/alpakaConfig.h index 0ffa17e01..9562c8254 100644 --- a/src/alpaka/AlpakaCore/alpakaConfig.h +++ b/src/alpaka/AlpakaCore/alpakaConfig.h @@ -4,6 +4,7 @@ #include #include +#include namespace alpaka_common { @@ -48,10 +49,10 @@ namespace alpaka_common { namespace alpaka_cuda_async { using namespace alpaka_common; - using Platform = alpaka::PltfUniformCudaHipRt; + using Platform = alpaka::PltfCudaRt; using Device = alpaka::DevCudaRt; using Queue = alpaka::QueueCudaRtNonBlocking; - using Event = alpaka::EventUniformCudaHipRt; + using Event = alpaka::EventCudaRt; template using Acc = alpaka::AccGpuCudaRt; @@ -67,6 +68,29 @@ namespace alpaka_cuda_async { #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async #endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +namespace alpaka_rocm_async { + using namespace alpaka_common; + + using Platform = alpaka::PltfHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::QueueHipRtNonBlocking; + using Event = alpaka::EventHipRt; + + template + using Acc = alpaka::AccGpuHipRt; + using Acc1D = Acc; + using Acc2D = Acc; + using Acc3D = Acc; + +} // namespace alpaka_rocm_async + +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED namespace alpaka_serial_sync { using namespace alpaka_common; diff --git a/src/alpaka/AlpakaCore/alpakaMemory.h b/src/alpaka/AlpakaCore/alpakaMemory.h index 9616497f6..e1ae91435 100644 --- a/src/alpaka/AlpakaCore/alpakaMemory.h +++ b/src/alpaka/AlpakaCore/alpakaMemory.h @@ -25,6 +25,9 @@ namespace cms { inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; } // namespace cms +#include +#include + #include "AlpakaCore/AllocatorPolicy.h" #include "AlpakaCore/CachedBufAlloc.h" #include "AlpakaCore/alpakaConfig.h" @@ -75,12 +78,19 @@ namespace cms::alpakatools { template void pin_buffer(TBuf& buffer) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not devices.empty()) { + if (not cms::alpakatools::devices.empty()) { // it is possible to initialise the CUDA runtime and call cudaHostRegister // only if the system has at least one supported GPU alpaka::prepareForAsyncCopy(buffer); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (not cms::alpakatools::devices.empty()) { + // it is possible to initialise the ROCm runtime and call hipHostRegister + // only if the system has at least one supported GPU + alpaka::prepareForAsyncCopy(buffer); + } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED } // scalar and 1-dimensional host buffers diff --git a/src/alpaka/AlpakaCore/alpakaWorkDiv.h b/src/alpaka/AlpakaCore/alpakaWorkDiv.h index f0867611a..838a8f720 100644 --- a/src/alpaka/AlpakaCore/alpakaWorkDiv.h +++ b/src/alpaka/AlpakaCore/alpakaWorkDiv.h @@ -38,6 +38,15 @@ namespace cms::alpakatools { return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#if ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. 
+ const auto elementsPerThread = Idx{1}; + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED { // On CPU backends, run serially with a single thread per block: // - threadsPerBlock is always 1; @@ -63,6 +72,15 @@ namespace cms::alpakatools { return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if constexpr (std::is_same_v>) { + // On GPU backends, each thread is looking at a single element: + // - threadsPerBlockOrElementsPerThread is the number of threads per block; + // - elementsPerThread is always 1. + const auto elementsPerThread = Vec::ones(); + return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + } else +#endif // ALPAKA_ACC_GPU_HIP_ENABLED { // On CPU backends, run serially with a single thread per block: // - threadsPerBlock is always 1; diff --git a/src/alpaka/AlpakaCore/backend.h b/src/alpaka/AlpakaCore/backend.h index c9f8717ae..387154013 100644 --- a/src/alpaka/AlpakaCore/backend.h +++ b/src/alpaka/AlpakaCore/backend.h @@ -1,6 +1,6 @@ #ifndef AlpakaCore_backend_h #define AlpakaCore_backend_h -enum class Backend { SERIAL, TBB, CUDA }; +enum class Backend { SERIAL, TBB, CUDA, HIP }; #endif // AlpakaCore_backend_h diff --git a/src/alpaka/AlpakaCore/eigenSoA.h b/src/alpaka/AlpakaCore/eigenSoA.h index 5f1c7ebce..5c136ec90 100644 --- a/src/alpaka/AlpakaCore/eigenSoA.h +++ b/src/alpaka/AlpakaCore/eigenSoA.h @@ -19,9 +19,9 @@ namespace eigenSoA { using Scalar = T; ALPAKA_FN_HOST_ACC constexpr Scalar& operator()(int32_t i) { return data_[i]; } - ALPAKA_FN_ACC constexpr const Scalar operator()(int32_t i) const { return data_[i]; } + ALPAKA_FN_HOST_ACC constexpr const Scalar operator()(int32_t i) const { return data_[i]; } ALPAKA_FN_HOST_ACC constexpr Scalar& operator[](int32_t i) { return data_[i]; } - ALPAKA_FN_ACC constexpr const Scalar operator[](int32_t i) const { return data_[i]; } + ALPAKA_FN_HOST_ACC constexpr const Scalar operator[](int32_t i) const { return data_[i]; } ALPAKA_FN_HOST_ACC constexpr Scalar* data() { return data_; } ALPAKA_FN_HOST_ACC constexpr Scalar const* data() const { return data_; } diff --git a/src/alpaka/AlpakaCore/getDeviceCachingAllocator.h b/src/alpaka/AlpakaCore/getDeviceCachingAllocator.h index bfdf4faaa..15369f397 100644 --- a/src/alpaka/AlpakaCore/getDeviceCachingAllocator.h +++ b/src/alpaka/AlpakaCore/getDeviceCachingAllocator.h @@ -53,7 +53,7 @@ namespace cms::alpakatools { // initialise all allocators, one per device static auto allocators = detail::allocate_device_allocators(); - auto const index = getDeviceIndex(device); + size_t const index = getDeviceIndex(device); assert(index < cms::alpakatools::devices>.size()); // the public interface is thread safe diff --git a/src/alpaka/AlpakaCore/getDeviceIndex.h b/src/alpaka/AlpakaCore/getDeviceIndex.h index 523d50ad1..5abbeaa02 100644 --- a/src/alpaka/AlpakaCore/getDeviceIndex.h +++ b/src/alpaka/AlpakaCore/getDeviceIndex.h @@ -19,6 +19,11 @@ namespace cms::alpakatools { inline int getDeviceIndex(alpaka::DevCudaRt const& device) { return alpaka::getNativeHandle(device); } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + // overload for DevHipRt + inline int getDeviceIndex(alpaka::DevHipRt const& device) { return alpaka::getNativeHandle(device); } +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + } // namespace cms::alpakatools #endif // 
AlpakaCore_getDeviceIndex_h diff --git a/src/alpaka/AlpakaCore/initialise.h b/src/alpaka/AlpakaCore/initialise.h index 0dca4d131..98ba966a1 100644 --- a/src/alpaka/AlpakaCore/initialise.h +++ b/src/alpaka/AlpakaCore/initialise.h @@ -12,14 +12,14 @@ namespace cms::alpakatools { template void initialise() { - constexpr const char* suffix[] = { "devices.", "device:", "devices:" }; + constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; if (devices.empty()) { devices = enumerate(); auto size = devices.size(); //std::cout << edm::demangle << " platform succesfully initialised." << std::endl; std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; - for (auto const& device: devices) { + for (auto const& device : devices) { std::cout << " - " << alpaka::getName(device) << std::endl; } } else { diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index e317eeaf0..e86d6b4b3 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -12,15 +12,27 @@ namespace cms { namespace alpakatools { -#if defined ALPAKA_ACC_GPU_CUDA_ENABLED && __CUDA_ARCH__ + // FIXME warpSize should be device-dependent + constexpr uint32_t warpSize = 32; + constexpr uint64_t warpMask = ~(~0ull << warpSize); + +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) template ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T const* ci, T* co, uint32_t i, uint32_t mask) { +#if defined(__HIP_DEVICE_COMPILE__) + ALPAKA_ASSERT_OFFLOAD(mask == warpMask); +#endif // ci and co may be the same auto x = ci[i]; CMS_UNROLL_LOOP - for (int offset = 1; offset < 32; offset <<= 1) { + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { +#if defined(__CUDA_ARCH__) auto y = __shfl_up_sync(mask, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif if (laneId >= offset) x += y; } @@ -29,49 +41,64 @@ namespace cms { template ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(uint32_t laneId, T* c, uint32_t i, uint32_t mask) { +#if defined(__HIP_DEVICE_COMPILE__) + ALPAKA_ASSERT_OFFLOAD(mask == warpMask); +#endif auto x = c[i]; CMS_UNROLL_LOOP - for (int offset = 1; offset < 32; offset <<= 1) { + for (uint32_t offset = 1; offset < warpSize; offset <<= 1) { +#if defined(__CUDA_ARCH__) auto y = __shfl_up_sync(mask, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif if (laneId >= offset) x += y; } c[i] = x; } -#endif // defined ALPAKA_ACC_GPU_CUDA_ENABLED & ! 
defined ALPAKA_HOST_ONLY +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) - // limited to 32*32 elements + // limited to warpSize² elements template ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan( const TAcc& acc, T const* ci, T* co, uint32_t size, T* ws = nullptr) { -#if defined ALPAKA_ACC_GPU_CUDA_ENABLED && __CUDA_ARCH__ +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); ALPAKA_ASSERT_OFFLOAD(ws); - ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); auto first = blockThreadIdx; - auto mask = __ballot_sync(0xffffffff, first < size); - auto laneId = blockThreadIdx & 0x1f; +#if defined(__CUDA_ARCH__) + auto mask = __ballot_sync(warpMask, first < size); +#elif defined(__HIP_DEVICE_COMPILE__) + auto mask = warpMask; +#endif + auto laneId = blockThreadIdx & (warpSize - 1); for (auto i = first; i < size; i += blockDimension) { warpPrefixScan(laneId, ci, co, i, mask); - auto warpId = i / 32; - ALPAKA_ASSERT_OFFLOAD(warpId < 32); - if (31 == laneId) + auto warpId = i / warpSize; + // FIXME test ? + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) ws[warpId] = co[i]; +#if defined(__CUDA_ARCH__) mask = __ballot_sync(mask, i + blockDimension < size); +#endif } alpaka::syncBlockThreads(acc); - if (size <= 32) + if (size <= warpSize) return; - if (blockThreadIdx < 32) { - warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); + if (blockThreadIdx < warpSize) { + warpPrefixScan(laneId, ws, blockThreadIdx, warpMask); } alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { - uint32_t warpId = i / 32; + for (auto i = first + warpSize; i < size; i += blockDimension) { + uint32_t warpId = i / warpSize; co[i] += ws[warpId - 1]; } alpaka::syncBlockThreads(acc); @@ -79,7 +106,7 @@ namespace cms { co[0] = ci[0]; for (uint32_t i = 1; i < size; ++i) co[i] = ci[i] + co[i - 1]; -#endif +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) } template @@ -87,43 +114,50 @@ namespace cms { T* __restrict__ c, uint32_t size, T* __restrict__ ws = nullptr) { -#if defined ALPAKA_ACC_GPU_CUDA_ENABLED && __CUDA_ARCH__ +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); ALPAKA_ASSERT_OFFLOAD(ws); - ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); + ALPAKA_ASSERT_OFFLOAD(size <= warpSize * warpSize); + ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % warpSize); auto first = blockThreadIdx; - auto mask = __ballot_sync(0xffffffff, first < size); - auto laneId = blockThreadIdx & 0x1f; +#if defined(__CUDA_ARCH__) + auto mask = __ballot_sync(warpMask, first < size); +#elif defined(__HIP_DEVICE_COMPILE__) + auto mask = warpMask; +#endif + auto laneId = blockThreadIdx & (warpSize - 1); for (auto i = first; i < size; i += blockDimension) { warpPrefixScan(laneId, c, i, 
mask); - auto warpId = i / 32; - ALPAKA_ASSERT_OFFLOAD(warpId < 32); - if (31 == laneId) + auto warpId = i / warpSize; + ALPAKA_ASSERT_OFFLOAD(warpId < warpSize); + if ((warpSize - 1) == laneId) ws[warpId] = c[i]; +#if defined(__CUDA_ARCH__) mask = __ballot_sync(mask, i + blockDimension < size); +#endif } alpaka::syncBlockThreads(acc); - if (size <= 32) + if (size <= warpSize) return; - if (blockThreadIdx < 32) { - warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); + if (blockThreadIdx < warpSize) { + warpPrefixScan(laneId, ws, blockThreadIdx, warpMask); } alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { - auto warpId = i / 32; + for (auto i = first + warpSize; i < size; i += blockDimension) { + auto warpId = i / warpSize; c[i] += ws[warpId - 1]; } alpaka::syncBlockThreads(acc); #else for (uint32_t i = 1; i < size; ++i) c[i] += c[i - 1]; -#endif +#endif // (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) } - // limited to 1024*1024 elements + // limited to warpSize⁴ elements template struct multiBlockPrefixScanFirstStep { template @@ -132,11 +166,16 @@ namespace cms { uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); uint32_t const blockIdx(alpaka::getIdx(acc)[0u]); - auto& ws = alpaka::declareSharedVar(acc); - // first each block does a scan of size 1024 (better be enough blocks) + auto& ws = alpaka::declareSharedVar(acc); + // first each block does a scan of size warpSize² (better be enough blocks) #ifndef NDEBUG [[maybe_unused]] uint32_t const gridDimension(alpaka::getWorkDiv(acc)[0u]); - ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= 1024); + ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= (warpSize * warpSize)); +#endif +#if 0 + // this is not yet available in alpaka, see + // https://github.com/alpaka-group/alpaka/issues/1648 + ALPAKA_ASSERT_OFFLOAD(sizeof(T) * gridDimension <= dynamic_smem_size()); // size of psum below #endif int off = blockDimension * blockIdx * threadDimension; if (size - off > 0) @@ -144,7 +183,7 @@ namespace cms { } }; - // limited to 1024*1024 elements + // limited to warpSize⁴ elements template struct multiBlockPrefixScanSecondStep { template @@ -153,9 +192,9 @@ namespace cms { uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); uint32_t const threadIdx(alpaka::getIdx(acc)[0u]); - auto* const psum(alpaka::getDynSharedMem(acc)); + T* const psum = alpaka::getDynSharedMem(acc); - // first each block does a scan of size 1024 (better be enough blocks) + // first each block does a scan of size warpSize² (better be enough blocks) ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= numBlocks); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { int index = +threadIdx * threadDimension + elemId; @@ -170,7 +209,7 @@ namespace cms { alpaka::syncBlockThreads(acc); - auto& ws = alpaka::declareSharedVar(acc); + auto& ws = alpaka::declareSharedVar(acc); blockPrefixScan(acc, psum, psum, numBlocks, ws); for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { diff --git a/src/alpaka/AlpakaCore/radixSort.h b/src/alpaka/AlpakaCore/radixSort.h index 606ac8c93..596381e5d 100644 --- a/src/alpaka/AlpakaCore/radixSort.h +++ b/src/alpaka/AlpakaCore/radixSort.h @@ -69,6 +69,8 @@ namespace cms::alpakatools { typename RF> ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void radixSortImpl( const TAcc& acc, T const* __restrict__ a, uint16_t* ind, 
uint16_t* ind2, uint32_t size, RF reorder) { +#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)) || \ + (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)) const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); const uint32_t blockDimension(alpaka::getWorkDiv(acc)[0u]); @@ -110,7 +112,11 @@ namespace cms::alpakatools { auto laneId = idx & 0x1f; for (int offset = 1; offset < 32; offset <<= 1) { +#if defined(__CUDA_ARCH__) auto y = __shfl_up_sync(0xffffffff, x, offset); +#elif defined(__HIP_DEVICE_COMPILE__) + auto y = __shfl_up(x, offset); +#endif if (laneId >= (uint32_t)offset) x += y; } @@ -166,8 +172,11 @@ namespace cms::alpakatools { }); alpaka::syncBlockThreads(acc); - if (threadIdxLocal == 0) + if (threadIdxLocal == 0) { ibs -= sb; + // cms-patatrack/pixeltrack-standalone#210 + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + } alpaka::syncBlockThreads(acc); } @@ -205,6 +214,7 @@ namespace cms::alpakatools { // now move negative first... (if signed) reorder(acc, a, ind, ind2, size); +#endif } template - BeamSpotAlpaka() = default; + BeamSpotAlpaka() = delete; // alpaka buffers are not default-constructible // constructor that allocates cached device memory on the given queue BeamSpotAlpaka(Queue const& queue) : data_d_{cms::alpakatools::make_device_buffer(queue)} {} diff --git a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelClustersAlpaka.h b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelClustersAlpaka.h index 8e320aaad..f725073c6 100644 --- a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelClustersAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelClustersAlpaka.h @@ -8,7 +8,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelClustersAlpaka { public: - SiPixelClustersAlpaka() = default; + SiPixelClustersAlpaka() = delete; // alpaka buffers are not default-constructible explicit SiPixelClustersAlpaka(Queue &queue, size_t maxClusters) : moduleStart_d{cms::alpakatools::make_device_buffer(queue, maxClusters + 1)}, clusInModule_d{cms::alpakatools::make_device_buffer(queue, maxClusters)}, diff --git a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigiErrorsAlpaka.h b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigiErrorsAlpaka.h index 0fe147a13..399bbd58a 100644 --- a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigiErrorsAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigiErrorsAlpaka.h @@ -12,7 +12,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigiErrorsAlpaka { public: - SiPixelDigiErrorsAlpaka() = default; + SiPixelDigiErrorsAlpaka() = delete; // alpaka buffers are not default-constructible explicit SiPixelDigiErrorsAlpaka(Queue& queue, size_t maxFedWords, PixelFormatterErrors errors) : data_d{cms::alpakatools::make_device_buffer(queue, maxFedWords)}, error_d{cms::alpakatools::make_device_buffer>(queue)}, diff --git a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigisAlpaka.h b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigisAlpaka.h index 15f02bdf6..996de99d6 100644 --- a/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigisAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/alpaka/SiPixelDigisAlpaka.h @@ -10,7 +10,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class SiPixelDigisAlpaka { public: - SiPixelDigisAlpaka() = default; + SiPixelDigisAlpaka() = delete; // alpaka buffers are not default-constructible explicit SiPixelDigisAlpaka(Queue &queue, size_t maxFedWords) : xx_d{cms::alpakatools::make_device_buffer(queue, maxFedWords)}, yy_d{cms::alpakatools::make_device_buffer(queue, maxFedWords)}, diff --git 
a/src/alpaka/AlpakaDataFormats/alpaka/TrackingRecHit2DAlpaka.h b/src/alpaka/AlpakaDataFormats/alpaka/TrackingRecHit2DAlpaka.h index cdbd8444d..093c0b5dd 100644 --- a/src/alpaka/AlpakaDataFormats/alpaka/TrackingRecHit2DAlpaka.h +++ b/src/alpaka/AlpakaDataFormats/alpaka/TrackingRecHit2DAlpaka.h @@ -15,7 +15,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: using Hist = TrackingRecHit2DSoAView::Hist; - TrackingRecHit2DAlpaka() = default; + TrackingRecHit2DAlpaka() = delete; // alpaka buffers are not default-constructible explicit TrackingRecHit2DAlpaka(uint32_t nHits, const pixelCPEforGPU::ParamsOnGPU* cpeParams, diff --git a/src/alpaka/DataFormats/approx_atan2.h b/src/alpaka/DataFormats/approx_atan2.h index 42df9b852..6f37279be 100644 --- a/src/alpaka/DataFormats/approx_atan2.h +++ b/src/alpaka/DataFormats/approx_atan2.h @@ -121,7 +121,7 @@ constexpr float unsafe_atan2f(float y, float x) { template constexpr float safe_atan2f(float y, float x) { - return unsafe_atan2f_impl(y, (y == 0.f) & (x == 0.f) ? 0.2f : x); + return unsafe_atan2f_impl(y, (y == 0.f) && (x == 0.f) ? 0.2f : x); // return (y==0.f)&(x==0.f) ? 0.f : unsafe_atan2f_impl( y, x); } diff --git a/src/alpaka/Makefile b/src/alpaka/Makefile index 59ec61928..bc56cab44 100644 --- a/src/alpaka/Makefile +++ b/src/alpaka/Makefile @@ -16,9 +16,14 @@ test_nvidiagpu: $(TARGET) @echo "Testing $(TARGET)" $(TARGET) --maxEvents 2 --cuda @echo "Succeeded" -test_intelagpu: +test_amdgpu: $(TARGET) + @echo + @echo "Testing $(TARGET)" + $(TARGET) --maxEvents 2 --hip + @echo "Succeeded" +test_intelgpu: test_auto: -.PHONY: test_cpu test_nvidiagpu test_intelgpu test_auto +.PHONY: test_cpu test_nvidiagpu test_amdgpu test_intelgpu test_auto EXE_SRC := $(wildcard $(TARGET_DIR)/bin/*.cc) EXE_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(EXE_SRC:%=%.o)) @@ -30,6 +35,9 @@ MY_CXXFLAGS := -I$(TARGET_DIR) -DSRC_DIR=$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TA ifdef CUDA_BASE MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY_MODE endif +ifdef ROCM_BASE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ONLY_MODE +endif MY_LDFLAGS := -ldl -Wl,-rpath,$(LIB_DIR)/$(TARGET_NAME) LIB_LDFLAGS := -L$(LIB_DIR)/$(TARGET_NAME) @@ -69,8 +77,16 @@ $(1)_CUDA_LDFLAGS := -l$(1)_cuda $(1)_CUOBJ := $$($(1)_CUDA_OBJ) $(1)_CUDADLINK := $$(if $$(strip $$($(1)_CUOBJ)),$(OBJ_DIR)/$(TARGET_NAME)/$(1)/lib$(1)_cudalink.o) endif -endif # if PORTABLE_SRC is empty -ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) +# ROCm backend +ifdef ROCM_BASE +$(1)_ROCM_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_PORTABLE_SRC:%=%.rocm.o)) +$(1)_ROCM_DEP := $$($(1)_ROCM_OBJ:$.o=$.d) +$(1)_ROCM_LIB := $(LIB_DIR)/$(TARGET_NAME)/lib$(1)_rocm.so +LIBS += $$($(1)_ROCM_LIB) +$(1)_ROCM_LDFLAGS := -l$(1)_rocm +endif +endif # if PORTABLE_SRC is not empty +ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) $$($(1)_ROCM_DEP) endef $(foreach lib,$(LIBNAMES),$(eval $(call LIB_template,$(lib)))) @@ -108,8 +124,16 @@ PLUGINNAMES += $(1)_cuda $(1)_CUOBJ := $$($(1)_CUDA_OBJ) $(1)_CUDADLINK := $$(if $$(strip $$($(1)_CUOBJ)),$(OBJ_DIR)/$(TARGET_NAME)/plugin-$(1)/plugin$(1)_cudadlink.o,) endif -endif # if PORTABLE_SRC is empty -ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) +# ROCm backend +ifdef ROCM_BASE +$(1)_ROCM_OBJ := $$(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$$($(1)_PORTABLE_SRC:%=%.rocm.o)) +$(1)_ROCM_DEP := $$($(1)_ROCM_OBJ:$.o=$.d) +$(1)_ROCM_LIB := 
$(LIB_DIR)/$(TARGET_NAME)/plugin$(1)_rocm.so +PLUGINS += $$($(1)_ROCM_LIB) +PLUGINNAMES += $(1)_rocm +endif +endif # if PORTABLE_SRC is not empty +ALL_DEPENDS += $$($(1)_DEP) $$($(1)_SERIAL_DEP) $$($(1)_TBB_DEP) $$($(1)_CUDA_DEP) $$($(1)_ROCM_DEP) endef $(foreach lib,$(PLUGINNAMES),$(eval $(call PLUGIN_template,$(lib)))) @@ -130,11 +154,17 @@ TESTS_CUDA_DEP := $(TESTS_CUDA_OBJ:$.o=$.d) TESTS_CUDA_EXE := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc,$(TEST_DIR)/$(TARGET_NAME)/%.cuda,$(TESTS_PORTABLE_SRC)) TESTS_CUDADLINK := $(TESTS_CUDA_OBJ:$cu.o=$cudadlink.o) endif +# ROCm backend +ifdef ROCM_BASE +TESTS_ROCM_OBJ := $(patsubst $(SRC_DIR)%,$(OBJ_DIR)%,$(TESTS_PORTABLE_SRC:%=%.rocm.o)) +TESTS_ROCM_DEP := $(TESTS_ROCM_OBJ:$.o=$.d) +TESTS_ROCM_EXE := $(patsubst $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc,$(TEST_DIR)/$(TARGET_NAME)/%.rocm,$(TESTS_PORTABLE_SRC)) +endif # -TESTS_EXE := $(TESTS_SERIAL_EXE) $(TESTS_TBB_EXE) $(TESTS_CUDA_EXE) -ALL_DEPENDS += $(TESTS_SERIAL_DEP) $(TESTS_TBB_DEP) $(TESTS_CUDA_DEP) +TESTS_EXE := $(TESTS_SERIAL_EXE) $(TESTS_TBB_EXE) $(TESTS_CUDA_EXE) $(TESTS_ROCM_EXE) +ALL_DEPENDS += $(TESTS_SERIAL_DEP) $(TESTS_TBB_DEP) $(TESTS_CUDA_DEP) $(TESTS_ROCM_DEP) # Needed to keep the unit test object files after building $(TARGET) -.SECONDARY: $(TESTS_SERIAL_OBJ) $(TESTS_TBB_OBJ) $(TESTS_CUDA_OBJ) $(TESTS_CUDADLINK) +.SECONDARY: $(TESTS_SERIAL_OBJ) $(TESTS_TBB_OBJ) $(TESTS_CUDA_OBJ) $(TESTS_CUDADLINK) $(TESTS_ROCM_OBJ) define RUNTEST_template run_$(1): $(1) @@ -147,6 +177,7 @@ endef $(foreach test,$(TESTS_SERIAL_EXE),$(eval $(call RUNTEST_template,$(test),cpu))) $(foreach test,$(TESTS_TBB_EXE),$(eval $(call RUNTEST_template,$(test),cpu))) $(foreach test,$(TESTS_CUDA_EXE),$(eval $(call RUNTEST_template,$(test),nvidiagpu))) +$(foreach test,$(TESTS_ROCM_EXE),$(eval $(call RUNTEST_template,$(test),amdgpu))) -include $(ALL_DEPENDS) @@ -180,6 +211,10 @@ $$($(1)_CUDA_LIB): $$($(1)_CUDA_OBJ) $$($(1)_CUDADLINK) $$(foreach dep,$(EXTERNA @[ -d $$(@D) ] || mkdir -p $$(@D) $(CXX) $$($(1)_CUDA_OBJ) $$($(1)_CUDADLINK) $(LDFLAGS) -shared $(SO_LDFLAGS) $(LIB_LDFLAGS) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LDFLAGS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_CUDA_LDFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_LDFLAGS)) -o $$@ +$$($(1)_ROCM_LIB): $$($(1)_ROCM_OBJ) $$(foreach dep,$(EXTERNAL_DEPENDS_H),$$($$(dep)_DEPS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LIB)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_ROCM_LIB)) + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(CXX) $$($(1)_ROCM_OBJ) $(LDFLAGS) -shared $(SO_LDFLAGS) $(LIB_LDFLAGS) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_LDFLAGS)) $$(foreach lib,$$($(1)_DEPENDS),$$($$(lib)_ROCM_LDFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_LDFLAGS)) -o $$@ + # Anything depending on Alpaka # Portable code, for serial backend $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc @@ -211,6 +246,13 @@ $$($(1)_CUDADLINK): $$($(1)_CUOBJ) $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $$($(1)_CUOBJ) -o $$@ endif +# Portable code, for ROCm backend +ifdef ROCM_BASE +$(OBJ_DIR)/$(2)/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(2)/alpaka/%.cc + @[ -d $$(@D) ] || mkdir -p $$(@D) + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD +endif + endef $(foreach lib,$(LIBNAMES),$(eval $(call BUILD_template,$(lib),$(TARGET_NAME)/$(lib)))) @@ -267,3 +309,14 @@ $(TEST_DIR)/$(TARGET_NAME)/%.cuda: 
$(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cu @[ -d $(@D) ] || mkdir -p $(@D) $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif + +# ROCm backend +ifdef ROCM_BASE +$(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc + @[ -d $(@D) ] || mkdir -p $(@D) + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + +$(TEST_DIR)/$(TARGET_NAME)/%.rocm: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o | $(LIBS) + @[ -d $(@D) ] || mkdir -p $(@D) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) +endif diff --git a/src/alpaka/Makefile.deps b/src/alpaka/Makefile.deps index 32a3d8ff0..33fa9e31d 100644 --- a/src/alpaka/Makefile.deps +++ b/src/alpaka/Makefile.deps @@ -2,6 +2,9 @@ alpaka_EXTERNAL_DEPENDS := TBB EIGEN ALPAKA BOOST BACKTRACE ifdef CUDA_BASE alpaka_EXTERNAL_DEPENDS += CUDA endif +ifdef ROCM_BASE +alpaka_EXTERNAL_DEPENDS += ROCM +endif AlpakaCore_DEPENDS := Framework BeamSpotProducer_DEPENDS := Framework AlpakaCore AlpakaDataFormats DataFormats PixelTrackFitting_DEPENDS := Framework AlpakaCore AlpakaDataFormats diff --git a/src/alpaka/alpaka/alpakaExtra.hpp b/src/alpaka/alpaka/alpakaExtra.hpp new file mode 100644 index 000000000..93d29a119 --- /dev/null +++ b/src/alpaka/alpaka/alpakaExtra.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +// TODO move this into Alpaka +namespace alpaka { + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + using PltfCudaRt = PltfUniformCudaHipRt; + + using EventCudaRt = EventUniformCudaHipRt; + + template + using BufCudaRt = BufUniformCudaHipRt; +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + using PltfHipRt = PltfUniformCudaHipRt; + + using EventHipRt = EventUniformCudaHipRt; + + template + using BufHipRt = BufUniformCudaHipRt; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + +} // namespace alpaka diff --git a/src/alpaka/bin/main.cc b/src/alpaka/bin/main.cc index 95bcd564a..6e863d8f3 100644 --- a/src/alpaka/bin/main.cc +++ b/src/alpaka/bin/main.cc @@ -24,8 +24,20 @@ namespace { void print_help(std::string const& name) { std::cout - << name - << ": [--serial] [--tbb] [--cuda] [--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " + << name << ": " +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + << "[--serial] " +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + << "[--tbb] " +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + << "[--cuda] " +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + << "[--hip] " +#endif + << "[--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " "[--transfer] [--validation]\n\n" << "Options\n" #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED @@ -36,6 +48,9 @@ namespace { #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED << " --cuda Use CUDA backend\n" +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + << " --hip Use ROCm/HIP backend\n" #endif << " --numberOfThreads Number of threads to use (default 1, use 0 to use all CPU cores)\n" << " --numberOfStreams Number of concurrent events (default 0 = numberOfThreads)\n" @@ -80,6 +95,10 @@ int main(int argc, char** argv) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED } else if (*i == "--cuda") { backends.emplace_back(Backend::CUDA); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + } else if (*i == 
"--hip") { + backends.emplace_back(Backend::HIP); #endif } else if (*i == "--numberOfThreads") { ++i; @@ -146,6 +165,11 @@ int main(int argc, char** argv) { cms::alpakatools::initialise(); } #endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + if (std::find(backends.begin(), backends.end(), Backend::HIP) != backends.end()) { + cms::alpakatools::initialise(); + } +#endif // Initialize EventProcessor std::vector edmodules; @@ -182,6 +206,9 @@ int main(int argc, char** argv) { #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED addModules("alpaka_cuda_async", Backend::CUDA); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + addModules("alpaka_rocm_async", Backend::HIP); #endif } edm::EventProcessor processor( diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h index e3e2aeb40..5a71c71c3 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSortByPt2.h @@ -53,7 +53,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { sortInd[0] = 0; return; } -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) auto& sws = alpaka::declareSharedVar(acc); // sort using only 16 bits cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); diff --git a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h index cb0a28ee7..9f7dd592c 100644 --- a/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h +++ b/src/alpaka/plugin-PixelVertexFinding/alpaka/gpuSplitVertices.h @@ -37,7 +37,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // NB: Shared memory size? Is it enough? constexpr uint32_t MAXTK = 512; -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) auto& it = alpaka::declareSharedVar(acc); // track index auto& zz = alpaka::declareSharedVar(acc); // z pos auto& newV = alpaka::declareSharedVar(acc); // 0 or 1 diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc index d13cd93fc..9a111649c 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/SiPixelRawToClusterGPUKernel.cc @@ -570,7 +570,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { if (wordCounter) // protect in case of empty event.... { -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) const int threadsPerBlockOrElementsPerThread = 512; #else // NB: MPORTANT: This could be tuned to benefit from innermost loop. @@ -626,7 +626,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { { // clusterizer ... using namespace gpuClustering; -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) const auto threadsPerBlockOrElementsPerThread = 256; #else // NB: MPORTANT: This could be tuned to benefit from innermost loop. 
diff --git a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h index 6d42dd67a..37ed6e1a0 100644 --- a/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h +++ b/src/alpaka/plugin-SiPixelClusterizer/alpaka/gpuClustering.h @@ -165,7 +165,7 @@ namespace gpuClustering { constexpr unsigned int maxiter = 16; ALPAKA_ASSERT_OFFLOAD((hist.size() / blockDimension) <= maxiter); -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) constexpr uint32_t threadDimension = 1; #else // NB: can be tuned. diff --git a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc index e4c30aa4a..4e13d2955 100644 --- a/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc +++ b/src/alpaka/plugin-Validation/alpaka/HistoValidator.cc @@ -119,7 +119,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { nDigis_ = digis.nDigis(); nModules_ = digis.nModules(); - h_adc = std::move(digis.adcToHostAsync(ctx.stream())); + h_adc = digis.adcToHostAsync(ctx.stream()); nClusters_ = clusters.nClusters(); h_clusInModule = cms::alpakatools::make_host_buffer(ctx.stream(), nModules_); @@ -129,17 +129,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { nHits_ = hits.nHits(); - h_lx = std::move(hits.xlToHostAsync(ctx.stream())); - h_ly = std::move(hits.ylToHostAsync(ctx.stream())); - h_lex = std::move(hits.xerrToHostAsync(ctx.stream())); - h_ley = std::move(hits.yerrToHostAsync(ctx.stream())); - h_gx = std::move(hits.xgToHostAsync(ctx.stream())); - h_gy = std::move(hits.ygToHostAsync(ctx.stream())); - h_gz = std::move(hits.zgToHostAsync(ctx.stream())); - h_gr = std::move(hits.rgToHostAsync(ctx.stream())); - h_charge = std::move(hits.chargeToHostAsync(ctx.stream())); - h_sizex = std::move(hits.xsizeToHostAsync(ctx.stream())); - h_sizey = std::move(hits.ysizeToHostAsync(ctx.stream())); + h_lx = hits.xlToHostAsync(ctx.stream()); + h_ly = hits.ylToHostAsync(ctx.stream()); + h_lex = hits.xerrToHostAsync(ctx.stream()); + h_ley = hits.yerrToHostAsync(ctx.stream()); + h_gx = hits.xgToHostAsync(ctx.stream()); + h_gy = hits.ygToHostAsync(ctx.stream()); + h_gz = hits.zgToHostAsync(ctx.stream()); + h_gr = hits.rgToHostAsync(ctx.stream()); + h_charge = hits.chargeToHostAsync(ctx.stream()); + h_sizex = hits.xsizeToHostAsync(ctx.stream()); + h_sizey = hits.ysizeToHostAsync(ctx.stream()); } void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { @@ -222,6 +222,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::ofstream out("histograms_alpaka_tbb.txt"); #elif defined ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND std::ofstream out("histograms_alpaka_cuda.txt"); +#elif defined ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND + std::ofstream out("histograms_alpaka_hip.txt"); #else #error "Support for a new Alpaka backend must be added here" #endif diff --git a/src/alpaka/plugins.txt b/src/alpaka/plugins.txt index 8370653cb..35ed33a40 100644 --- a/src/alpaka/plugins.txt +++ b/src/alpaka/plugins.txt @@ -1,38 +1,50 @@ BeamSpotESProducer pluginBeamSpotProducer.so alpaka_cuda_async::BeamSpotToAlpaka pluginBeamSpotProducer_cuda.so +alpaka_rocm_async::BeamSpotToAlpaka pluginBeamSpotProducer_rocm.so alpaka_tbb_async::BeamSpotToAlpaka pluginBeamSpotProducer_tbb.so alpaka_serial_sync::BeamSpotToAlpaka pluginBeamSpotProducer_serial.so alpaka_cuda_async::CAHitNtupletAlpaka pluginPixelTriplets_cuda.so +alpaka_rocm_async::CAHitNtupletAlpaka pluginPixelTriplets_rocm.so 
alpaka_tbb_async::CAHitNtupletAlpaka pluginPixelTriplets_tbb.so alpaka_serial_sync::CAHitNtupletAlpaka pluginPixelTriplets_serial.so alpaka_cuda_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_cuda.so +alpaka_rocm_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_rocm.so alpaka_tbb_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_tbb.so alpaka_serial_sync::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_serial.so alpaka_cuda_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_cuda.so +alpaka_rocm_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_rocm.so alpaka_tbb_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_tbb.so alpaka_serial_sync::PixelVertexProducerAlpaka pluginPixelVertexFinding_serial.so alpaka_cuda_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_cuda.so +alpaka_rocm_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_rocm.so alpaka_tbb_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_tbb.so alpaka_serial_sync::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_serial.so alpaka_cuda_async::SiPixelRawToCluster pluginSiPixelClusterizer_cuda.so +alpaka_rocm_async::SiPixelRawToCluster pluginSiPixelClusterizer_rocm.so alpaka_tbb_async::SiPixelRawToCluster pluginSiPixelClusterizer_tbb.so alpaka_serial_sync::SiPixelRawToCluster pluginSiPixelClusterizer_serial.so SiPixelFedIdsESProducer pluginSiPixelClusterizer.so alpaka_cuda_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_cuda.so +alpaka_rocm_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_rocm.so alpaka_tbb_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_tbb.so alpaka_serial_sync::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_serial.so alpaka_cuda_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_cuda.so +alpaka_rocm_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_rocm.so alpaka_tbb_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_tbb.so alpaka_serial_sync::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_serial.so alpaka_cuda_async::PixelCPEFastESProducer pluginSiPixelRecHits_cuda.so +alpaka_rocm_async::PixelCPEFastESProducer pluginSiPixelRecHits_rocm.so alpaka_tbb_async::PixelCPEFastESProducer pluginSiPixelRecHits_tbb.so alpaka_serial_sync::PixelCPEFastESProducer pluginSiPixelRecHits_serial.so alpaka_cuda_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_cuda.so +alpaka_rocm_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_rocm.so alpaka_tbb_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_tbb.so alpaka_serial_sync::SiPixelRecHitAlpaka pluginSiPixelRecHits_serial.so alpaka_cuda_async::CountValidator pluginValidation_cuda.so +alpaka_rocm_async::CountValidator pluginValidation_rocm.so alpaka_tbb_async::CountValidator pluginValidation_tbb.so alpaka_serial_sync::CountValidator pluginValidation_serial.so alpaka_cuda_async::HistoValidator pluginValidation_cuda.so +alpaka_rocm_async::HistoValidator pluginValidation_rocm.so alpaka_tbb_async::HistoValidator pluginValidation_tbb.so alpaka_serial_sync::HistoValidator pluginValidation_serial.so diff --git a/src/alpaka/test/alpaka/clustering_t.cc b/src/alpaka/test/alpaka/clustering_t.cc index c72daf419..998dee975 100644 --- a/src/alpaka/test/alpaka/clustering_t.cc +++ b/src/alpaka/test/alpaka/clustering_t.cc @@ -239,8 +239,8 @@ int main(void) { alpaka::memcpy(queue, d_y, h_y, n); alpaka::memcpy(queue, d_adc, h_adc, n); -// Launch CUDA Kernels -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +// Launch CUDA/HIP 
Kernels +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) const auto threadsPerBlockOrElementsPerThread = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256); #else // NB: can be tuned. diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index 7a8010b8e..4c7574f63 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -54,12 +54,14 @@ template struct testWarpPrefixScan { template ALPAKA_FN_ACC void operator()(const TAcc& acc, uint32_t size) const { +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) && defined(__CUDA_ARCH__) || \ + defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) && defined(__HIP_DEVICE_COMPILE__) assert(size <= 32); auto& c = alpaka::declareSharedVar(acc); auto& co = alpaka::declareSharedVar(acc); - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); + uint32_t const blockDimension = alpaka::getWorkDiv(acc)[0u]; + uint32_t const blockThreadIdx = alpaka::getIdx(acc)[0u]; auto i = blockThreadIdx; c[i] = 1; alpaka::syncBlockThreads(acc); @@ -76,9 +78,10 @@ struct testWarpPrefixScan { if (c[i] != c[i - 1] + 1) printf(format_traits::failed_msg, size, i, blockDimension, c[i], c[i - 1]); assert(c[i] == c[i - 1] + 1); - assert(c[i] == i + 1); + assert(c[i] == static_cast(i + 1)); assert(c[i] == co[i]); } +#endif } }; @@ -116,7 +119,7 @@ int main() { Queue queue(device); // WARP PREFIXSCAN (OBVIOUSLY GPU-ONLY) -#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND +#if defined(ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND) || defined(ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND) std::cout << "warp level" << std::endl; const auto threadsPerBlockOrElementsPerThread = 32;