From 38e10994692eeb5a36184681a0bc69b830887298 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 13 Jun 2018 14:54:36 +0200
Subject: [PATCH] Prototype of better GPU vector interface

---
 HeterogeneousCore/CUDAUtilities/BuildFile.xml |   1 +
 .../CUDAUtilities/interface/GPUVector.h       | 165 +++++++++++
 .../CUDAUtilities/test/BuildFile.xml          |   5 +
 .../CUDAUtilities/test/test_GPUVector.cu      | 274 ++++++++++++++++++
 .../CUDAUtilities/test/test_main.cpp          |   2 +
 5 files changed, 447 insertions(+)
 create mode 100644 HeterogeneousCore/CUDAUtilities/interface/GPUVector.h
 create mode 100644 HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu
 create mode 100644 HeterogeneousCore/CUDAUtilities/test/test_main.cpp

diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
index adfe5e217c073..c7990c08b13f5 100644
--- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,2 +1,3 @@
+<use name="cuda-api-wrappers"/>
 <!-- NOTE(review): the two pre-existing lines of this file were lost in -->
 <!-- HTML extraction of the patch; restore them from the repository.    -->
diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h
new file mode 100644
index 0000000000000..e7bb15dea39ac
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h
@@ -0,0 +1,165 @@
+#ifndef HeterogeneousCore_CUDAUtilities_GPUVector_h
+#define HeterogeneousCore_CUDAUtilities_GPUVector_h
+
+#include <cuda/api_wrappers.h>
+
+#include <algorithm>
+#include <cassert>
+#include <type_traits>
+
+template <typename T>
+class GPUVectorWrapper;
+
+/**
+ * This class owns the GPU memory, and provides interface to transfer
+ * the data to CPU memory.
+ */
+template <typename T>
+class GPUVector {
+public:
+  explicit GPUVector(int capacity) {
+    // T is never destructed on the device, so it must be trivially destructible
+    static_assert(std::is_trivially_destructible<T>::value);
+
+    m_sizeCapacity_host.m_size = 0;
+    m_sizeCapacity_host.m_capacity = capacity;
+
+    auto current_device = cuda::device::current::get();
+    m_sizeCapacity = cuda::memory::device::make_unique<SizeCapacity>(current_device);
+    m_data = cuda::memory::device::make_unique<T[]>(current_device, capacity);
+
+    updateMetadataToDevice();
+  }
+
+  // Synchronize the host-side size/capacity from the device
+  void updateMetadata() {
+    cuda::memory::copy(&m_sizeCapacity_host, m_sizeCapacity.get(), sizeof(SizeCapacity));
+  }
+  void updateMetadataAsync(cudaStream_t stream) {
+    cuda::memory::async::copy(&m_sizeCapacity_host, m_sizeCapacity.get(), sizeof(SizeCapacity), stream);
+  }
+
+  // NOTE: reflect the host-side view of the metadata; call
+  // updateMetadata*() first if the device may have changed the size
+  int size() const { return m_sizeCapacity_host.m_size; }
+  int capacity() const { return m_sizeCapacity_host.m_capacity; }
+
+  const T *data() const { return m_data.get(); }
+  T *data() { return m_data.get(); }
+
+  void copyFrom(const T *src, int num) {
+    assert(num <= m_sizeCapacity_host.m_capacity);
+    cuda::memory::copy(m_data.get(), src, num*sizeof(T));
+    m_sizeCapacity_host.m_size = num;
+    updateMetadataToDevice();
+  }
+
+  void copyFromAsync(const T *src, int num, cudaStream_t stream) {
+    assert(num <= m_sizeCapacity_host.m_capacity);
+    cuda::memory::async::copy(m_data.get(), src, num*sizeof(T), stream);
+    m_sizeCapacity_host.m_size = num;
+    updateMetadataToDeviceAsync(stream);
+  }
+
+  // Copies at most num elements to dst, returns the number of elements copied
+  int copyTo(T *dst, int num) {
+    updateMetadata();
+    int copied = std::min(num, m_sizeCapacity_host.m_size);
+    cuda::memory::copy(dst, m_data.get(), copied*sizeof(T));
+    return copied;
+  }
+  int copyToAsync(T *dst, int num, cudaStream_t stream) {
+    // calling updateMetadataAsync() or otherwise guaranteeing the host
+    // and device to be in synch with the size is on the
+    // responsibility of the caller
+    int copied = std::min(num, m_sizeCapacity_host.m_size);
+    cuda::memory::async::copy(dst, m_data.get(), copied*sizeof(T), stream);
+    return copied;
+  }
+
+private:
+  void updateMetadataToDevice() {
+    cuda::memory::copy(m_sizeCapacity.get(), &m_sizeCapacity_host, sizeof(SizeCapacity));
+  }
+  void updateMetadataToDeviceAsync(cudaStream_t stream) {
+    cuda::memory::async::copy(m_sizeCapacity.get(), &m_sizeCapacity_host, sizeof(SizeCapacity), stream);
+  }
+
+  friend GPUVectorWrapper<T>;
+
+  // Size and capacity live together in a single device allocation so
+  // that they can be transferred with one copy
+  struct SizeCapacity {
+#if defined(__NVCC__) || defined(__CUDACC__)
+    // Reserves one slot at the end of the vector; thread safe via atomicAdd
+    __device__ int addElement() {
+      auto previousSize = atomicAdd(&m_size, 1);
+      assert(previousSize < m_capacity);
+      return previousSize;
+    }
+
+    __device__ void resize(int size) {
+      assert(size <= m_capacity);
+      m_size = size;
+    }
+#endif
+
+    int m_size;
+    int m_capacity;
+  };
+
+  SizeCapacity m_sizeCapacity_host;
+  cuda::memory::device::unique_ptr<SizeCapacity> m_sizeCapacity;
+  cuda::memory::device::unique_ptr<T[]> m_data;
+};
+
+/**
+ * This class acts as a device wrapper of GPUVector<T> by containing
+ * the pointers to GPU memory and an interface for manipulations in
+ * the device. It can be passed by value to the kernels.
+ */
+template <typename T>
+class GPUVectorWrapper {
+public:
+  GPUVectorWrapper(GPUVector<T>& vec): // allow automatic conversion
+    m_sizeCapacity(vec.m_sizeCapacity.get()),
+    m_data(vec.m_data.get())
+  {}
+
+#if defined(__NVCC__) || defined(__CUDACC__)
+  // thread-safe version of the vector, when used in a CUDA kernel
+  __device__ void push_back(const T &element) {
+    auto index = m_sizeCapacity->addElement();
+    m_data[index] = element;
+  }
+
+  template <typename... Args>
+  __device__ void emplace_back(Args&&... args) {
+    auto index = m_sizeCapacity->addElement();
+    new (&m_data[index]) T(std::forward<Args>(args)...);
+  }
+
+  __device__ const T& back() const {
+    assert(m_sizeCapacity->m_size > 0);
+    return m_data[m_sizeCapacity->m_size - 1];
+  }
+  __device__ T& back() {
+    assert(m_sizeCapacity->m_size > 0);
+    return m_data[m_sizeCapacity->m_size - 1];
+  }
+
+  // NOTE: reset() and resize() are not thread safe w.r.t. concurrent
+  // push_back()/emplace_back()
+  __device__ void reset() { m_sizeCapacity->m_size = 0; }
+
+  __device__ int size() const { return m_sizeCapacity->m_size; }
+
+  __device__ int capacity() const { return m_sizeCapacity->m_capacity; }
+
+  __device__ void resize(int size) { m_sizeCapacity->resize(size); }
+
+  __device__ const T& operator[](int i) const { return m_data[i]; }
+  __device__ T& operator[](int i) { return m_data[i]; }
+
+  __device__ const T *data() const { return m_data; }
+  __device__ T *data() { return m_data; }
+
+#endif
+
+private:
+  typename GPUVector<T>::SizeCapacity *m_sizeCapacity = nullptr;
+  T *m_data = nullptr;
+};
+
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index a1a7efcce6abb..1d424aa9f5c8b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -1,2 +1,7 @@
+<bin file="test_GPUVector.cu,test_main.cpp" name="testGPUVector">
+  <use name="catch2"/>
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
+</bin>
 <!-- NOTE(review): the two pre-existing lines of this file were lost in -->
 <!-- HTML extraction of the patch; restore them from the repository.    -->
diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu
new file mode 100644
index 0000000000000..e7ee6c0ad35df
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu
@@ -0,0 +1,274 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUVector.h"
+
+#include <algorithm>
+#include <bitset>
+#include <numeric>
+#include <vector>
+
+#include <cuda/api_wrappers.h>
+
+#include "catch.hpp"
+
+// Each kernel reports per-check results via a bit mask in *ret
+__global__ void vector_sizeCapacity(GPUVectorWrapper<int> vec, unsigned int *ret) {
+  *ret = 0;
+  if(vec.capacity() == 10) {
+    *ret = *ret | 1<<0;
+  }
+  if(vec.size() == 0) {
+    *ret = *ret | 1<<1;
+  }
+}
+
+__global__ void vector_elements(GPUVectorWrapper<int> vec, int *ret) {
+  auto index = threadIdx.x + blockIdx.x*blockDim.x;
+  ret[index] = (vec[index] == index);
+}
+
+__global__ void vector_pushback(GPUVectorWrapper<int> vec) {
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  vec.push_back(index);
+}
+
+__global__ void vector_emplaceback(GPUVectorWrapper<int> vec) {
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  vec.emplace_back(index);
+}
+
+__global__ void vector_access(GPUVectorWrapper<int> vec) {
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  vec[index] += index;
+  atomicAdd(&vec.back(), 1);
+}
+
+__global__ void vector_resize(GPUVectorWrapper<int> vec, unsigned int *ret) {
+  *ret = 0;
+  if(vec.capacity() == 10) {
+    *ret = *ret | 1<<0;
+  }
+  if(vec.size() == 10) {
+    *ret = *ret | 1<<1;
+  }
+
+  vec.resize(5);
+
+  if(vec.capacity() == 10) {
+    *ret = *ret | 1<<2;
+  }
+  if(vec.size() == 5) {
+    *ret = *ret | 1<<3;
+  }
+}
+
+__global__ void vector_reset(GPUVectorWrapper<int> vec, unsigned int *ret) {
+  *ret = 0;
+  if(vec.capacity() == 10) {
+    *ret = *ret | 1<<0;
+  }
+  if(vec.size() == 10) {
+    *ret = *ret | 1<<1;
+  }
+
+  vec.reset();
+
+  if(vec.capacity() == 10) {
+    *ret = *ret | 1<<2;
+  }
+  if(vec.size() == 0) {
+    *ret = *ret | 1<<3;
+  }
+}
+
+
+
+TEST_CASE("Tests of GPUVector", "[GPUVector]") {
+  int deviceCount = 0;
+  auto ret = cudaGetDeviceCount( &deviceCount );
+  if(ret != cudaSuccess || deviceCount < 1) {
+    WARN("No CUDA devices, ignoring the tests");
+    return;
+  }
+
+  auto current_device = cuda::device::current::get();
+
+  auto vec_d = GPUVector<int>(10);
+
+  SECTION("Construction") {
+    REQUIRE(vec_d.size() == 0);
+    REQUIRE(vec_d.capacity() == 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+    vector_sizeCapacity<<<1, 1>>>(vec_d, res_d.get());
+    current_device.synchronize();
+
+    unsigned int res;
+    cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+    auto ret = std::bitset<2>(res);
+    for(int i=0; i<2; ++i) {
+      INFO("Bit " << i);
+      CHECK(ret.test(i));
+    }
+  }
+
+  auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+  auto res_h = std::vector<int>(10, 0);
+
+  SECTION("Copy to device") {
+    auto vec_h = std::vector<int>(10);
+    std::iota(vec_h.begin(), vec_h.end(), 0);
+
+    SECTION("Synchronous") {
+      vec_d.copyFrom(vec_h.data(), 10);
+
+      auto res_d = cuda::memory::device::make_unique<int[]>(current_device, 10);
+      vector_elements<<<1, 10>>>(vec_d, res_d.get());
+      current_device.synchronize();
+
+      cuda::memory::copy(res_h.data(), res_d.get(), 10*sizeof(int));
+      for(int i=0; i<10; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == 1); // all comparisons are true
+      }
+    }
+
+    SECTION("Asynchronous") {
+      vec_d.copyFromAsync(vec_h.data(), 10, stream.id());
+
+      auto res_d = cuda::memory::device::make_unique<int[]>(current_device, 10);
+      vector_elements<<<1, 10, 0, stream.id()>>>(vec_d, res_d.get());
+
+      cuda::memory::async::copy(res_h.data(), res_d.get(), 10*sizeof(int), stream.id());
+      stream.synchronize();
+      for(int i=0; i<10; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == 1); // all comparisons are true
+      }
+    }
+  }
+
+  SECTION("Copy from device") {
+    auto vec_h = std::vector<int>(10);
+    std::iota(vec_h.begin(), vec_h.end(), 0);
+
+    SECTION("Synchronous") {
+      vec_d.copyFrom(vec_h.data(), 10);
+
+      auto ret = vec_d.copyTo(res_h.data(), 10);
+      REQUIRE(ret == 10);
+      for(int i=0; i<10; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == i);
+      }
+
+      ret = vec_d.copyTo(res_h.data(), 5);
+      REQUIRE(ret == 5);
+      for(int i=0; i<5; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == i);
+      }
+
+      ret = vec_d.copyTo(res_h.data(), 20);
+      REQUIRE(ret == 10);
+    }
+
+    SECTION("Asynchronous") {
+      vec_d.copyFromAsync(vec_h.data(), 10, stream.id());
+      vec_d.updateMetadataAsync(stream.id());
+      stream.synchronize();
+
+      auto ret = vec_d.copyToAsync(res_h.data(), 10, stream.id());
+      REQUIRE(ret == 10);
+      stream.synchronize();
+      for(int i=0; i<10; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == i);
+      }
+
+      std::fill(res_h.begin(), res_h.end(), -1);
+      ret = vec_d.copyToAsync(res_h.data(), 5, stream.id());
+      REQUIRE(ret == 5);
+      stream.synchronize();
+      for(int i=0; i<5; ++i) {
+        INFO("Index " << i);
+        CHECK(res_h[i] == i);
+      }
+
+      std::fill(res_h.begin(), res_h.end(), -1);
+      ret = vec_d.copyToAsync(res_h.data(), 20, stream.id());
+      REQUIRE(ret == 10);
+    }
+  }
+
+  SECTION("push_back") {
+    vector_pushback<<<1, 10>>>(vec_d);
+    current_device.synchronize();
+
+    vec_d.updateMetadata();
+    REQUIRE(vec_d.size() == 10);
+
+    vec_d.copyTo(res_h.data(), 10);
+    // push_back order is nondeterministic; check only the content
+    for(int i=0; i<10; ++i) {
+      CHECK(std::find(res_h.begin(), res_h.end(), i) != res_h.end());
+    }
+  }
+
+  SECTION("emplace_back") {
+    vector_emplaceback<<<1, 10>>>(vec_d);
+    current_device.synchronize();
+
+    vec_d.updateMetadata();
+    REQUIRE(vec_d.size() == 10);
+
+    vec_d.copyTo(res_h.data(), 10);
+    for(int i=0; i<10; ++i) {
+      CHECK(std::find(res_h.begin(), res_h.end(), i) != res_h.end());
+    }
+  }
+
+  SECTION("Element access") {
+    auto vec_h = std::vector<int>(10);
+    std::iota(vec_h.begin(), vec_h.end(), 0);
+    vec_d.copyFrom(vec_h.data(), 10);
+
+    vector_access<<<1, 9>>>(vec_d);
+    current_device.synchronize();
+
+    vec_d.copyTo(res_h.data(), 10);
+    for(int i=0; i<9; ++i) {
+      CHECK(res_h[i] == i*2);
+    }
+    CHECK(res_h[9] == 9+9);
+  }
+
+  SECTION("Resize") {
+    auto vec_h = std::vector<int>(10);
+    std::iota(vec_h.begin(), vec_h.end(), 0);
+    vec_d.copyFrom(vec_h.data(), 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+    vector_resize<<<1, 1>>>(vec_d, res_d.get());
+
+    unsigned int res;
+    cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+    auto ret = std::bitset<4>(res);
+    for(int i=0; i<4; ++i) {
+      INFO("Bit " << i);
+      CHECK(ret.test(i));
+    }
+  }
+
+  SECTION("Reset") {
+    auto vec_h = std::vector<int>(10);
+    std::iota(vec_h.begin(), vec_h.end(), 0);
+    vec_d.copyFrom(vec_h.data(), 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+    vector_reset<<<1, 1>>>(vec_d, res_d.get());
+
+    unsigned int res;
+    cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+    auto ret = std::bitset<4>(res);
+    for(int i=0; i<4; ++i) {
+      INFO("Bit " << i);
+      CHECK(ret.test(i));
+    }
+  }
+}
diff --git a/HeterogeneousCore/CUDAUtilities/test/test_main.cpp b/HeterogeneousCore/CUDAUtilities/test/test_main.cpp
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/test_main.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"