diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
index adfe5e217c073..c7990c08b13f5 100644
--- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,2 +1,3 @@
+<use name="cuda-api-wrappers"/>
diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h
new file mode 100644
index 0000000000000..e7bb15dea39ac
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVector.h
@@ -0,0 +1,190 @@
+#ifndef HeterogeneousCore_CUDAUtilities_GPUVector_h
+#define HeterogeneousCore_CUDAUtilities_GPUVector_h
+
+#include <cuda/api_wrappers.h>
+
+#include <algorithm>
+#include <cassert>
+#include <type_traits>
+#include <utility>
+
+template <typename T>
+class GPUVectorWrapper;
+
+/**
+ * This class owns the GPU memory and provides an interface for
+ * transferring the data to CPU memory.
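+ *
+ * A minimal host-side usage sketch (illustrative only, not part of this file):
+ *
+ * \code
+ * GPUVector<int> vec(100);                    // allocate capacity for 100 ints on the device
+ * std::vector<int> src(100, 42);
+ * vec.copyFrom(src.data(), 100);              // H2D copy; size becomes 100
+ * std::vector<int> dst(100);
+ * int copied = vec.copyTo(dst.data(), 100);   // D2H copy; returns the number of elements copied
+ * \endcode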
+ */
+template <typename T>
+class GPUVector {
+public:
+ explicit GPUVector(int capacity) {
+    static_assert(std::is_trivially_destructible<T>::value);
+
+ m_sizeCapacity_host.m_size = 0;
+ m_sizeCapacity_host.m_capacity = capacity;
+
+ auto current_device = cuda::device::current::get();
+    m_sizeCapacity = cuda::memory::device::make_unique<SizeCapacity>(current_device);
+    m_data = cuda::memory::device::make_unique<T[]>(current_device, capacity);
+
+ updateMetadataToDevice();
+ }
+
+ void updateMetadata() {
+ cuda::memory::copy(&m_sizeCapacity_host, m_sizeCapacity.get(), sizeof(SizeCapacity));
+ }
+ void updateMetadataAsync(cudaStream_t stream) {
+ cuda::memory::async::copy(&m_sizeCapacity_host, m_sizeCapacity.get(), sizeof(SizeCapacity), stream);
+ }
+
+ int size() const { return m_sizeCapacity_host.m_size; }
+ int capacity() const { return m_sizeCapacity_host.m_capacity; }
+
+ const T *data() const { return m_data.get(); }
+ T *data() { return m_data.get(); }
+
+ void copyFrom(const T *src, int num) {
+ assert(num <= m_sizeCapacity_host.m_capacity);
+ cuda::memory::copy(m_data.get(), src, num*sizeof(T));
+ m_sizeCapacity_host.m_size = num;
+ updateMetadataToDevice();
+ }
+
+ void copyFromAsync(const T *src, int num, cudaStream_t stream) {
+ assert(num <= m_sizeCapacity_host.m_capacity);
+ cuda::memory::async::copy(m_data.get(), src, num*sizeof(T), stream);
+ m_sizeCapacity_host.m_size = num;
+ updateMetadataToDeviceAsync(stream);
+ }
+
+ int copyTo(T *dst, int num) {
+ updateMetadata();
+ int copied = std::min(num, m_sizeCapacity_host.m_size);
+ cuda::memory::copy(dst, m_data.get(), copied*sizeof(T));
+ return copied;
+ }
+ int copyToAsync(T *dst, int num, cudaStream_t stream) {
+    // Calling updateMetadataAsync(), or otherwise guaranteeing that the
+    // host and the device agree on the size beforehand, is the
+    // responsibility of the caller.
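+    // For example, as exercised in the unit test:
+    //   vec.updateMetadataAsync(stream); followed by synchronizing the stream.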
+ int copied = std::min(num, m_sizeCapacity_host.m_size);
+ cuda::memory::async::copy(dst, m_data.get(), copied*sizeof(T), stream);
+ return copied;
+ }
+
+private:
+ void updateMetadataToDevice() {
+ cuda::memory::copy(m_sizeCapacity.get(), &m_sizeCapacity_host, sizeof(SizeCapacity));
+ }
+ void updateMetadataToDeviceAsync(cudaStream_t stream) {
+ cuda::memory::async::copy(m_sizeCapacity.get(), &m_sizeCapacity_host, sizeof(SizeCapacity), stream);
+ }
+
+  friend class GPUVectorWrapper<T>;
+
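+  // The size/capacity pair lives in device memory; the host-side mirror is
+  // m_sizeCapacity_host, synchronized explicitly via the updateMetadata*() methods.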
+ struct SizeCapacity {
+#if defined(__NVCC__) || defined(__CUDACC__)
+ __device__ int addElement() {
+ auto previousSize = atomicAdd(&m_size, 1);
+ assert(previousSize < m_capacity);
+ return previousSize;
+ }
+
+ __device__ void resize(int size) {
+ assert(size <= m_capacity);
+ m_size = size;
+ }
+#endif
+
+ int m_size;
+ int m_capacity;
+ };
+
+ SizeCapacity m_sizeCapacity_host;
+  cuda::memory::device::unique_ptr<SizeCapacity> m_sizeCapacity;
+  cuda::memory::device::unique_ptr<T[]> m_data;
+};
+
+/**
+ * This class is the device-side counterpart of GPUVector: it holds the
+ * raw pointers to the GPU memory and provides an interface for
+ * manipulating the vector on the device. It can be passed by value to kernels.
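+ *
+ * A minimal device-side sketch (hypothetical kernel, illustrative only):
+ *
+ * \code
+ * __global__ void fill(GPUVectorWrapper<int> vec) {
+ *   auto i = threadIdx.x + blockIdx.x * blockDim.x;
+ *   vec.push_back(static_cast<int>(i));  // thread-safe via atomicAdd on the size
+ * }
+ * // a GPUVector<int> converts implicitly when passed to the kernel: fill<<<1, 256>>>(vec);
+ * \endcode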
+ */
+template <typename T>
+class GPUVectorWrapper {
+public:
+  GPUVectorWrapper(GPUVector<T>& vec): // intentionally non-explicit to allow automatic conversion
+ m_sizeCapacity(vec.m_sizeCapacity.get()),
+ m_data(vec.m_data.get())
+ {}
+
+#if defined(__NVCC__) || defined(__CUDACC__)
+  // device-side interface for use in CUDA kernels; appends are thread-safe
+ __device__ void push_back(const T &element) {
+ auto index = m_sizeCapacity->addElement();
+ m_data[index] = element;
+ }
+
+  template <typename... Args>
+ __device__ void emplace_back(Args&&... args) {
+ auto index = m_sizeCapacity->addElement();
+    new (&m_data[index]) T(std::forward<Args>(args)...);
+ }
+
+ __device__ const T& back() const {
+ assert(m_sizeCapacity->m_size > 0);
+ return m_data[m_sizeCapacity->m_size - 1];
+ }
+ __device__ T& back() {
+ assert(m_sizeCapacity->m_size > 0);
+ return m_data[m_sizeCapacity->m_size - 1];
+ }
+
+ __device__ void reset() { m_sizeCapacity->m_size = 0; }
+
+ __device__ int size() const { return m_sizeCapacity->m_size; }
+
+ __device__ int capacity() const { return m_sizeCapacity->m_capacity; }
+
+ __device__ void resize(int size) { m_sizeCapacity->resize(size); }
+
+ __device__ const T& operator[](int i) const { return m_data[i]; }
+ __device__ T& operator[](int i) { return m_data[i]; }
+
+ __device__ const T *data() const { return m_data; }
+ __device__ T *data() { return m_data; }
+
+#endif
+
+private:
+  typename GPUVector<T>::SizeCapacity *m_sizeCapacity = nullptr;
+ T *m_data = nullptr;
+};
+
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index a1a7efcce6abb..1d424aa9f5c8b 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -1,2 +1,7 @@
+<bin file="test_main.cpp,test_GPUVector.cu" name="testGPUVector">
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="catch2"/>
+  <use name="cuda-api-wrappers"/>
+</bin>
diff --git a/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu b/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu
new file mode 100644
index 0000000000000..e7ee6c0ad35df
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/test_GPUVector.cu
@@ -0,0 +1,277 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUVector.h"
+
+#include <algorithm>
+#include <bitset>
+#include <numeric>
+#include <vector>
+
+#include <cuda/api_wrappers.h>
+
+#include "catch.hpp"
+
+__global__ void vector_sizeCapacity(GPUVectorWrapper<int> vec, unsigned int *ret) {
+ *ret = 0;
+ if(vec.capacity() == 10) {
+ *ret = *ret | 1<<0;
+ }
+ if(vec.size() == 0) {
+ *ret = *ret | 1<<1;
+ }
+}
+
+__global__ void vector_elements(GPUVectorWrapper<int> vec, int *ret) {
+ auto index = threadIdx.x + blockIdx.x*blockDim.x;
+ ret[index] = (vec[index] == index);
+}
+
+__global__ void vector_pushback(GPUVectorWrapper<int> vec) {
+ auto index = threadIdx.x + blockIdx.x * blockDim.x;
+ vec.push_back(index);
+}
+
+__global__ void vector_emplaceback(GPUVectorWrapper<int> vec) {
+ auto index = threadIdx.x + blockIdx.x * blockDim.x;
+ vec.emplace_back(index);
+}
+
+__global__ void vector_access(GPUVectorWrapper<int> vec) {
+ auto index = threadIdx.x + blockIdx.x * blockDim.x;
+ vec[index] += index;
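+  // every thread additionally increments the last element once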
+ atomicAdd(&vec.back(), 1);
+}
+
+__global__ void vector_resize(GPUVectorWrapper<int> vec, unsigned int *ret) {
+ *ret = 0;
+ if(vec.capacity() == 10) {
+ *ret = *ret | 1<<0;
+ }
+ if(vec.size() == 10) {
+ *ret = *ret | 1<<1;
+ }
+
+ vec.resize(5);
+
+ if(vec.capacity() == 10) {
+ *ret = *ret | 1<<2;
+ }
+ if(vec.size() == 5) {
+ *ret = *ret | 1<<3;
+ }
+}
+
+__global__ void vector_reset(GPUVectorWrapper<int> vec, unsigned int *ret) {
+ *ret = 0;
+ if(vec.capacity() == 10) {
+ *ret = *ret | 1<<0;
+ }
+ if(vec.size() == 10) {
+ *ret = *ret | 1<<1;
+ }
+
+ vec.reset();
+
+ if(vec.capacity() == 10) {
+ *ret = *ret | 1<<2;
+ }
+ if(vec.size() == 0) {
+ *ret = *ret | 1<<3;
+ }
+}
+
+
+
+TEST_CASE("Tests of GPUVector", "[GPUVector]") {
+ int deviceCount = 0;
+ auto ret = cudaGetDeviceCount( &deviceCount );
+ if(ret != cudaSuccess || deviceCount < 1) {
+ WARN("No CUDA devices, ignoring the tests");
+ return;
+ }
+
+ auto current_device = cuda::device::current::get();
+
+  auto vec_d = GPUVector<int>(10);
+
+ SECTION("Construction") {
+ REQUIRE(vec_d.size() == 0);
+ REQUIRE(vec_d.capacity() == 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+ vector_sizeCapacity<<<1, 1>>>(vec_d, res_d.get());
+ current_device.synchronize();
+
+ unsigned int res;
+ cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+ auto ret = std::bitset<2>(res);
+ for(int i=0; i<2; ++i) {
+ INFO("Bit " << i);
+ CHECK(ret.test(i));
+ }
+ }
+
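+  // a blocking stream: it synchronizes implicitly with the legacy default stream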
+ auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream);
+  auto res_h = std::vector<int>(10, 0);
+
+ SECTION("Copy to device") {
+    auto vec_h = std::vector<int>(10);
+ std::iota(vec_h.begin(), vec_h.end(), 0);
+
+ SECTION("Synchronous") {
+ vec_d.copyFrom(vec_h.data(), 10);
+
+      auto res_d = cuda::memory::device::make_unique<int[]>(current_device, 10);
+ vector_elements<<<1, 10>>>(vec_d, res_d.get());
+ current_device.synchronize();
+
+ cuda::memory::copy(res_h.data(), res_d.get(), 10*sizeof(int));
+ for(int i=0; i<10; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == 1); // all comparisons are true
+ }
+ }
+
+ SECTION("Asynchronous") {
+ vec_d.copyFromAsync(vec_h.data(), 10, stream.id());
+
+      auto res_d = cuda::memory::device::make_unique<int[]>(current_device, 10);
+ vector_elements<<<1, 10, 0, stream.id()>>>(vec_d, res_d.get());
+
+ cuda::memory::async::copy(res_h.data(), res_d.get(), 10*sizeof(int), stream.id());
+ stream.synchronize();
+ for(int i=0; i<10; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == 1); // all comparisons are true
+ }
+ }
+ }
+
+ SECTION("Copy from device") {
+    auto vec_h = std::vector<int>(10);
+ std::iota(vec_h.begin(), vec_h.end(), 0);
+
+ SECTION("Synchronous") {
+ vec_d.copyFrom(vec_h.data(), 10);
+
+ auto ret = vec_d.copyTo(res_h.data(), 10);
+ REQUIRE(ret == 10);
+ for(int i=0; i<10; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == i);
+ }
+
+ ret = vec_d.copyTo(res_h.data(), 5);
+ REQUIRE(ret == 5);
+ for(int i=0; i<5; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == i);
+ }
+
+ ret = vec_d.copyTo(res_h.data(), 20);
+ REQUIRE(ret == 10);
+ }
+
+ SECTION("Asynchronous") {
+ vec_d.copyFromAsync(vec_h.data(), 10, stream.id());
+ vec_d.updateMetadataAsync(stream.id());
+ stream.synchronize();
+
+ auto ret = vec_d.copyToAsync(res_h.data(), 10, stream.id());
+ REQUIRE(ret == 10);
+ stream.synchronize();
+ for(int i=0; i<10; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == i);
+ }
+
+ std::fill(res_h.begin(), res_h.end(), -1);
+ ret = vec_d.copyToAsync(res_h.data(), 5, stream.id());
+ REQUIRE(ret == 5);
+ stream.synchronize();
+ for(int i=0; i<5; ++i) {
+ INFO("Index " << i);
+ CHECK(res_h[i] == i);
+ }
+
+ std::fill(res_h.begin(), res_h.end(), -1);
+ ret = vec_d.copyToAsync(res_h.data(), 20, stream.id());
+ REQUIRE(ret == 10);
+ }
+ }
+
+ SECTION("push_back") {
+ vector_pushback<<<1, 10>>>(vec_d);
+ current_device.synchronize();
+
+ vec_d.updateMetadata();
+ REQUIRE(vec_d.size() == 10);
+
+ vec_d.copyTo(res_h.data(), 10);
+ for(int i=0; i<10; ++i) {
+ CHECK(std::find(res_h.begin(), res_h.end(), i) != res_h.end());
+ }
+ }
+
+ SECTION("emplace_back") {
+ vector_emplaceback<<<1, 10>>>(vec_d);
+ current_device.synchronize();
+
+ vec_d.updateMetadata();
+ REQUIRE(vec_d.size() == 10);
+
+ vec_d.copyTo(res_h.data(), 10);
+ for(int i=0; i<10; ++i) {
+ CHECK(std::find(res_h.begin(), res_h.end(), i) != res_h.end());
+ }
+ }
+
+ SECTION("Element access") {
+    auto vec_h = std::vector<int>(10);
+ std::iota(vec_h.begin(), vec_h.end(), 0);
+ vec_d.copyFrom(vec_h.data(), 10);
+
+ vector_access<<<1, 9>>>(vec_d);
+ current_device.synchronize();
+
+ vec_d.copyTo(res_h.data(), 10);
+ for(int i=0; i<9; ++i) {
+ CHECK(res_h[i] == i*2);
+ }
+ CHECK(res_h[9] == 9+9);
+ }
+
+ SECTION("Resize") {
+    auto vec_h = std::vector<int>(10);
+ std::iota(vec_h.begin(), vec_h.end(), 0);
+ vec_d.copyFrom(vec_h.data(), 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+ vector_resize<<<1, 1>>>(vec_d, res_d.get());
+
+ unsigned int res;
+ cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+ auto ret = std::bitset<4>(res);
+ for(int i=0; i<4; ++i) {
+ INFO("Bit " << i);
+ CHECK(ret.test(i));
+ }
+ }
+
+ SECTION("Reset") {
+    auto vec_h = std::vector<int>(10);
+ std::iota(vec_h.begin(), vec_h.end(), 0);
+ vec_d.copyFrom(vec_h.data(), 10);
+
+    auto res_d = cuda::memory::device::make_unique<unsigned int>(current_device);
+ vector_reset<<<1, 1>>>(vec_d, res_d.get());
+
+ unsigned int res;
+ cuda::memory::copy(&res, res_d.get(), sizeof(unsigned int));
+ auto ret = std::bitset<4>(res);
+ for(int i=0; i<4; ++i) {
+ INFO("Bit " << i);
+ CHECK(ret.test(i));
+ }
+ }
+}
diff --git a/HeterogeneousCore/CUDAUtilities/test/test_main.cpp b/HeterogeneousCore/CUDAUtilities/test/test_main.cpp
new file mode 100644
index 0000000000000..0c7c351f437f5
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/test_main.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"