Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 33 additions & 15 deletions CUDADataFormats/Common/interface/HeterogeneousSoA.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

Expand All @@ -21,15 +22,15 @@ class HeterogeneousSoA {

explicit HeterogeneousSoA(cudautils::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
explicit HeterogeneousSoA(cudautils::cpu::unique_ptr<T> &&p) : cm_ptr(std::move(p)) {}

auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto const &operator*() const { return *get(); }

auto const *operator-> () const { return get(); }

auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : cm_ptr.get()); }

auto &operator*() { return *get(); }

Expand All @@ -47,12 +48,14 @@ class HeterogeneousSoA {
// a union won't do it, a variant will not be more efficient
cudautils::device::unique_ptr<T> dm_ptr; //!
cudautils::host::unique_ptr<T> hm_ptr; //!
std::unique_ptr<T> std_ptr; //!
cudautils::cpu::unique_ptr<T> cm_ptr; //!
};

namespace cudaCompat {

struct GPUTraits {
static constexpr const char * name = "GPU";

template <typename T>
using unique_ptr = cudautils::device::unique_ptr<T>;

Expand Down Expand Up @@ -83,6 +86,8 @@ namespace cudaCompat {
};

struct HostTraits {
static constexpr const char * name = "HOST";

template <typename T>
using unique_ptr = cudautils::host::unique_ptr<T>;

Expand All @@ -108,32 +113,45 @@ namespace cudaCompat {
};

struct CPUTraits {
static constexpr const char * name = "CPU";

template <typename T>
using unique_ptr = std::unique_ptr<T>;
using unique_ptr = cudautils::cpu::unique_ptr<T>;;


template <typename T>
static auto make_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique() {
return cudautils::make_cpu_unique<T>(0);
}

template <typename T>
static auto make_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_unique(size_t size) {
return cudautils::make_cpu_unique<T>(size,0);
}

template <typename T>
static auto make_host_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t) {
return std::make_unique<T>();
static auto make_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t) {
return std::make_unique<T>(size);
static auto make_host_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(stream);
}

template <typename T>
static auto make_device_unique(size_t size, cudaStream_t stream) {
return cudautils::make_cpu_unique<T>(size, stream);
}
};

Expand Down
86 changes: 86 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h
#define HeterogeneousCore_CUDAUtilities_interface_cpu_unique_ptr_h

#include <memory>
#include <functional>

#include <cstdlib>
#include <cuda_runtime.h>
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from a look at the file, I think #include <cuda_runtime.h> could be removed ?


namespace cudautils {
namespace cpu {
namespace impl {
// Additional layer of types to distinguish from device:: and host::unique_ptr
// Deleter for memory obtained with ::malloc(); exists as a distinct type so
// cpu::unique_ptr cannot be confused with the device:: and host:: flavours.
// It only frees the storage — no destructor is run (allocation is restricted
// to trivially-constructible types by make_cpu_unique).
class CPUDeleter {
public:
  CPUDeleter() = default;

  void operator()(void *ptr) { ::free(ptr); }
};
} // namespace impl

// cpu::unique_ptr<T>: a std::unique_ptr whose deleter releases the storage
// with ::free(), matching the ::malloc() done by make_cpu_unique().
template <typename T>
using unique_ptr = std::unique_ptr<T, impl::CPUDeleter>;

namespace impl {
// Return-type selector for make_cpu_unique (same idiom as std::make_unique):
// scalar T exposes `non_array`, T[] exposes `unbounded_array`, and T[N]
// exposes only an empty `bounded_array` tag so the bounded-array form can
// only match the deleted overloads below.
template <typename T>
struct make_cpu_unique_selector {
using non_array = cudautils::cpu::unique_ptr<T>;
};
// Unbounded array: make_cpu_unique<T[]>(n, stream).
template <typename T>
struct make_cpu_unique_selector<T[]> {
using unbounded_array = cudautils::cpu::unique_ptr<T[]>;
};
// Bounded arrays T[N] are intentionally unsupported.
template <typename T, size_t N>
struct make_cpu_unique_selector<T[N]> {
struct bounded_array {};
};
} // namespace impl
} // namespace cpu

// Allocate a single T with ::malloc() and wrap it in a cpu::unique_ptr.
// The memory is NOT initialized (malloc, no constructor call), hence the
// restriction to trivially-constructible types; the cudaStream_t parameter
// is ignored and exists only for signature symmetry with the device/host
// allocators. NOTE(review): the original span was interrupted by pasted
// review-page text; body reconstructed. malloc failure is not checked —
// presumably consistent with the other allocators; confirm.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique(cudaStream_t) {
  static_assert(std::is_trivially_constructible<T>::value,
                "Allocating with non-trivial constructor on the cpu memory is not supported");
  void *mem = ::malloc(sizeof(T));
  return typename cpu::impl::make_cpu_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
                                                                    cpu::impl::CPUDeleter()};
}

// Allocate an uninitialized array of n elements with ::malloc() and return
// it as a cpu::unique_ptr<T[]>; only trivially-constructible element types
// are accepted since no constructors are run. The cudaStream_t is unused.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique(size_t n, cudaStream_t) {
  using element_type = typename std::remove_extent<T>::type;
  static_assert(std::is_trivially_constructible<element_type>::value,
                "Allocating with non-trivial constructor on the cpu memory is not supported");
  auto *storage = reinterpret_cast<element_type *>(::malloc(n * sizeof(element_type)));
  return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{storage, cpu::impl::CPUDeleter()};
}

template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique(Args &&...) = delete;

// Same as make_cpu_unique but without the trivial-constructor static_assert:
// the "_uninitialized" suffix makes the raw, unconstructed-memory semantics
// explicit at the call site. The cudaStream_t is unused.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::non_array make_cpu_unique_uninitialized(cudaStream_t) {
  auto *storage = reinterpret_cast<T *>(::malloc(sizeof(T)));
  return typename cpu::impl::make_cpu_unique_selector<T>::non_array{storage, cpu::impl::CPUDeleter()};
}

// Array counterpart of make_cpu_unique_uninitialized: n elements of raw
// ::malloc storage, no constructor calls and no triviality check.
template <typename T>
typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array make_cpu_unique_uninitialized(size_t n, cudaStream_t) {
  using element_type = typename std::remove_extent<T>::type;
  auto *storage = reinterpret_cast<element_type *>(::malloc(n * sizeof(element_type)));
  return typename cpu::impl::make_cpu_unique_selector<T>::unbounded_array{storage, cpu::impl::CPUDeleter()};
}

// As with make_cpu_unique, bounded arrays (T[N]) are rejected at compile
// time via this deleted overload.
template <typename T, typename... Args>
typename cpu::impl::make_cpu_unique_selector<T>::bounded_array make_cpu_unique_uninitialized(Args &&...) =
    delete;
} // namespace cudautils

#endif
6 changes: 5 additions & 1 deletion HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
*/

#ifndef __CUDACC__
#define CUDA_KERNELS_ON_CPU
#endif

#ifdef CUDA_KERNELS_ON_CPU

#include <algorithm>
#include <cstdint>
Expand Down Expand Up @@ -98,6 +102,6 @@ namespace cudaCompat {
using namespace cudaCompat;
#endif

#endif
#endif // CUDA_KERNELS_ON_CPU

#endif // HeterogeneousCore_CUDAUtilities_interface_cudaCompat_h
12 changes: 10 additions & 2 deletions HeterogeneousCore/CUDAUtilities/interface/launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,14 @@ namespace cudautils {
} // namespace detail

// wrappers for cudaLaunchKernel

inline
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add the inline because it makes sense on its own

void launch(void (*kernel)(), LaunchParameters config) {
#ifdef CUDA_KERNELS_ON_CPU
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but I really, really do not want to add a dependency on #ifdefs etc. here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will find a less intrusive solution

kernel();
#else
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
#endif
}

template <typename F, typename... Args>
Expand All @@ -107,6 +111,9 @@ namespace cudautils {
std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
#endif
launch(F* kernel, LaunchParameters config, Args&&... args) {
#ifdef CUDA_KERNELS_ON_CPU
kernel(args...);
#else
using function_type = detail::kernel_traits<F>;
typename function_type::argument_type_tuple args_copy(args...);

Expand All @@ -116,10 +123,11 @@ namespace cudautils {
detail::pointer_setter<size>()(pointers, args_copy);
cudaCheck(cudaLaunchKernel(
(const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
#endif
}

// wrappers for cudaLaunchCooperativeKernel

inline
void launch_cooperative(void (*kernel)(), LaunchParameters config) {
cudaCheck(cudaLaunchCooperativeKernel(
(const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
Expand Down
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="Launch_t.cpp Launch_t.cu" name="gpuLaunch_t">
</bin>

<bin file="Launch_t.cpp" name="cpuLaunch_t">
<flags CXXFLAGS="-DCUDA_KERNELS_ON_CPU"/>
</bin>

<bin file="test_GPUSimpleVector.cu" name="test_GPUSimpleVector">
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
</bin>
Expand Down
31 changes: 31 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/Launch_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"



#include <cuda_runtime.h>
#include<cstdio>

#undef __global__
#define __global__ inline __attribute__((always_inline))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is needed to avoid multiple definition of the same symbol

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alternative is to have also the c++ definition in its own .cc (not inlined)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this sample case, wouldn't it be enough to have everything in the .cu file ?

Copy link

@fwyzard fwyzard Dec 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, of course that does not work...

What we are trying to do for Cupla and Alpaka is to have the whole implementation in something like test/implement/Launch_t.cc, and then let scram build two versions by having

test/Launch_t.cpp

#define CUDA_KERNELS_ON_CPU
#include "implement/Launch_t.cc"

test/Launch_t.cu

#include "implement/Launch_t.cc"

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the idea of the test is to have a single file compiled by gcc twice (see buildfile)
to test that indeed we can launch kernels from gcc and that the same code will run instead on cpu if CUDA_KERNELS_ON_CPU is defined (in this case as a compiler option).
of course for cuda we need the additional .cu file to compile the device code.
For symmetry one can claim that cpu kernels should be compiled in their on cc (as at the end I do in the vertex producer together with a minimal driver).
Still for cpu the code must be forced inlined to avoid multiple symbols.

so in my opinion (at least with this model)

#define __global__ inline __attribute__((always_inline))

in case of cpu code will be required (and apparently does not harm cuda code).
This is done in cudaCompact.h. I tried to keep this specific test as self-included as possible.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and that is my previous standard: one .h, one .cc one .cu.
here I tried one .cc and one .cu the latter with ONLY the kernel, no driver code.
I want to test launching from code compiled with gcc (having in mind that both cpu and gpu code shall resides in the same "load units", which is not the case in this test I agree).
I can build two tests (or three) to see what is needed to have both gpu and cpu code compiled, loaded and then run in the same executable (with eventually the driver code compiled by gcc even for the gpu case).



// Minimal kernel exercised by both the CPU and GPU launch tests. In the
// CPU build, __global__ is redefined above to `inline always_inline` so this
// header-style definition does not produce duplicate symbols at link time
// (see the redefinition at the top of this file).
__global__
void hello(float k) {

printf("hello %f\n",k);

}




// Driver: launches `hello` once through cudautils::launch. With
// CUDA_KERNELS_ON_CPU defined, launch() calls the kernel directly as a plain
// function; otherwise it goes through cudaLaunchKernel (see launch.h).
int main() {

// NOTE(review): presumably skips/aborts when no CUDA device is available in
// the GPU build — confirm behavior of requireCUDADevices in the CPU build.
requireCUDADevices();

// 1x1 grid/block configuration; 3.14 is the kernel argument.
cudautils::launch(hello,{1, 1},3.14);
cudaDeviceSynchronize();

return 0;
}
10 changes: 10 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/Launch_t.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include<cstdio>

// CUDA-compiled definition of the same `hello` kernel used by Launch_t.cpp:
// this .cu file carries only the device code, with no driver/main.
__global__
void hello(float k) {

printf("hello %f\n",k);

}


31 changes: 31 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/cpu_unique_ptr_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "catch.hpp"

#include "HeterogeneousCore/CUDAUtilities/interface/cpu_unique_ptr.h"

// Unit tests for cudautils::make_cpu_unique / cpu::unique_ptr.
TEST_CASE("cpu_unique_ptr", "[cudaMemTools]") {
  SECTION("Single element") {
    auto ptr = cudautils::make_cpu_unique<int>(0);
    REQUIRE(ptr != nullptr);
  }

  SECTION("Reset") {
    auto ptr = cudautils::make_cpu_unique<int>(0);
    REQUIRE(ptr != nullptr);

    ptr.reset();
    REQUIRE(ptr.get() == nullptr);
  }

  SECTION("Multiple elements") {
    // Fixed: was make_host_unique, which is not declared by the header under
    // test (cpu_unique_ptr.h) and belongs to the host allocator instead.
    auto ptr = cudautils::make_cpu_unique<int[]>(10, 0);
    REQUIRE(ptr != nullptr);
  }

  SECTION("Large allocation and release") {
    constexpr size_t maxSize = 1 << 30;  // 2^30 chars, i.e. 1 GiB
    auto ptr = cudautils::make_cpu_unique<char[]>(maxSize + 1, 0);
    REQUIRE(ptr != nullptr);
    ptr.reset();
    // Fixed: the original asserted ptr != nullptr AFTER reset(), which can
    // never hold — a reset unique_ptr is null by definition.
    REQUIRE(ptr.get() == nullptr);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// in principle we can use "nhits" to heuristically dimension the workspace...
// overkill to use template here (std::make_unique would suffice)
// device_isOuterHitOfCell_ = Traits:: template make_unique<GPUCACell::OuterHitOfCell[]>(cs, std::max(1U,nhits), stream);
device_isOuterHitOfCell_.reset(
(GPUCACell::OuterHitOfCell *)malloc(std::max(1U, nhits) * sizeof(GPUCACell::OuterHitOfCell)));
device_isOuterHitOfCell_ = Traits:: template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U,nhits),stream);
assert(device_isOuterHitOfCell_.get());
gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
nhits,
Expand All @@ -31,8 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellTracks_,
device_theCellTracksContainer_.get());

// device_theCells_ = Traits:: template make_unique<GPUCACell[]>(cs, m_params.maxNumberOfDoublets_, stream);
device_theCells_.reset((GPUCACell *)malloc(sizeof(GPUCACell) * m_params.maxNumberOfDoublets_));
device_theCells_ = Traits:: template make_unique<GPUCACell[]>(m_params.maxNumberOfDoublets_,stream);
if (0 == nhits)
return; // protect against empty events

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH
}

PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const {
PixelTrackHeterogeneous tracks(std::make_unique<pixelTrack::TrackSoA>());
PixelTrackHeterogeneous tracks(cudautils::make_cpu_unique<pixelTrack::TrackSoA>(0));

auto* soa = tracks.get();
assert(soa);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent

assert(tracks);

ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.makeAsync(ctx.stream(), tracks, m_ptMin));
ctx.emplace(iEvent, tokenGPUVertex_, m_gpuAlgo.make<cudaCompat::GPUTraits>(ctx.stream(), tracks, m_ptMin,m_OnGPU));

} else {
auto const* tracks = iEvent.get(tokenCPUTrack_).get();
Expand All @@ -117,7 +117,8 @@ void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent
std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
*/

iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make(tracks, m_ptMin));
cudaStream_t stream=0;
iEvent.emplace(tokenCPUVertex_, m_gpuAlgo.make<cudaCompat::CPUTraits>(stream,tracks, m_ptMin,m_OnGPU));
}
}

Expand Down
Loading