Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ endif()
if(_cuda_is_msvc_toolchain)
target_link_libraries(
aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
${CMAKE_DL_LIBS}
extension_cuda ${CMAKE_DL_LIBS}
)
# Link object library directly so symbols are pulled exactly once while
# avoiding duplicate static/object inclusion and interface leakage.
Expand All @@ -169,8 +169,13 @@ else()
target_link_libraries(
aoti_cuda_shims
PRIVATE cuda_platform
PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
PUBLIC -Wl,--whole-archive
aoti_common_shims_slim
-Wl,--no-whole-archive
CUDA::cudart
CUDA::curand
extension_cuda
${CMAKE_DL_LIBS}
)
endif()

Expand Down Expand Up @@ -243,6 +248,12 @@ install(
if(BUILD_TESTING)
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)

et_cxx_test(
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
aoti_cuda_backend
)
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)

et_cxx_test(
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
EXTRA_LIBS aoti_cuda_backend
Expand Down
18 changes: 18 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ runtime.cxx_library(
"//executorch/runtime/core:device_allocator",
],
deps = [
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/platform:platform",
],
nvcc_flags = get_nvcc_arch_args() + [
Expand Down Expand Up @@ -163,3 +164,20 @@ cpp_unittest(
platform = "gpu-remote-execution",
),
)

cpp_unittest(
name = "test_cuda_allocator",
srcs = ["test/test_cuda_allocator.cpp"],
deps = [
":cuda_allocator",
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/core:core",
"//executorch/runtime/platform:platform",
],
external_deps = [("cuda", None, "cuda-lazy")],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
keep_gpu_sections = True,
remote_execution = re_test_utils.remote_execution(
platform = "gpu-remote-execution",
),
)
136 changes: 82 additions & 54 deletions backends/cuda/runtime/cuda_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cuda_runtime.h>

#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/platform/log.h>
Comment on lines 11 to 14

namespace executorch::backends::cuda {
Expand All @@ -19,6 +20,85 @@ using executorch::runtime::Result;
using executorch::runtime::etensor::DeviceIndex;
using executorch::runtime::etensor::DeviceType;

namespace {

Error copy_impl(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index,
cudaMemcpyKind kind) {
ET_CHECK_OR_RETURN_ERROR(
kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToHost,
InvalidArgument,
"CudaAllocator::copy_impl: unsupported cudaMemcpyKind %d",
static_cast<int>(kind));
const char* method = kind == cudaMemcpyHostToDevice
? "CudaAllocator::copy_host_to_device"
: "CudaAllocator::copy_device_to_host";
ET_CHECK_OR_RETURN_ERROR(
dst != nullptr, InvalidArgument, "%s: dst is null", method);
ET_CHECK_OR_RETURN_ERROR(
src != nullptr, InvalidArgument, "%s: src is null", method);
ET_CHECK_OR_RETURN_ERROR(
index >= -1,
InvalidArgument,
"%s: invalid device index %d (must be >= -1)",
method,
static_cast<int>(index));
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
// TODO: validate caller stream device matches index.
// For now assert index is -1 or 0.
ET_CHECK_OR_RETURN_ERROR(
index == -1 || index == 0,
InvalidArgument,
"%s: with caller stream, only supports device 0 or -1 (current), got %d",
method,
static_cast<int>(index));
}
Comment on lines +49 to +59
if (nbytes == 0) {
return Error::Ok;
}

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

if (index >= 0) {
prev_device_err = cudaGetDevice(&prev_device);
if (prev_device_err == cudaSuccess) {
cudaSetDevice(index);
}
}
cudaError_t err = cudaSuccess;
if (caller_stream) {
err = cudaMemcpyAsync(dst, src, nbytes, kind, *caller_stream);
if (err == cudaSuccess && kind == cudaMemcpyDeviceToHost) {
err = cudaStreamSynchronize(*caller_stream);
}
} else {
err = cudaMemcpy(dst, src, nbytes, kind);
}
Comment on lines +74 to +81

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
}

if (err != cudaSuccess) {
ET_LOG(
Error,
"cudaMemcpy %s failed: %s (%zu bytes, device %d)",
kind == cudaMemcpyHostToDevice ? "H2D" : "D2H",
cudaGetErrorString(err),
nbytes,
static_cast<int>(index));
return Error::Internal;
}
return Error::Ok;
}

} // namespace

Result<void*>
CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) {
// index == -1 means "use the current CUDA device"; any value < -1 is invalid.
Expand Down Expand Up @@ -124,72 +204,20 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
}
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_host_to_device(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copy_host_to_device and copy_device_to_host are almost identical now (null checks, nbytes==0, the index guard, device save/restore, the stream branch), differing only in the copy kind and the sync. Could fold them into one private helper that takes the cudaMemcpyKind to keep the shared logic in one place.

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

if (index >= 0) {
prev_device_err = cudaGetDevice(&prev_device);
if (prev_device_err == cudaSuccess) {
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
}

if (err != cudaSuccess) {
ET_LOG(
Error,
"cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
cudaGetErrorString(err),
nbytes,
static_cast<int>(index));
return Error::Internal;
}
return Error::Ok;
return copy_impl(dst, src, nbytes, index, cudaMemcpyHostToDevice);
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_device_to_host(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {
int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

if (index >= 0) {
prev_device_err = cudaGetDevice(&prev_device);
if (prev_device_err == cudaSuccess) {
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
}

if (err != cudaSuccess) {
ET_LOG(
Error,
"cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
cudaGetErrorString(err),
nbytes,
static_cast<int>(index));
return Error::Internal;
}
return Error::Ok;
return copy_impl(dst, src, nbytes, index, cudaMemcpyDeviceToHost);
}

DeviceType CudaAllocator::device_type() const {
Expand Down
114 changes: 114 additions & 0 deletions backends/cuda/runtime/test/test_cuda_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <gtest/gtest.h>

#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

#include <executorch/backends/cuda/runtime/cuda_allocator.h>
#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/platform.h>

using executorch::backends::cuda::CudaAllocator;
using executorch::runtime::Error;

class CudaAllocatorTest : public testing::Test {
protected:
void SetUp() override {
et_pal_init();

int device_count = 0;
cudaError_t err = cudaGetDeviceCount(&device_count);
if (err != cudaSuccess || device_count == 0) {
GTEST_SKIP() << "CUDA not available";
}
}
};

TEST_F(CudaAllocatorTest, CopyRoundtrip) {
CudaAllocator& a = CudaAllocator::instance();
constexpr size_t N = 1024;
auto res = a.allocate(N, 0);
ASSERT_TRUE(res.ok());
void* dptr = res.get();

std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);

a.deallocate(dptr, 0);
}

TEST_F(CudaAllocatorTest, CopyRoundtripWithCallerStream) {
int device = 0;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
// TODO: validate caller stream device matches index once CallerStreamGuard
// exposes device. For now assert single-GPU case.
cudaStream_t s;
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
{
executorch::extension::cuda::CallerStreamGuard g(s);

CudaAllocator& a = CudaAllocator::instance();
auto res = a.allocate(256, 0);
ASSERT_TRUE(res.ok());
void* d = res.get();
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);
EXPECT_EQ(cudaStreamSynchronize(s), cudaSuccess);

a.deallocate(d, 0);
}
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
}

TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
// null dst should fail gracefully not CHECK abort
std::vector<uint8_t> h(8, 1);
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
void* dummy_dst = reinterpret_cast<void*>(0x1);
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
void* dummy_src = reinterpret_cast<void*>(0x1);
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
std::vector<uint8_t> h(8, 1);
// null src should fail gracefully not CHECK abort
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}
3 changes: 3 additions & 0 deletions extension/cuda/caller_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ EXECUTORCH_EXTENSION_CUDA_API std::optional<cudaStream_t> getCallerStream();
* context's SM partition; the confinement rides the stream, so the green
* context need not be made current. The caller owns the stream for the guard's
* lifetime.
*
* The user is responsible for handling the lifetimes of host data so that it
* lasts until after async operations that use it are completed.
*/
class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard {
public:
Expand Down
Loading