Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ install(
if(BUILD_TESTING)
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)

et_cxx_test(
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
aoti_cuda_backend
)
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)

et_cxx_test(
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
EXTRA_LIBS aoti_cuda_backend
Expand Down
18 changes: 18 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ runtime.cxx_library(
"//executorch/runtime/core:device_allocator",
],
deps = [
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/platform:platform",
],
nvcc_flags = get_nvcc_arch_args() + [
Expand Down Expand Up @@ -163,3 +164,20 @@ cpp_unittest(
platform = "gpu-remote-execution",
),
)

cpp_unittest(
name = "test_cuda_allocator",
srcs = ["test/test_cuda_allocator.cpp"],
deps = [
":cuda_allocator",
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/core:core",
"//executorch/runtime/platform:platform",
],
external_deps = [("cuda", None, "cuda-lazy")],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
keep_gpu_sections = True,
remote_execution = re_test_utils.remote_execution(
platform = "gpu-remote-execution",
),
)
66 changes: 60 additions & 6 deletions backends/cuda/runtime/cuda_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cuda_runtime.h>

#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/platform/log.h>
Comment on lines 11 to 14

namespace executorch::backends::cuda {
Expand Down Expand Up @@ -124,12 +125,30 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
}
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_host_to_device(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copy_host_to_device and copy_device_to_host are almost identical now (null checks, nbytes==0, the index guard, device save/restore, the stream branch), differing only in the copy kind and the sync. Could fold them into one private helper that takes the cudaMemcpyKind to keep the shared logic in one place.

ET_CHECK_OR_RETURN_ERROR(
dst != nullptr,
InvalidArgument,
"CudaAllocator::copy_host_to_device dst is null");
ET_CHECK_OR_RETURN_ERROR(
src != nullptr,
InvalidArgument,
"CudaAllocator::copy_host_to_device src is null");
if (nbytes == 0) {
return Error::Ok;
}
// TODO: validate caller stream device matches index.
// For now assert index is -1 or 0.
ET_CHECK_OR_RETURN_ERROR(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The index==-1 || index==0 guard is too broad. The reason to restrict the device is that a green-context caller stream is tied to its creation device. The plain cudaMemcpy path has no caller stream and used to handle any index via setDevice/restore, and tensor_ptr.cpp passes arbitrary indices and aborts on a non-Ok result, so this guard crashes multi-GPU callers that worked before. Suggest checking the device only in the caller-stream branch and letting the no-stream path copy on any index>=-1 like before. Same applies to copy_device_to_host (line 207).

index == -1 || index == 0,
InvalidArgument,
"CudaAllocator::copy_host_to_device only supports device 0 or -1 (current), got %d",
static_cast<int>(index));
Comment on lines +144 to +150

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

Expand All @@ -139,8 +158,16 @@ Error CudaAllocator::copy_host_to_device(
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
cudaError_t err = cudaSuccess;
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
err = cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyHostToDevice, *caller_stream);
// We don't synchronize the stream here because the caller is expected to
Comment on lines +161 to +166
// synchronize the stream.
} else {
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
}

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
Expand All @@ -158,12 +185,30 @@ Error CudaAllocator::copy_host_to_device(
return Error::Ok;
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_device_to_host(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {
ET_CHECK_OR_RETURN_ERROR(
dst != nullptr,
InvalidArgument,
"CudaAllocator::copy_device_to_host dst is null");
ET_CHECK_OR_RETURN_ERROR(
src != nullptr,
InvalidArgument,
"CudaAllocator::copy_device_to_host src is null");
if (nbytes == 0) {
return Error::Ok;
}
// TODO: validate caller stream device matches index.
// For now assert index is -1 or 0.
ET_CHECK_OR_RETURN_ERROR(
index == -1 || index == 0,
InvalidArgument,
"CudaAllocator::copy_device_to_host only supports device 0 or -1 (current), got %d",
static_cast<int>(index));
Comment on lines +204 to +210

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

Expand All @@ -173,8 +218,17 @@ Error CudaAllocator::copy_device_to_host(
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
cudaError_t err = cudaSuccess;
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
err = cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyDeviceToHost, *caller_stream);
if (err == cudaSuccess) {
err = cudaStreamSynchronize(*caller_stream);
}
Comment on lines +223 to +228
} else {
Comment on lines +223 to +229
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
}

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
Expand Down
150 changes: 150 additions & 0 deletions backends/cuda/runtime/test/test_cuda_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <gtest/gtest.h>

#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

#include <executorch/backends/cuda/runtime/cuda_allocator.h>
#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/platform.h>

using executorch::backends::cuda::CudaAllocator;
using executorch::runtime::Error;

class CudaAllocatorTest : public testing::Test {
protected:
void SetUp() override {
et_pal_init();

int device_count = 0;
cudaError_t err = cudaGetDeviceCount(&device_count);
if (err != cudaSuccess || device_count == 0) {
GTEST_SKIP() << "CUDA not available";
}
}
};

TEST_F(CudaAllocatorTest, CopyHostToDevice) {
CudaAllocator& a = CudaAllocator::instance();
constexpr size_t N = 1024;
auto res = a.allocate(N, 0);
ASSERT_TRUE(res.ok());
void* dptr = res.get();

std::vector<uint8_t> h_src(N, 42);
EXPECT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);

a.deallocate(dptr, 0);
Comment on lines +44 to +47
}

TEST_F(CudaAllocatorTest, CopyDeviceToHost) {
CudaAllocator& a = CudaAllocator::instance();
constexpr size_t N = 1024;
auto res = a.allocate(N, 0);
ASSERT_TRUE(res.ok());
void* dptr = res.get();

std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);

a.deallocate(dptr, 0);
}

TEST_F(CudaAllocatorTest, CopyHostToDeviceWithCallerStream) {
int device = 0;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
// TODO: validate caller stream device matches index once CallerStreamGuard
// exposes device. For now assert single-GPU case.
cudaStream_t s;
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
{
executorch::extension::cuda::CallerStreamGuard g(s);

CudaAllocator& a = CudaAllocator::instance();
auto res = a.allocate(256, 0);
ASSERT_TRUE(res.ok());
void* d = res.get();
std::vector<uint8_t> h(256, 7);
// should take async branch internally, still return Ok
EXPECT_EQ(a.copy_host_to_device(d, h.data(), 256, 0), Error::Ok);
ASSERT_EQ(cudaStreamSynchronize(s), cudaSuccess);
a.deallocate(d, 0);
Comment on lines +80 to +84
}
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
}

TEST_F(CudaAllocatorTest, CopyDeviceToHostWithCallerStream) {
int device = 0;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
// TODO: validate caller stream device matches index once CallerStreamGuard
// exposes device. For now assert single-GPU case.
cudaStream_t s;
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
{
executorch::extension::cuda::CallerStreamGuard g(s);

CudaAllocator& a = CudaAllocator::instance();
auto res = a.allocate(256, 0);
ASSERT_TRUE(res.ok());
void* d = res.get();
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);

a.deallocate(d, 0);
}
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
}

TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
// null dst should fail gracefully not CHECK abort
std::vector<uint8_t> h(8, 1);
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
void* dummy_dst = reinterpret_cast<void*>(0x1);
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
void* dummy_src = reinterpret_cast<void*>(0x1);
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
CudaAllocator& a = CudaAllocator::instance();
std::vector<uint8_t> h(8, 1);
// null src should fail gracefully not CHECK abort
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}
Loading