Skip to content

Commit c657616

Browse files
Conarnarfacebook-github-bot
authored andcommitted
Use caller CUDA stream for D2H and H2D copies (#20498)
Summary: CudaAllocator memory copies now support async copy on a caller-provided CUDA stream. When a caller stream is available (via `getCallerStream()`), `copy_host_to_device` and `copy_device_to_host` use `cudaMemcpyAsync`. When no caller stream is set, the synchronous `cudaMemcpy` path is used as before. Additionally: - Added null pointer and zero-byte validation — null `dst`/`src` return `Error::InvalidArgument` instead of aborting in `cudaMemcpy`, and zero-byte copies return `Error::Ok` early. - Assert single-GPU case (index 0 or -1) until multi-GPU stream validation is added. - Wired `//executorch/extension/cuda:caller_stream` dependency in TARGETS. - Added `extension_cuda` dependencies to CMakeLists.txt. - Added `test_cuda_allocator` with coverage for sync/async paths and error handling. Reviewed By: Gasoonjia Differential Revision: D109590531
1 parent b919db7 commit c657616

5 files changed

Lines changed: 231 additions & 57 deletions

File tree

backends/cuda/CMakeLists.txt

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ endif()
160160
if(_cuda_is_msvc_toolchain)
161161
target_link_libraries(
162162
aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand
163-
${CMAKE_DL_LIBS}
163+
extension_cuda ${CMAKE_DL_LIBS}
164164
)
165165
# Link object library directly so symbols are pulled exactly once while
166166
# avoiding duplicate static/object inclusion and interface leakage.
@@ -169,8 +169,13 @@ else()
169169
target_link_libraries(
170170
aoti_cuda_shims
171171
PRIVATE cuda_platform
172-
PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
173-
CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS}
172+
PUBLIC -Wl,--whole-archive
173+
aoti_common_shims_slim
174+
-Wl,--no-whole-archive
175+
CUDA::cudart
176+
CUDA::curand
177+
extension_cuda
178+
${CMAKE_DL_LIBS}
174179
)
175180
endif()
176181

@@ -243,6 +248,12 @@ install(
243248
if(BUILD_TESTING)
244249
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
245250

251+
et_cxx_test(
252+
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
253+
aoti_cuda_backend
254+
)
255+
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)
256+
246257
et_cxx_test(
247258
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
248259
EXTRA_LIBS aoti_cuda_backend

backends/cuda/runtime/TARGETS

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ runtime.cxx_library(
9292
"//executorch/runtime/core:device_allocator",
9393
],
9494
deps = [
95+
"//executorch/extension/cuda:caller_stream",
9596
"//executorch/runtime/platform:platform",
9697
],
9798
nvcc_flags = get_nvcc_arch_args() + [
@@ -163,3 +164,20 @@ cpp_unittest(
163164
platform = "gpu-remote-execution",
164165
),
165166
)
167+
168+
cpp_unittest(
169+
name = "test_cuda_allocator",
170+
srcs = ["test/test_cuda_allocator.cpp"],
171+
deps = [
172+
":cuda_allocator",
173+
"//executorch/extension/cuda:caller_stream",
174+
"//executorch/runtime/core:core",
175+
"//executorch/runtime/platform:platform",
176+
],
177+
external_deps = [("cuda", None, "cuda-lazy")],
178+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
179+
keep_gpu_sections = True,
180+
remote_execution = re_test_utils.remote_execution(
181+
platform = "gpu-remote-execution",
182+
),
183+
)

backends/cuda/runtime/cuda_allocator.cpp

Lines changed: 82 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <cuda_runtime.h>
1212

13+
#include <executorch/extension/cuda/caller_stream.h>
1314
#include <executorch/runtime/platform/log.h>
1415

1516
namespace executorch::backends::cuda {
@@ -19,6 +20,85 @@ using executorch::runtime::Result;
1920
using executorch::runtime::etensor::DeviceIndex;
2021
using executorch::runtime::etensor::DeviceType;
2122

23+
namespace {
24+
25+
Error copy_impl(
26+
void* dst,
27+
const void* src,
28+
size_t nbytes,
29+
DeviceIndex index,
30+
cudaMemcpyKind kind) {
31+
ET_CHECK_OR_RETURN_ERROR(
32+
kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToHost,
33+
InvalidArgument,
34+
"CudaAllocator::copy_impl: unsupported cudaMemcpyKind %d",
35+
static_cast<int>(kind));
36+
const char* method = kind == cudaMemcpyHostToDevice
37+
? "CudaAllocator::copy_host_to_device"
38+
: "CudaAllocator::copy_device_to_host";
39+
ET_CHECK_OR_RETURN_ERROR(
40+
dst != nullptr, InvalidArgument, "%s: dst is null", method);
41+
ET_CHECK_OR_RETURN_ERROR(
42+
src != nullptr, InvalidArgument, "%s: src is null", method);
43+
ET_CHECK_OR_RETURN_ERROR(
44+
index >= -1,
45+
InvalidArgument,
46+
"%s: invalid device index %d (must be >= -1)",
47+
method,
48+
static_cast<int>(index));
49+
const auto caller_stream = executorch::extension::cuda::getCallerStream();
50+
if (caller_stream) {
51+
// TODO: validate caller stream device matches index.
52+
// For now assert index is -1 or 0.
53+
ET_CHECK_OR_RETURN_ERROR(
54+
index == -1 || index == 0,
55+
InvalidArgument,
56+
"%s: with caller stream, only supports device 0 or -1 (current), got %d",
57+
method,
58+
static_cast<int>(index));
59+
}
60+
if (nbytes == 0) {
61+
return Error::Ok;
62+
}
63+
64+
int prev_device = 0;
65+
cudaError_t prev_device_err = cudaSuccess;
66+
67+
if (index >= 0) {
68+
prev_device_err = cudaGetDevice(&prev_device);
69+
if (prev_device_err == cudaSuccess) {
70+
cudaSetDevice(index);
71+
}
72+
}
73+
cudaError_t err = cudaSuccess;
74+
if (caller_stream) {
75+
err = cudaMemcpyAsync(dst, src, nbytes, kind, *caller_stream);
76+
if (err == cudaSuccess && kind == cudaMemcpyDeviceToHost) {
77+
err = cudaStreamSynchronize(*caller_stream);
78+
}
79+
} else {
80+
err = cudaMemcpy(dst, src, nbytes, kind);
81+
}
82+
83+
if (index >= 0 && prev_device_err == cudaSuccess) {
84+
cudaSetDevice(prev_device);
85+
}
86+
87+
if (err != cudaSuccess) {
88+
ET_LOG(
89+
Error,
90+
"cudaMemcpy %s failed: %s (%zu bytes, device %d)",
91+
kind == cudaMemcpyHostToDevice ? "H2D" : "D2H",
92+
cudaGetErrorString(err),
93+
nbytes,
94+
static_cast<int>(index));
95+
return Error::Internal;
96+
}
97+
return Error::Ok;
98+
}
99+
100+
} // namespace
101+
22102
Result<void*>
23103
CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) {
24104
// index == -1 means "use the current CUDA device"; any value < -1 is invalid.
@@ -124,72 +204,20 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
124204
}
125205
}
126206

127-
// TODO(gasoonjia): Add support for async copy
128207
Error CudaAllocator::copy_host_to_device(
129208
void* dst,
130209
const void* src,
131210
size_t nbytes,
132211
DeviceIndex index) {
133-
int prev_device = 0;
134-
cudaError_t prev_device_err = cudaSuccess;
135-
136-
if (index >= 0) {
137-
prev_device_err = cudaGetDevice(&prev_device);
138-
if (prev_device_err == cudaSuccess) {
139-
cudaSetDevice(index);
140-
}
141-
}
142-
143-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
144-
145-
if (index >= 0 && prev_device_err == cudaSuccess) {
146-
cudaSetDevice(prev_device);
147-
}
148-
149-
if (err != cudaSuccess) {
150-
ET_LOG(
151-
Error,
152-
"cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
153-
cudaGetErrorString(err),
154-
nbytes,
155-
static_cast<int>(index));
156-
return Error::Internal;
157-
}
158-
return Error::Ok;
212+
return copy_impl(dst, src, nbytes, index, cudaMemcpyHostToDevice);
159213
}
160214

161-
// TODO(gasoonjia): Add support for async copy
162215
Error CudaAllocator::copy_device_to_host(
163216
void* dst,
164217
const void* src,
165218
size_t nbytes,
166219
DeviceIndex index) {
167-
int prev_device = 0;
168-
cudaError_t prev_device_err = cudaSuccess;
169-
170-
if (index >= 0) {
171-
prev_device_err = cudaGetDevice(&prev_device);
172-
if (prev_device_err == cudaSuccess) {
173-
cudaSetDevice(index);
174-
}
175-
}
176-
177-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
178-
179-
if (index >= 0 && prev_device_err == cudaSuccess) {
180-
cudaSetDevice(prev_device);
181-
}
182-
183-
if (err != cudaSuccess) {
184-
ET_LOG(
185-
Error,
186-
"cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
187-
cudaGetErrorString(err),
188-
nbytes,
189-
static_cast<int>(index));
190-
return Error::Internal;
191-
}
192-
return Error::Ok;
220+
return copy_impl(dst, src, nbytes, index, cudaMemcpyDeviceToHost);
193221
}
194222

195223
DeviceType CudaAllocator::device_type() const {
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <cuda_runtime.h>
12+
13+
#include <cstdint>
14+
#include <vector>
15+
16+
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
17+
#include <executorch/extension/cuda/caller_stream.h>
18+
#include <executorch/runtime/core/error.h>
19+
#include <executorch/runtime/platform/platform.h>
20+
21+
using executorch::backends::cuda::CudaAllocator;
22+
using executorch::runtime::Error;
23+
24+
class CudaAllocatorTest : public testing::Test {
25+
protected:
26+
void SetUp() override {
27+
et_pal_init();
28+
29+
int device_count = 0;
30+
cudaError_t err = cudaGetDeviceCount(&device_count);
31+
if (err != cudaSuccess || device_count == 0) {
32+
GTEST_SKIP() << "CUDA not available";
33+
}
34+
}
35+
};
36+
37+
TEST_F(CudaAllocatorTest, CopyRoundtrip) {
38+
CudaAllocator& a = CudaAllocator::instance();
39+
constexpr size_t N = 1024;
40+
auto res = a.allocate(N, 0);
41+
ASSERT_TRUE(res.ok());
42+
void* dptr = res.get();
43+
44+
std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
45+
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
46+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
47+
EXPECT_EQ(h_src, h_dst);
48+
49+
a.deallocate(dptr, 0);
50+
}
51+
52+
TEST_F(CudaAllocatorTest, CopyRoundtripWithCallerStream) {
53+
int device = 0;
54+
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
55+
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
56+
// TODO: validate caller stream device matches index once CallerStreamGuard
57+
// exposes device. For now assert single-GPU case.
58+
cudaStream_t s;
59+
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
60+
{
61+
executorch::extension::cuda::CallerStreamGuard g(s);
62+
63+
CudaAllocator& a = CudaAllocator::instance();
64+
auto res = a.allocate(256, 0);
65+
ASSERT_TRUE(res.ok());
66+
void* d = res.get();
67+
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
68+
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
69+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
70+
EXPECT_EQ(h_src, h_dst);
71+
EXPECT_EQ(cudaStreamSynchronize(s), cudaSuccess);
72+
73+
a.deallocate(d, 0);
74+
}
75+
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
76+
}
77+
78+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
79+
CudaAllocator& a = CudaAllocator::instance();
80+
// null dst should fail gracefully not CHECK abort
81+
std::vector<uint8_t> h(8, 1);
82+
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
83+
EXPECT_EQ(e, Error::InvalidArgument)
84+
<< "expected InvalidArgument for null dst, got "
85+
<< static_cast<uint32_t>(e);
86+
}
87+
88+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
89+
CudaAllocator& a = CudaAllocator::instance();
90+
void* dummy_dst = reinterpret_cast<void*>(0x1);
91+
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
92+
EXPECT_EQ(e, Error::InvalidArgument)
93+
<< "expected InvalidArgument for null src, got "
94+
<< static_cast<uint32_t>(e);
95+
}
96+
97+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
98+
CudaAllocator& a = CudaAllocator::instance();
99+
void* dummy_src = reinterpret_cast<void*>(0x1);
100+
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
101+
EXPECT_EQ(e, Error::InvalidArgument)
102+
<< "expected InvalidArgument for null dst, got "
103+
<< static_cast<uint32_t>(e);
104+
}
105+
106+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
107+
CudaAllocator& a = CudaAllocator::instance();
108+
std::vector<uint8_t> h(8, 1);
109+
// null src should fail gracefully not CHECK abort
110+
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
111+
EXPECT_EQ(e, Error::InvalidArgument)
112+
<< "expected InvalidArgument for null src, got "
113+
<< static_cast<uint32_t>(e);
114+
}

extension/cuda/caller_stream.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ EXECUTORCH_EXTENSION_CUDA_API std::optional<cudaStream_t> getCallerStream();
3737
* context's SM partition; the confinement rides the stream, so the green
3838
* context need not be made current. The caller owns the stream for the guard's
3939
* lifetime.
40+
*
41+
* The user is responsible for handling the lifetimes of host data so that it
42+
* lasts until after async operations that use it are completed.
4043
*/
4144
class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard {
4245
public:

0 commit comments

Comments
 (0)