Skip to content

Commit 1e001a5

Browse files
Conarnarfacebook-github-bot
authored andcommitted
Use caller CUDA stream for D2H and H2D copies (pytorch#20498)
Summary: CudaAllocator memory copies now support async copy on a caller-provided CUDA stream. When a caller stream is available (via `getCallerStream()`), `copy_host_to_device` and `copy_device_to_host` use `cudaMemcpyAsync`. When no caller stream is set, the synchronous `cudaMemcpy` path is used as before. Additionally: - Added null pointer and zero-byte validation — null `dst`/`src` return `Error::InvalidArgument` instead of aborting in `cudaMemcpy`, and zero-byte copies return `Error::Ok` early. - Assert single-GPU case (index 0 or -1) until multi-GPU stream validation is added. - Wired `//executorch/extension/cuda:caller_stream` dependency in TARGETS. - Added `test_cuda_allocator` with coverage for sync/async paths and error handling. Differential Revision: D109590531
1 parent bb28350 commit 1e001a5

4 files changed

Lines changed: 239 additions & 6 deletions

File tree

backends/cuda/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,12 @@ install(
243243
if(BUILD_TESTING)
244244
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
245245

246+
et_cxx_test(
247+
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
248+
aoti_cuda_backend
249+
)
250+
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)
251+
246252
et_cxx_test(
247253
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
248254
EXTRA_LIBS aoti_cuda_backend

backends/cuda/runtime/TARGETS

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ runtime.cxx_library(
9292
"//executorch/runtime/core:device_allocator",
9393
],
9494
deps = [
95+
"//executorch/extension/cuda:caller_stream",
9596
"//executorch/runtime/platform:platform",
9697
],
9798
nvcc_flags = get_nvcc_arch_args() + [
@@ -163,3 +164,20 @@ cpp_unittest(
163164
platform = "gpu-remote-execution",
164165
),
165166
)
167+
168+
cpp_unittest(
169+
name = "test_cuda_allocator",
170+
srcs = ["test/test_cuda_allocator.cpp"],
171+
deps = [
172+
":cuda_allocator",
173+
"//executorch/extension/cuda:caller_stream",
174+
"//executorch/runtime/core:core",
175+
"//executorch/runtime/platform:platform",
176+
],
177+
external_deps = [("cuda", None, "cuda-lazy")],
178+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
179+
keep_gpu_sections = True,
180+
remote_execution = re_test_utils.remote_execution(
181+
platform = "gpu-remote-execution",
182+
),
183+
)

backends/cuda/runtime/cuda_allocator.cpp

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <cuda_runtime.h>
1212

13+
#include <executorch/extension/cuda/caller_stream.h>
1314
#include <executorch/runtime/platform/log.h>
1415

1516
namespace executorch::backends::cuda {
@@ -124,12 +125,30 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
124125
}
125126
}
126127

127-
// TODO(gasoonjia): Add support for async copy
128128
Error CudaAllocator::copy_host_to_device(
129129
void* dst,
130130
const void* src,
131131
size_t nbytes,
132132
DeviceIndex index) {
133+
ET_CHECK_OR_RETURN_ERROR(
134+
dst != nullptr,
135+
InvalidArgument,
136+
"CudaAllocator::copy_host_to_device dst is null");
137+
ET_CHECK_OR_RETURN_ERROR(
138+
src != nullptr,
139+
InvalidArgument,
140+
"CudaAllocator::copy_host_to_device src is null");
141+
if (nbytes == 0) {
142+
return Error::Ok;
143+
}
144+
// TODO: validate caller stream device matches index.
145+
// For now assert index is -1 or 0.
146+
ET_CHECK_OR_RETURN_ERROR(
147+
index == -1 || index == 0,
148+
InvalidArgument,
149+
"CudaAllocator::copy_host_to_device only supports device 0 or -1 (current), got %d",
150+
static_cast<int>(index));
151+
133152
int prev_device = 0;
134153
cudaError_t prev_device_err = cudaSuccess;
135154

@@ -139,8 +158,16 @@ Error CudaAllocator::copy_host_to_device(
139158
cudaSetDevice(index);
140159
}
141160
}
142-
143-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
161+
cudaError_t err = cudaSuccess;
162+
const auto caller_stream = executorch::extension::cuda::getCallerStream();
163+
if (caller_stream) {
164+
err = cudaMemcpyAsync(
165+
dst, src, nbytes, cudaMemcpyHostToDevice, *caller_stream);
166+
// We don't synchronize the stream here because the caller is expected to
167+
// synchronize the stream.
168+
} else {
169+
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
170+
}
144171

145172
if (index >= 0 && prev_device_err == cudaSuccess) {
146173
cudaSetDevice(prev_device);
@@ -158,12 +185,30 @@ Error CudaAllocator::copy_host_to_device(
158185
return Error::Ok;
159186
}
160187

161-
// TODO(gasoonjia): Add support for async copy
162188
Error CudaAllocator::copy_device_to_host(
163189
void* dst,
164190
const void* src,
165191
size_t nbytes,
166192
DeviceIndex index) {
193+
ET_CHECK_OR_RETURN_ERROR(
194+
dst != nullptr,
195+
InvalidArgument,
196+
"CudaAllocator::copy_device_to_host dst is null");
197+
ET_CHECK_OR_RETURN_ERROR(
198+
src != nullptr,
199+
InvalidArgument,
200+
"CudaAllocator::copy_device_to_host src is null");
201+
if (nbytes == 0) {
202+
return Error::Ok;
203+
}
204+
// TODO: validate caller stream device matches index.
205+
// For now assert index is -1 or 0.
206+
ET_CHECK_OR_RETURN_ERROR(
207+
index == -1 || index == 0,
208+
InvalidArgument,
209+
"CudaAllocator::copy_device_to_host only supports device 0 or -1 (current), got %d",
210+
static_cast<int>(index));
211+
167212
int prev_device = 0;
168213
cudaError_t prev_device_err = cudaSuccess;
169214

@@ -173,8 +218,17 @@ Error CudaAllocator::copy_device_to_host(
173218
cudaSetDevice(index);
174219
}
175220
}
176-
177-
cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
221+
cudaError_t err = cudaSuccess;
222+
const auto caller_stream = executorch::extension::cuda::getCallerStream();
223+
if (caller_stream) {
224+
err = cudaMemcpyAsync(
225+
dst, src, nbytes, cudaMemcpyDeviceToHost, *caller_stream);
226+
if (err == cudaSuccess) {
227+
err = cudaStreamSynchronize(*caller_stream);
228+
}
229+
} else {
230+
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
231+
}
178232

179233
if (index >= 0 && prev_device_err == cudaSuccess) {
180234
cudaSetDevice(prev_device);
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <cuda_runtime.h>
12+
13+
#include <cstdint>
14+
#include <vector>
15+
16+
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
17+
#include <executorch/extension/cuda/caller_stream.h>
18+
#include <executorch/runtime/core/error.h>
19+
#include <executorch/runtime/platform/platform.h>
20+
21+
using executorch::backends::cuda::CudaAllocator;
22+
using executorch::runtime::Error;
23+
24+
namespace {
25+
bool cuda_device_available() {
26+
int device_count = 0;
27+
const cudaError_t err = cudaGetDeviceCount(&device_count);
28+
return err == cudaSuccess && device_count > 0;
29+
}
30+
} // namespace
31+
32+
class CudaAllocatorTest : public testing::Test {
33+
protected:
34+
void SetUp() override {
35+
if (!cuda_device_available()) {
36+
GTEST_SKIP() << "CUDA device unavailable";
37+
}
38+
et_pal_init();
39+
}
40+
};
41+
42+
TEST_F(CudaAllocatorTest, CopyHostToDevice) {
43+
CudaAllocator& a = CudaAllocator::instance();
44+
constexpr size_t N = 1024;
45+
auto res = a.allocate(N, 0);
46+
ASSERT_TRUE(res.ok());
47+
void* dptr = res.get();
48+
49+
std::vector<uint8_t> h_src(N, 42);
50+
EXPECT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
51+
52+
a.deallocate(dptr, 0);
53+
}
54+
55+
TEST_F(CudaAllocatorTest, CopyDeviceToHost) {
56+
CudaAllocator& a = CudaAllocator::instance();
57+
constexpr size_t N = 1024;
58+
auto res = a.allocate(N, 0);
59+
ASSERT_TRUE(res.ok());
60+
void* dptr = res.get();
61+
62+
std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
63+
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
64+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
65+
EXPECT_EQ(h_src, h_dst);
66+
67+
a.deallocate(dptr, 0);
68+
}
69+
70+
TEST_F(CudaAllocatorTest, CopyHostToDeviceWithCallerStream) {
71+
int device = 0;
72+
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
73+
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
74+
// TODO: validate caller stream device matches index once CallerStreamGuard
75+
// exposes device. For now assert single-GPU case.
76+
cudaStream_t s;
77+
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
78+
{
79+
executorch::extension::cuda::CallerStreamGuard g(s);
80+
81+
CudaAllocator& a = CudaAllocator::instance();
82+
auto res = a.allocate(256, 0);
83+
ASSERT_TRUE(res.ok());
84+
void* d = res.get();
85+
std::vector<uint8_t> h(256, 7);
86+
// should take async branch internally, still return Ok
87+
EXPECT_EQ(a.copy_host_to_device(d, h.data(), 256, 0), Error::Ok);
88+
ASSERT_EQ(cudaStreamSynchronize(s), cudaSuccess);
89+
a.deallocate(d, 0);
90+
}
91+
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
92+
}
93+
94+
TEST_F(CudaAllocatorTest, CopyDeviceToHostWithCallerStream) {
95+
int device = 0;
96+
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
97+
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
98+
// TODO: validate caller stream device matches index once CallerStreamGuard
99+
// exposes device. For now assert single-GPU case.
100+
cudaStream_t s;
101+
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
102+
{
103+
executorch::extension::cuda::CallerStreamGuard g(s);
104+
105+
CudaAllocator& a = CudaAllocator::instance();
106+
auto res = a.allocate(256, 0);
107+
ASSERT_TRUE(res.ok());
108+
void* d = res.get();
109+
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
110+
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
111+
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
112+
EXPECT_EQ(h_src, h_dst);
113+
114+
a.deallocate(d, 0);
115+
}
116+
ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
117+
}
118+
119+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
120+
CudaAllocator& a = CudaAllocator::instance();
121+
// null dst should fail gracefully not CHECK abort
122+
std::vector<uint8_t> h(8, 1);
123+
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
124+
EXPECT_EQ(e, Error::InvalidArgument)
125+
<< "expected InvalidArgument for null dst, got "
126+
<< static_cast<uint32_t>(e);
127+
}
128+
129+
TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
130+
CudaAllocator& a = CudaAllocator::instance();
131+
void* dummy_dst = reinterpret_cast<void*>(0x1);
132+
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
133+
EXPECT_EQ(e, Error::InvalidArgument)
134+
<< "expected InvalidArgument for null src, got "
135+
<< static_cast<uint32_t>(e);
136+
}
137+
138+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
139+
CudaAllocator& a = CudaAllocator::instance();
140+
void* dummy_src = reinterpret_cast<void*>(0x1);
141+
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
142+
EXPECT_EQ(e, Error::InvalidArgument)
143+
<< "expected InvalidArgument for null dst, got "
144+
<< static_cast<uint32_t>(e);
145+
}
146+
147+
TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
148+
CudaAllocator& a = CudaAllocator::instance();
149+
std::vector<uint8_t> h(8, 1);
150+
// null src should fail gracefully not CHECK abort
151+
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
152+
EXPECT_EQ(e, Error::InvalidArgument)
153+
<< "expected InvalidArgument for null src, got "
154+
<< static_cast<uint32_t>(e);
155+
}

0 commit comments

Comments
 (0)