Use caller CUDA stream for D2H and H2D copies (pytorch#20498)

Conarnar · facebook-github-bot · commit 1e001a5f66dd · 2026-06-25T13:50:42.000-07:00
Summary:

CudaAllocator memory copies now support async copy on a caller-provided CUDA stream. When a caller stream is available (via `getCallerStream()`), `copy_host_to_device` and `copy_device_to_host` use `cudaMemcpyAsync`. When no caller stream is set, the synchronous `cudaMemcpy` path is used as before.

Additionally:
- Added null pointer and zero-byte validation — null `dst`/`src` return `Error::InvalidArgument` instead of aborting in `cudaMemcpy`, and zero-byte copies return `Error::Ok` early.
- Assert single-GPU case (index 0 or -1) until multi-GPU stream validation is added.
- Wired `//executorch/extension/cuda:caller_stream` dependency in TARGETS.
- Added `test_cuda_allocator` with coverage for sync/async paths and error handling.

Differential Revision: D109590531
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -243,6 +243,12 @@ install(
 if(BUILD_TESTING)
   include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
+  et_cxx_test(
+    test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS
+    aoti_cuda_backend
+  )
+  target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)
+
   et_cxx_test(
     test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
     EXTRA_LIBS aoti_cuda_backend
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -92,6 +92,7 @@ runtime.cxx_library(
         "//executorch/runtime/core:device_allocator",
     ],
     deps = [
+        "//executorch/extension/cuda:caller_stream",
         "//executorch/runtime/platform:platform",
     ],
     nvcc_flags = get_nvcc_arch_args() + [
@@ -163,3 +164,20 @@ cpp_unittest(
         platform = "gpu-remote-execution",
     ),
 )
+
+cpp_unittest(
+    name = "test_cuda_allocator",
+    srcs = ["test/test_cuda_allocator.cpp"],
+    deps = [
+        ":cuda_allocator",
+        "//executorch/extension/cuda:caller_stream",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/platform:platform",
+    ],
+    external_deps = [("cuda", None, "cuda-lazy")],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+    keep_gpu_sections = True,
+    remote_execution = re_test_utils.remote_execution(
+        platform = "gpu-remote-execution",
+    ),
+)
diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp
@@ -10,6 +10,7 @@
 
 #include <cuda_runtime.h>
 
+#include <executorch/extension/cuda/caller_stream.h>
 #include <executorch/runtime/platform/log.h>
 
 namespace executorch::backends::cuda {
@@ -124,12 +125,30 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
   }
 }
 
-// TODO(gasoonjia): Add support for async copy
 Error CudaAllocator::copy_host_to_device(
     void* dst,
     const void* src,
     size_t nbytes,
     DeviceIndex index) {
+  ET_CHECK_OR_RETURN_ERROR(
+      dst != nullptr,
+      InvalidArgument,
+      "CudaAllocator::copy_host_to_device dst is null");
+  ET_CHECK_OR_RETURN_ERROR(
+      src != nullptr,
+      InvalidArgument,
+      "CudaAllocator::copy_host_to_device src is null");
+  if (nbytes == 0) {
+    return Error::Ok;
+  }
+  // TODO: validate caller stream device matches index.
+  // For now assert index is -1 or 0.
+  ET_CHECK_OR_RETURN_ERROR(
+      index == -1 || index == 0,
+      InvalidArgument,
+      "CudaAllocator::copy_host_to_device only supports device 0 or -1 (current), got %d",
+      static_cast<int>(index));
+
   int prev_device = 0;
   cudaError_t prev_device_err = cudaSuccess;
 
@@ -139,8 +158,16 @@ Error CudaAllocator::copy_host_to_device(
       cudaSetDevice(index);
     }
   }
-
-  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
+  cudaError_t err = cudaSuccess;
+  const auto caller_stream = executorch::extension::cuda::getCallerStream();
+  if (caller_stream) {
+    err = cudaMemcpyAsync(
+        dst, src, nbytes, cudaMemcpyHostToDevice, *caller_stream);
+    // We don't synchronize the stream here because the caller is expected to
+    // synchronize the stream.
+  } else {
+    err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
+  }
 
   if (index >= 0 && prev_device_err == cudaSuccess) {
     cudaSetDevice(prev_device);
@@ -158,12 +185,30 @@ Error CudaAllocator::copy_host_to_device(
   return Error::Ok;
 }
 
-// TODO(gasoonjia): Add support for async copy
 Error CudaAllocator::copy_device_to_host(
     void* dst,
     const void* src,
     size_t nbytes,
     DeviceIndex index) {
+  ET_CHECK_OR_RETURN_ERROR(
+      dst != nullptr,
+      InvalidArgument,
+      "CudaAllocator::copy_device_to_host dst is null");
+  ET_CHECK_OR_RETURN_ERROR(
+      src != nullptr,
+      InvalidArgument,
+      "CudaAllocator::copy_device_to_host src is null");
+  if (nbytes == 0) {
+    return Error::Ok;
+  }
+  // TODO: validate caller stream device matches index.
+  // For now assert index is -1 or 0.
+  ET_CHECK_OR_RETURN_ERROR(
+      index == -1 || index == 0,
+      InvalidArgument,
+      "CudaAllocator::copy_device_to_host only supports device 0 or -1 (current), got %d",
+      static_cast<int>(index));
+
   int prev_device = 0;
   cudaError_t prev_device_err = cudaSuccess;
 
@@ -173,8 +218,17 @@ Error CudaAllocator::copy_device_to_host(
       cudaSetDevice(index);
     }
   }
-
-  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
+  cudaError_t err = cudaSuccess;
+  const auto caller_stream = executorch::extension::cuda::getCallerStream();
+  if (caller_stream) {
+    err = cudaMemcpyAsync(
+        dst, src, nbytes, cudaMemcpyDeviceToHost, *caller_stream);
+    if (err == cudaSuccess) {
+      err = cudaStreamSynchronize(*caller_stream);
+    }
+  } else {
+    err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
+  }
 
   if (index >= 0 && prev_device_err == cudaSuccess) {
     cudaSetDevice(prev_device);
diff --git a/backends/cuda/runtime/test/test_cuda_allocator.cpp b/backends/cuda/runtime/test/test_cuda_allocator.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+#include <vector>
+
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/extension/cuda/caller_stream.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/platform.h>
+
+using executorch::backends::cuda::CudaAllocator;
+using executorch::runtime::Error;
+
+namespace {
+bool cuda_device_available() {
+  int device_count = 0;
+  const cudaError_t err = cudaGetDeviceCount(&device_count);
+  return err == cudaSuccess && device_count > 0;
+}
+} // namespace
+
+class CudaAllocatorTest : public testing::Test {
+ protected:
+  void SetUp() override {
+    if (!cuda_device_available()) {
+      GTEST_SKIP() << "CUDA device unavailable";
+    }
+    et_pal_init();
+  }
+};
+
+TEST_F(CudaAllocatorTest, CopyHostToDevice) {
+  CudaAllocator& a = CudaAllocator::instance();
+  constexpr size_t N = 1024;
+  auto res = a.allocate(N, 0);
+  ASSERT_TRUE(res.ok());
+  void* dptr = res.get();
+
+  std::vector<uint8_t> h_src(N, 42);
+  EXPECT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
+
+  a.deallocate(dptr, 0);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHost) {
+  CudaAllocator& a = CudaAllocator::instance();
+  constexpr size_t N = 1024;
+  auto res = a.allocate(N, 0);
+  ASSERT_TRUE(res.ok());
+  void* dptr = res.get();
+
+  std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
+  ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
+  EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
+  EXPECT_EQ(h_src, h_dst);
+
+  a.deallocate(dptr, 0);
+}
+
+TEST_F(CudaAllocatorTest, CopyHostToDeviceWithCallerStream) {
+  int device = 0;
+  ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
+  ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
+  // TODO: validate caller stream device matches index once CallerStreamGuard
+  // exposes device. For now assert single-GPU case.
+  cudaStream_t s;
+  ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
+  {
+    executorch::extension::cuda::CallerStreamGuard g(s);
+
+    CudaAllocator& a = CudaAllocator::instance();
+    auto res = a.allocate(256, 0);
+    ASSERT_TRUE(res.ok());
+    void* d = res.get();
+    std::vector<uint8_t> h(256, 7);
+    // should take async branch internally, still return Ok
+    EXPECT_EQ(a.copy_host_to_device(d, h.data(), 256, 0), Error::Ok);
+    ASSERT_EQ(cudaStreamSynchronize(s), cudaSuccess);
+    a.deallocate(d, 0);
+  }
+  ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHostWithCallerStream) {
+  int device = 0;
+  ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
+  ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
+  // TODO: validate caller stream device matches index once CallerStreamGuard
+  // exposes device. For now assert single-GPU case.
+  cudaStream_t s;
+  ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
+  {
+    executorch::extension::cuda::CallerStreamGuard g(s);
+
+    CudaAllocator& a = CudaAllocator::instance();
+    auto res = a.allocate(256, 0);
+    ASSERT_TRUE(res.ok());
+    void* d = res.get();
+    std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
+    ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
+    EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
+    EXPECT_EQ(h_src, h_dst);
+
+    a.deallocate(d, 0);
+  }
+  ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess);
+}
+
+TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  // null dst should fail gracefully not CHECK abort
+  std::vector<uint8_t> h(8, 1);
+  Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null dst, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  void* dummy_dst = reinterpret_cast<void*>(0x1);
+  Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null src, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  void* dummy_src = reinterpret_cast<void*>(0x1);
+  Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null dst, got "
+      << static_cast<uint32_t>(e);
+}
+
+TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
+  CudaAllocator& a = CudaAllocator::instance();
+  std::vector<uint8_t> h(8, 1);
+  // null src should fail gracefully not CHECK abort
+  Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
+  EXPECT_EQ(e, Error::InvalidArgument)
+      << "expected InvalidArgument for null src, got "
+      << static_cast<uint32_t>(e);
+}