diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 40f4195744b..e4a13c079a5 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -213,6 +213,40 @@ jobs: pip install gguf python -m pytest examples/models/gemma4_31b/quant/tests/ examples/models/gemma4_31b/tests/ --ignore=examples/models/gemma4_31b/tests/test_mlx_pipeline.py -v -o "addopts=" + unittest-cuda-runtime: + name: unittest-cuda-runtime + needs: [changed-files, run-decision] + if: | + contains(needs.changed-files.outputs.changed-files, 'backends/cuda') || + contains(needs.changed-files.outputs.changed-files, 'backends/aoti') || + contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') || + needs.run-decision.outputs.is-full-run == 'true' + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "13.0" + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + bash ./install_executorch.sh + conda install -y -c conda-forge 'libstdcxx-ng>=12' + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + + cmake --preset llm-release-cuda -DEXECUTORCH_BUILD_TESTS=ON + cmake --build cmake-out --target test_cuda_allocator test_cuda_mutable_state -j$(nproc) + ctest --test-dir cmake-out -R test_cuda_allocator --output-on-failure -V + ctest --test-dir cmake-out -R test_cuda_mutable_state --output-on-failure -V + export-model-cuda-artifact: name: export-model-cuda-artifact # Skip this job if the pull request is from a fork (HuggingFace secrets are not available). diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 2d522f33e28..d0ff05b05ee 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -160,7 +160,7 @@ endif() if(_cuda_is_msvc_toolchain) target_link_libraries( aoti_cuda_shims PRIVATE cuda_platform CUDA::cudart CUDA::curand - ${CMAKE_DL_LIBS} + extension_cuda ${CMAKE_DL_LIBS} ) # Link object library directly so symbols are pulled exactly once while # avoiding duplicate static/object inclusion and interface leakage. @@ -169,8 +169,13 @@ else() target_link_libraries( aoti_cuda_shims PRIVATE cuda_platform - PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive - CUDA::cudart CUDA::curand ${CMAKE_DL_LIBS} + PUBLIC -Wl,--whole-archive + aoti_common_shims_slim + -Wl,--no-whole-archive + CUDA::cudart + CUDA::curand + extension_cuda + ${CMAKE_DL_LIBS} ) endif() @@ -243,6 +248,12 @@ install( if(BUILD_TESTING) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + et_cxx_test( + test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp EXTRA_LIBS + aoti_cuda_backend + ) + target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1) + et_cxx_test( test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp EXTRA_LIBS aoti_cuda_backend diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 348f8a6ba78..122560e98ec 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -92,6 +92,7 @@ runtime.cxx_library( "//executorch/runtime/core:device_allocator", ], deps = [ + "//executorch/extension/cuda:caller_stream", "//executorch/runtime/platform:platform", ], nvcc_flags = get_nvcc_arch_args() + [ @@ -163,3 +164,20 @@ cpp_unittest( platform = "gpu-remote-execution", ), ) + +cpp_unittest( + name = "test_cuda_allocator", + srcs = ["test/test_cuda_allocator.cpp"], + deps = [ + ":cuda_allocator", + "//executorch/extension/cuda:caller_stream", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + external_deps = [("cuda", None, "cuda-lazy")], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), +) diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp index 94294b08fa0..0f1abfe2126 100644 --- a/backends/cuda/runtime/cuda_allocator.cpp +++ b/backends/cuda/runtime/cuda_allocator.cpp @@ -10,6 +10,7 @@ #include +#include #include namespace executorch::backends::cuda { @@ -19,6 +20,85 @@ using executorch::runtime::Result; using executorch::runtime::etensor::DeviceIndex; using executorch::runtime::etensor::DeviceType; +namespace { + +Error copy_impl( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index, + cudaMemcpyKind kind) { + ET_CHECK_OR_RETURN_ERROR( + kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToHost, + InvalidArgument, + "CudaAllocator::copy_impl: unsupported cudaMemcpyKind %d", + static_cast(kind)); + const char* method = kind == cudaMemcpyHostToDevice + ? "CudaAllocator::copy_host_to_device" + : "CudaAllocator::copy_device_to_host"; + ET_CHECK_OR_RETURN_ERROR( + dst != nullptr, InvalidArgument, "%s: dst is null", method); + ET_CHECK_OR_RETURN_ERROR( + src != nullptr, InvalidArgument, "%s: src is null", method); + ET_CHECK_OR_RETURN_ERROR( + index >= -1, + InvalidArgument, + "%s: invalid device index %d (must be >= -1)", + method, + static_cast(index)); + const auto caller_stream = executorch::extension::cuda::getCallerStream(); + if (caller_stream) { + // TODO: validate caller stream device matches index. + // For now assert index is -1 or 0. + ET_CHECK_OR_RETURN_ERROR( + index == -1 || index == 0, + InvalidArgument, + "%s: with caller stream, only supports device 0 or -1 (current), got %d", + method, + static_cast(index)); + } + if (nbytes == 0) { + return Error::Ok; + } + + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + (void)cudaSetDevice(index); + } + } + cudaError_t err = cudaSuccess; + if (caller_stream) { + err = cudaMemcpyAsync(dst, src, nbytes, kind, *caller_stream); + if (err == cudaSuccess && kind == cudaMemcpyDeviceToHost) { + err = cudaStreamSynchronize(*caller_stream); + } + } else { + err = cudaMemcpy(dst, src, nbytes, kind); + } + + if (index >= 0 && prev_device_err == cudaSuccess) { + (void)cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy %s failed: %s (%zu bytes, device %d)", + kind == cudaMemcpyHostToDevice ? "H2D" : "D2H", + cudaGetErrorString(err), + nbytes, + static_cast(index)); + return Error::Internal; + } + return Error::Ok; +} + +} // namespace + Result CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) { // index == -1 means "use the current CUDA device"; any value < -1 is invalid. @@ -124,72 +204,20 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) { } } -// TODO(gasoonjia): Add support for async copy Error CudaAllocator::copy_host_to_device( void* dst, const void* src, size_t nbytes, DeviceIndex index) { - int prev_device = 0; - cudaError_t prev_device_err = cudaSuccess; - - if (index >= 0) { - prev_device_err = cudaGetDevice(&prev_device); - if (prev_device_err == cudaSuccess) { - cudaSetDevice(index); - } - } - - cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); - - if (index >= 0 && prev_device_err == cudaSuccess) { - cudaSetDevice(prev_device); - } - - if (err != cudaSuccess) { - ET_LOG( - Error, - "cudaMemcpy H2D failed: %s (%zu bytes, device %d)", - cudaGetErrorString(err), - nbytes, - static_cast(index)); - return Error::Internal; - } - return Error::Ok; + return copy_impl(dst, src, nbytes, index, cudaMemcpyHostToDevice); } -// TODO(gasoonjia): Add support for async copy Error CudaAllocator::copy_device_to_host( void* dst, const void* src, size_t nbytes, DeviceIndex index) { - int prev_device = 0; - cudaError_t prev_device_err = cudaSuccess; - - if (index >= 0) { - prev_device_err = cudaGetDevice(&prev_device); - if (prev_device_err == cudaSuccess) { - cudaSetDevice(index); - } - } - - cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); - - if (index >= 0 && prev_device_err == cudaSuccess) { - cudaSetDevice(prev_device); - } - - if (err != cudaSuccess) { - ET_LOG( - Error, - "cudaMemcpy D2H failed: %s (%zu bytes, device %d)", - cudaGetErrorString(err), - nbytes, - static_cast(index)); - return Error::Internal; - } - return Error::Ok; + return copy_impl(dst, src, nbytes, index, cudaMemcpyDeviceToHost); } DeviceType CudaAllocator::device_type() const { diff --git a/backends/cuda/runtime/test/test_cuda_allocator.cpp b/backends/cuda/runtime/test/test_cuda_allocator.cpp new file mode 100644 index 00000000000..9bbbed98ca8 --- /dev/null +++ b/backends/cuda/runtime/test/test_cuda_allocator.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include + +using executorch::backends::cuda::CudaAllocator; +using executorch::runtime::Error; + +class CudaAllocatorTest : public testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available"; + } + } +}; + +TEST_F(CudaAllocatorTest, CopyRoundtrip) { + CudaAllocator& a = CudaAllocator::instance(); + constexpr size_t N = 1024; + auto res = a.allocate(N, 0); + ASSERT_TRUE(res.ok()); + void* dptr = res.get(); + + std::vector h_src(N, 42), h_dst(N, 0); + ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok); + EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok); + EXPECT_EQ(h_src, h_dst); + + a.deallocate(dptr, 0); +} + +TEST_F(CudaAllocatorTest, CopyRoundtripWithCallerStream) { + int device = 0; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(device, 0) << "test assumes single GPU device 0"; + // TODO: validate caller stream device matches index once CallerStreamGuard + // exposes device. For now assert single-GPU case. + cudaStream_t s; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + { + executorch::extension::cuda::CallerStreamGuard g(s); + + CudaAllocator& a = CudaAllocator::instance(); + auto res = a.allocate(256, 0); + ASSERT_TRUE(res.ok()); + void* d = res.get(); + std::vector h_src(256, 5), h_dst(256, 0); + ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok); + EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok); + EXPECT_EQ(h_src, h_dst); + EXPECT_EQ(cudaStreamSynchronize(s), cudaSuccess); + + a.deallocate(d, 0); + } + ASSERT_EQ(cudaStreamDestroy(s), cudaSuccess); +} + +TEST_F(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) { + CudaAllocator& a = CudaAllocator::instance(); + // null dst should fail gracefully not CHECK abort + std::vector h(8, 1); + Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null dst, got " + << static_cast(e); +} + +TEST_F(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) { + CudaAllocator& a = CudaAllocator::instance(); + void* dummy_dst = reinterpret_cast(0x1); + Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null src, got " + << static_cast(e); +} + +TEST_F(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) { + CudaAllocator& a = CudaAllocator::instance(); + void* dummy_src = reinterpret_cast(0x1); + Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null dst, got " + << static_cast(e); +} + +TEST_F(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) { + CudaAllocator& a = CudaAllocator::instance(); + std::vector h(8, 1); + // null src should fail gracefully not CHECK abort + Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null src, got " + << static_cast(e); +} diff --git a/extension/cuda/caller_stream.h b/extension/cuda/caller_stream.h index a2341d380cf..a13b7a9b396 100644 --- a/extension/cuda/caller_stream.h +++ b/extension/cuda/caller_stream.h @@ -37,6 +37,9 @@ EXECUTORCH_EXTENSION_CUDA_API std::optional getCallerStream(); * context's SM partition; the confinement rides the stream, so the green * context need not be made current. The caller owns the stream for the guard's * lifetime. + * + * The user is responsible for handling the lifetimes of host data so that it + * lasts until after async operations that use it are completed. */ class EXECUTORCH_EXTENSION_CUDA_API CallerStreamGuard { public: