From 0370efd95713f43b8987d14f639c7bda398caf9a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 17 Mar 2025 12:00:39 -0500 Subject: [PATCH 1/4] Add async view memory resource bindings to Python. --- .../cuda_async_view_memory_resource.hpp | 3 +- python/rmm/rmm/librmm/memory_resource.pxd | 12 +++++- python/rmm/rmm/mr.py | 4 +- python/rmm/rmm/pylibrmm/memory_resource.pxd | 5 ++- python/rmm/rmm/pylibrmm/memory_resource.pyx | 38 ++++++++++++++++++- python/rmm/rmm/tests/test_rmm.py | 33 ++++++++++++++++ 6 files changed, 89 insertions(+), 6 deletions(-) diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 92aea2072..615bda124 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -72,7 +72,8 @@ class cuda_async_view_memory_resource final : public device_memory_resource { */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } - cuda_async_view_memory_resource() = default; + cuda_async_view_memory_resource() = default; + ~cuda_async_view_memory_resource() = default; ///< @default_destructor cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = default; ///< @default_copy_constructor cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) = diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 9e7b70c4f..26846f702 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ from libcpp.optional cimport optional from libcpp.pair cimport pair from libcpp.string cimport string +from cuda.bindings.cyruntime cimport cudaMemPool_t + from rmm.librmm.cuda_stream_view cimport cuda_stream_view from rmm.librmm.memory_resource cimport device_memory_resource @@ -108,6 +110,14 @@ cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ optional[size_t] release_threshold, optional[allocation_handle_type] export_handle_type) except + +cdef extern from "rmm/mr/device/cuda_async_view_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + + cdef cppclass cuda_async_view_memory_resource(device_memory_resource): + cuda_async_view_memory_resource( + cudaMemPool_t valid_pool_handle) except + + cudaMemPool_t pool_handle() const + # TODO: when we adopt Cython 3.0 use enum class cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \ namespace \ diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 673ffde82..eada9b21b 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, + CudaAsyncViewMemoryResource, CudaMemoryResource, DeviceMemoryResource, FailureCallbackResourceAdaptor, @@ -50,6 +51,7 @@ "BinningMemoryResource", "CallbackMemoryResource", "CudaAsyncMemoryResource", + "CudaAsyncViewMemoryResource", "CudaMemoryResource", "DeviceMemoryResource", "FailureCallbackResourceAdaptor", diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd index d1e5610db..8ccd07a02 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pxd +++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,6 +44,9 @@ cdef class SamHeadroomMemoryResource(DeviceMemoryResource): cdef class CudaAsyncMemoryResource(DeviceMemoryResource): pass +cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource): + pass + cdef class PoolMemoryResource(UpstreamResourceAdaptor): pass diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 0189a58b5..4efdbff18 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -28,7 +28,8 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.optional cimport optional from libcpp.pair cimport pair -from cuda.bindings.runtime import cudaError_t +from cuda.bindings cimport cyruntime +from cuda.bindings import runtime from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice @@ -54,6 +55,7 @@ from rmm.librmm.memory_resource cimport ( binning_memory_resource, callback_memory_resource, cuda_async_memory_resource, + cuda_async_view_memory_resource, cuda_memory_resource, deallocate_callback_t, device_memory_resource, @@ -203,6 +205,38 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource): ) +cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource): + """ + Memory resource that uses ``cudaMallocAsync``/``cudaFreeAsync`` for + allocation/deallocation with an existing CUDA memory pool. + + This resource uses an existing CUDA memory pool handle (such as the default pool) + instead of creating a new one. This is useful for integrating with existing GPU + applications that already use a CUDA memory pool, or customizing the flags + used by the memory pool. + + Parameters + ---------- + valid_pool_handle : cudaMemPool_t + Handle to a CUDA memory pool which will be used to serve allocation + requests. + """ + def __cinit__( + self, + valid_pool_handle + ): + cdef cyruntime.cudaMemPool_t c_memory_pool_handle = \ + valid_pool_handle + self.c_obj.reset( + new cuda_async_view_memory_resource(c_memory_pool_handle) + ) + + def pool_handle(self): + cdef cuda_async_view_memory_resource* c_mr = \ + self.c_obj.get() + return c_mr.pool_handle() + + cdef class ManagedMemoryResource(DeviceMemoryResource): def __cinit__(self): self.c_obj.reset( @@ -991,7 +1025,7 @@ cpdef void _initialize( try: original_device = getDevice() except CUDARuntimeError as e: - if e.status == cudaError_t.cudaErrorNoDevice: + if e.status == runtime.cudaError_t.cudaErrorNoDevice: warnings.warn(e.msg) else: raise e diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index ee02d5d0e..f531ed763 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -1078,3 +1078,36 @@ def test_available_device_memory(): assert initial_memory[1] == final_memory[1] assert initial_memory[0] > 0 assert final_memory[0] > 0 + + +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +def test_cuda_async_view_memory_resource_default_pool(dtype, nelem, alloc): + # Get the default memory pool handle + current_device = rmm._cuda.gpu.getDevice() + err, pool = runtime.cudaDeviceGetDefaultMemPool(current_device) + assert err == runtime.cudaError_t.cudaSuccess + + mr = rmm.mr.CudaAsyncViewMemoryResource(pool) + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +def test_cuda_async_view_memory_resource_custom_pool(dtype, nelem, alloc): + # Create a memory pool handle + props = runtime.cudaMemPoolProps() + props.allocType = runtime.cudaMemAllocationType.cudaMemAllocationTypePinned + props.location.id = rmm._cuda.gpu.getDevice() + props.location.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice + err, pool = runtime.cudaMemPoolCreate(props) + assert err == runtime.cudaError_t.cudaSuccess + + mr = rmm.mr.CudaAsyncViewMemoryResource(pool) + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) From b34ea3b34110e5b9107102c70d0430bf9414eca0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 17 Mar 2025 12:43:01 -0500 Subject: [PATCH 2/4] Remove docs on destructor. --- include/rmm/mr/device/cuda_async_view_memory_resource.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 615bda124..240fdb223 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -73,7 +73,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } cuda_async_view_memory_resource() = default; - ~cuda_async_view_memory_resource() = default; ///< @default_destructor + ~cuda_async_view_memory_resource() = default; cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = default; ///< @default_copy_constructor cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) = From 88a2f71fb81d457e9e04c486162a116461ed5166 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 17 Mar 2025 15:26:31 -0500 Subject: [PATCH 3/4] Fix cudaMemPool_t conversions. --- python/rmm/rmm/pylibrmm/memory_resource.pyx | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 4efdbff18..14d5da105 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -29,7 +29,7 @@ from libcpp.optional cimport optional from libcpp.pair cimport pair from cuda.bindings cimport cyruntime -from cuda.bindings import runtime +from cuda.bindings import driver, runtime from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice @@ -225,10 +225,17 @@ cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource): self, valid_pool_handle ): - cdef cyruntime.cudaMemPool_t c_memory_pool_handle = \ - valid_pool_handle + # Convert the valid_pool_handle to a cyruntime.cudaMemPool_t + cdef cyruntime.cudaMemPool_t c_valid_pool_handle + if isinstance(valid_pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)): + raw_pool_handle = int(valid_pool_handle) + c_valid_pool_handle = raw_pool_handle + else: + raw_pool_handle = int(runtime.cudaMemPool_t(valid_pool_handle)) + c_valid_pool_handle = raw_pool_handle + self.c_obj.reset( - new cuda_async_view_memory_resource(c_memory_pool_handle) + new cuda_async_view_memory_resource(c_valid_pool_handle) ) def pool_handle(self): From b7e899d62127f06b6729ed1426869a255a4fa539 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 18 Mar 2025 17:28:57 -0500 Subject: [PATCH 4/4] Review feedback. --- .../cuda_async_view_memory_resource.hpp | 10 ++++---- python/rmm/rmm/librmm/memory_resource.pxd | 2 +- python/rmm/rmm/pylibrmm/memory_resource.pyx | 23 ++++++++++--------- python/rmm/rmm/tests/test_rmm.py | 6 +++++ 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 240fdb223..0ef159c9f 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -47,13 +47,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource { * * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync` * - * @param valid_pool_handle Handle to a CUDA memory pool which will be used to + * @param pool_handle Handle to a CUDA memory pool which will be used to * serve allocation requests. */ - cuda_async_view_memory_resource(cudaMemPool_t valid_pool_handle) - : cuda_pool_handle_{[valid_pool_handle]() { - RMM_EXPECTS(nullptr != valid_pool_handle, "Unexpected null pool handle."); - return valid_pool_handle; + cuda_async_view_memory_resource(cudaMemPool_t pool_handle) + : cuda_pool_handle_{[pool_handle]() { + RMM_EXPECTS(nullptr != pool_handle, "Unexpected null pool handle."); + return pool_handle; }()} { // Check if cudaMallocAsync Memory pool supported diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 26846f702..3ded7a9be 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -115,7 +115,7 @@ cdef extern from "rmm/mr/device/cuda_async_view_memory_resource.hpp" \ cdef cppclass cuda_async_view_memory_resource(device_memory_resource): cuda_async_view_memory_resource( - cudaMemPool_t valid_pool_handle) except + + cudaMemPool_t pool_handle) except + cudaMemPool_t pool_handle() const # TODO: when we adopt Cython 3.0 use enum class diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 14d5da105..253352e7c 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -215,27 +215,28 @@ cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource): applications that already use a CUDA memory pool, or customizing the flags used by the memory pool. + The memory pool passed in must not be destroyed during the lifetime of this + memory resource. + Parameters ---------- - valid_pool_handle : cudaMemPool_t + pool_handle : cudaMemPool_t or CUmemoryPool Handle to a CUDA memory pool which will be used to serve allocation requests. """ def __cinit__( self, - valid_pool_handle + pool_handle ): - # Convert the valid_pool_handle to a cyruntime.cudaMemPool_t - cdef cyruntime.cudaMemPool_t c_valid_pool_handle - if isinstance(valid_pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)): - raw_pool_handle = int(valid_pool_handle) - c_valid_pool_handle = raw_pool_handle - else: - raw_pool_handle = int(runtime.cudaMemPool_t(valid_pool_handle)) - c_valid_pool_handle = raw_pool_handle + # Convert the pool_handle to a cyruntime.cudaMemPool_t + if not isinstance(pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)): + raise ValueError("pool_handle must be a cudaMemPool_t or CUmemoryPool") + + cdef cyruntime.cudaMemPool_t c_pool_handle + c_pool_handle = int(pool_handle) self.c_obj.reset( - new cuda_async_view_memory_resource(c_valid_pool_handle) + new cuda_async_view_memory_resource(c_pool_handle) ) def pool_handle(self): diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index f531ed763..2fc917863 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -1111,3 +1111,9 @@ def test_cuda_async_view_memory_resource_custom_pool(dtype, nelem, alloc): rmm.mr.set_current_device_resource(mr) assert rmm.mr.get_current_device_resource_type() is type(mr) array_tester(dtype, nelem, alloc) + + # After the pool is destroyed, new allocations should raise + (err,) = runtime.cudaMemPoolDestroy(pool) + assert err == runtime.cudaError_t.cudaSuccess + with pytest.raises(MemoryError): + array_tester(dtype, nelem, alloc)