From 0370efd95713f43b8987d14f639c7bda398caf9a Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 17 Mar 2025 12:00:39 -0500
Subject: [PATCH 1/4] Add async view memory resource bindings to Python.

---
 .../cuda_async_view_memory_resource.hpp       |  3 +-
 python/rmm/rmm/librmm/memory_resource.pxd     | 12 +++++-
 python/rmm/rmm/mr.py                          |  4 +-
 python/rmm/rmm/pylibrmm/memory_resource.pxd   |  5 ++-
 python/rmm/rmm/pylibrmm/memory_resource.pyx   | 38 ++++++++++++++++++-
 python/rmm/rmm/tests/test_rmm.py              | 33 ++++++++++++++++
 6 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 92aea2072..615bda124 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -72,7 +72,8 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
    */
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; }
 
-  cuda_async_view_memory_resource() = default;
+  cuda_async_view_memory_resource()  = default;
+  ~cuda_async_view_memory_resource() = default;  ///< @default_destructor
   cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) =
     default;  ///< @default_copy_constructor
   cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) =
diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd
index 9e7b70c4f..26846f702 100644
--- a/python/rmm/rmm/librmm/memory_resource.pxd
+++ b/python/rmm/rmm/librmm/memory_resource.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@ from libcpp.optional cimport optional
 from libcpp.pair cimport pair
 from libcpp.string cimport string
 
+from cuda.bindings.cyruntime cimport cudaMemPool_t
+
 from rmm.librmm.cuda_stream_view cimport cuda_stream_view
 from rmm.librmm.memory_resource cimport device_memory_resource
 
@@ -108,6 +110,14 @@ cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
             optional[size_t] release_threshold,
             optional[allocation_handle_type] export_handle_type) except +
 
+cdef extern from "rmm/mr/device/cuda_async_view_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_view_memory_resource(device_memory_resource):
+        cuda_async_view_memory_resource(
+            cudaMemPool_t valid_pool_handle) except +
+        cudaMemPool_t pool_handle() const
+
 # TODO: when we adopt Cython 3.0 use enum class
 cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
         namespace \
diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py
index 673ffde82..eada9b21b 100644
--- a/python/rmm/rmm/mr.py
+++ b/python/rmm/rmm/mr.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
     BinningMemoryResource,
     CallbackMemoryResource,
     CudaAsyncMemoryResource,
+    CudaAsyncViewMemoryResource,
     CudaMemoryResource,
     DeviceMemoryResource,
     FailureCallbackResourceAdaptor,
@@ -50,6 +51,7 @@
     "BinningMemoryResource",
     "CallbackMemoryResource",
     "CudaAsyncMemoryResource",
+    "CudaAsyncViewMemoryResource",
     "CudaMemoryResource",
     "DeviceMemoryResource",
     "FailureCallbackResourceAdaptor",
diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd
index d1e5610db..8ccd07a02 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource.pxd
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,6 +44,9 @@ cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
 cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
     pass
 
+cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource):
+    pass
+
 cdef class PoolMemoryResource(UpstreamResourceAdaptor):
     pass
 
diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx
index 0189a58b5..4efdbff18 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource.pyx
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx
@@ -28,7 +28,8 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.optional cimport optional
 from libcpp.pair cimport pair
 
-from cuda.bindings.runtime import cudaError_t
+from cuda.bindings cimport cyruntime
+from cuda.bindings import runtime
 
 from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
 
@@ -54,6 +55,7 @@ from rmm.librmm.memory_resource cimport (
     binning_memory_resource,
     callback_memory_resource,
     cuda_async_memory_resource,
+    cuda_async_view_memory_resource,
     cuda_memory_resource,
     deallocate_callback_t,
     device_memory_resource,
@@ -203,6 +205,38 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
         )
 
 
+cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource):
+    """
+    Memory resource that uses ``cudaMallocAsync``/``cudaFreeAsync`` for
+    allocation/deallocation with an existing CUDA memory pool.
+
+    This resource uses an existing CUDA memory pool handle (such as the default pool)
+    instead of creating a new one. This is useful for integrating with existing GPU
+    applications that already use a CUDA memory pool, or customizing the flags
+    used by the memory pool.
+
+    Parameters
+    ----------
+    valid_pool_handle : cudaMemPool_t
+        Handle to a CUDA memory pool which will be used to serve allocation
+        requests.
+    """
+    def __cinit__(
+        self,
+        valid_pool_handle
+    ):
+        cdef cyruntime.cudaMemPool_t c_memory_pool_handle = \
+            <cyruntime.cudaMemPool_t>valid_pool_handle
+        self.c_obj.reset(
+            new cuda_async_view_memory_resource(c_memory_pool_handle)
+        )
+
+    def pool_handle(self):
+        cdef cuda_async_view_memory_resource* c_mr = \
+            <cuda_async_view_memory_resource*>self.c_obj.get()
+        return <uintptr_t>c_mr.pool_handle()
+
+
 cdef class ManagedMemoryResource(DeviceMemoryResource):
     def __cinit__(self):
         self.c_obj.reset(
@@ -991,7 +1025,7 @@ cpdef void _initialize(
     try:
         original_device = getDevice()
     except CUDARuntimeError as e:
-        if e.status == cudaError_t.cudaErrorNoDevice:
+        if e.status == runtime.cudaError_t.cudaErrorNoDevice:
             warnings.warn(e.msg)
         else:
             raise e
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index ee02d5d0e..f531ed763 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -1078,3 +1078,36 @@ def test_available_device_memory():
     assert initial_memory[1] == final_memory[1]
     assert initial_memory[0] > 0
     assert final_memory[0] > 0
+
+
+@pytest.mark.parametrize("dtype", _dtypes)
+@pytest.mark.parametrize("nelem", _nelems)
+@pytest.mark.parametrize("alloc", _allocs)
+def test_cuda_async_view_memory_resource_default_pool(dtype, nelem, alloc):
+    # Get the default memory pool handle
+    current_device = rmm._cuda.gpu.getDevice()
+    err, pool = runtime.cudaDeviceGetDefaultMemPool(current_device)
+    assert err == runtime.cudaError_t.cudaSuccess
+
+    mr = rmm.mr.CudaAsyncViewMemoryResource(pool)
+    rmm.mr.set_current_device_resource(mr)
+    assert rmm.mr.get_current_device_resource_type() is type(mr)
+    array_tester(dtype, nelem, alloc)
+
+
+@pytest.mark.parametrize("dtype", _dtypes)
+@pytest.mark.parametrize("nelem", _nelems)
+@pytest.mark.parametrize("alloc", _allocs)
+def test_cuda_async_view_memory_resource_custom_pool(dtype, nelem, alloc):
+    # Create a memory pool handle
+    props = runtime.cudaMemPoolProps()
+    props.allocType = runtime.cudaMemAllocationType.cudaMemAllocationTypePinned
+    props.location.id = rmm._cuda.gpu.getDevice()
+    props.location.type = runtime.cudaMemLocationType.cudaMemLocationTypeDevice
+    err, pool = runtime.cudaMemPoolCreate(props)
+    assert err == runtime.cudaError_t.cudaSuccess
+
+    mr = rmm.mr.CudaAsyncViewMemoryResource(pool)
+    rmm.mr.set_current_device_resource(mr)
+    assert rmm.mr.get_current_device_resource_type() is type(mr)
+    array_tester(dtype, nelem, alloc)

From b34ea3b34110e5b9107102c70d0430bf9414eca0 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 17 Mar 2025 12:43:01 -0500
Subject: [PATCH 2/4] Remove docs on destructor.

---
 include/rmm/mr/device/cuda_async_view_memory_resource.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 615bda124..240fdb223 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -73,7 +73,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; }
 
   cuda_async_view_memory_resource()  = default;
-  ~cuda_async_view_memory_resource() = default;  ///< @default_destructor
+  ~cuda_async_view_memory_resource() = default;
   cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) =
     default;  ///< @default_copy_constructor
   cuda_async_view_memory_resource(cuda_async_view_memory_resource&&) =

From 88a2f71fb81d457e9e04c486162a116461ed5166 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 17 Mar 2025 15:26:31 -0500
Subject: [PATCH 3/4] Fix cudaMemPool_t conversions.

---
 python/rmm/rmm/pylibrmm/memory_resource.pyx | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx
index 4efdbff18..14d5da105 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource.pyx
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx
@@ -29,7 +29,7 @@ from libcpp.optional cimport optional
 from libcpp.pair cimport pair
 
 from cuda.bindings cimport cyruntime
-from cuda.bindings import runtime
+from cuda.bindings import driver, runtime
 
 from rmm._cuda.gpu import CUDARuntimeError, getDevice, setDevice
 
@@ -225,10 +225,17 @@ cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource):
         self,
         valid_pool_handle
     ):
-        cdef cyruntime.cudaMemPool_t c_memory_pool_handle = \
-            <cyruntime.cudaMemPool_t>valid_pool_handle
+        # Convert the valid_pool_handle to a cyruntime.cudaMemPool_t
+        cdef cyruntime.cudaMemPool_t c_valid_pool_handle
+        if isinstance(valid_pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)):
+            raw_pool_handle = int(valid_pool_handle)
+            c_valid_pool_handle = <cyruntime.cudaMemPool_t><uintptr_t>raw_pool_handle
+        else:
+            raw_pool_handle = int(runtime.cudaMemPool_t(valid_pool_handle))
+            c_valid_pool_handle = <cyruntime.cudaMemPool_t><uintptr_t>raw_pool_handle
+
         self.c_obj.reset(
-            new cuda_async_view_memory_resource(c_memory_pool_handle)
+            new cuda_async_view_memory_resource(c_valid_pool_handle)
         )
 
     def pool_handle(self):

From b7e899d62127f06b6729ed1426869a255a4fa539 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 18 Mar 2025 17:28:57 -0500
Subject: [PATCH 4/4] Address review: rename pool_handle arg, require typed handle.

---
 .../cuda_async_view_memory_resource.hpp       | 10 ++++----
 python/rmm/rmm/librmm/memory_resource.pxd     |  2 +-
 python/rmm/rmm/pylibrmm/memory_resource.pyx   | 23 ++++++++++---------
 python/rmm/rmm/tests/test_rmm.py              |  6 +++++
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 240fdb223..0ef159c9f 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -47,13 +47,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
    *
    * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync`
    *
-   * @param valid_pool_handle Handle to a CUDA memory pool which will be used to
+   * @param pool_handle Handle to a CUDA memory pool which will be used to
    * serve allocation requests.
    */
-  cuda_async_view_memory_resource(cudaMemPool_t valid_pool_handle)
-    : cuda_pool_handle_{[valid_pool_handle]() {
-        RMM_EXPECTS(nullptr != valid_pool_handle, "Unexpected null pool handle.");
-        return valid_pool_handle;
+  cuda_async_view_memory_resource(cudaMemPool_t pool_handle)
+    : cuda_pool_handle_{[pool_handle]() {
+        RMM_EXPECTS(nullptr != pool_handle, "Unexpected null pool handle.");
+        return pool_handle;
       }()}
   {
     // Check if cudaMallocAsync Memory pool supported
diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd
index 26846f702..3ded7a9be 100644
--- a/python/rmm/rmm/librmm/memory_resource.pxd
+++ b/python/rmm/rmm/librmm/memory_resource.pxd
@@ -115,7 +115,7 @@ cdef extern from "rmm/mr/device/cuda_async_view_memory_resource.hpp" \
 
     cdef cppclass cuda_async_view_memory_resource(device_memory_resource):
         cuda_async_view_memory_resource(
-            cudaMemPool_t valid_pool_handle) except +
+            cudaMemPool_t pool_handle) except +
         cudaMemPool_t pool_handle() const
 
 # TODO: when we adopt Cython 3.0 use enum class
diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx
index 14d5da105..253352e7c 100644
--- a/python/rmm/rmm/pylibrmm/memory_resource.pyx
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx
@@ -215,27 +215,28 @@ cdef class CudaAsyncViewMemoryResource(DeviceMemoryResource):
     applications that already use a CUDA memory pool, or customizing the flags
     used by the memory pool.
 
+    The memory pool passed in must not be destroyed during the lifetime of this
+    memory resource.
+
     Parameters
     ----------
-    valid_pool_handle : cudaMemPool_t
+    pool_handle : cudaMemPool_t or CUmemoryPool
         Handle to a CUDA memory pool which will be used to serve allocation
         requests.
     """
     def __cinit__(
         self,
-        valid_pool_handle
+        pool_handle
     ):
-        # Convert the valid_pool_handle to a cyruntime.cudaMemPool_t
-        cdef cyruntime.cudaMemPool_t c_valid_pool_handle
-        if isinstance(valid_pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)):
-            raw_pool_handle = int(valid_pool_handle)
-            c_valid_pool_handle = <cyruntime.cudaMemPool_t><uintptr_t>raw_pool_handle
-        else:
-            raw_pool_handle = int(runtime.cudaMemPool_t(valid_pool_handle))
-            c_valid_pool_handle = <cyruntime.cudaMemPool_t><uintptr_t>raw_pool_handle
+        # Validate pool_handle, then convert it to a cyruntime.cudaMemPool_t
+        if not isinstance(pool_handle, (runtime.cudaMemPool_t, driver.CUmemoryPool)):
+            raise ValueError("pool_handle must be a cudaMemPool_t or CUmemoryPool")
+
+        cdef cyruntime.cudaMemPool_t c_pool_handle
+        c_pool_handle = <cyruntime.cudaMemPool_t><uintptr_t>int(pool_handle)
 
         self.c_obj.reset(
-            new cuda_async_view_memory_resource(c_valid_pool_handle)
+            new cuda_async_view_memory_resource(c_pool_handle)
         )
 
     def pool_handle(self):
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index f531ed763..2fc917863 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -1111,3 +1111,9 @@ def test_cuda_async_view_memory_resource_custom_pool(dtype, nelem, alloc):
     rmm.mr.set_current_device_resource(mr)
     assert rmm.mr.get_current_device_resource_type() is type(mr)
     array_tester(dtype, nelem, alloc)
+
+    # After the pool is destroyed, new allocations should raise
+    (err,) = runtime.cudaMemPoolDestroy(pool)
+    assert err == runtime.cudaError_t.cudaSuccess
+    with pytest.raises(MemoryError):
+        array_tester(dtype, nelem, alloc)