Skip to content

Commit 0eeabc1

Browse files
stiepan and leofang authored
feat: Introduce StridedLayout, support wrapping external allocations in Buffer, add StridedMemoryView.from_buffer (#1283)
* Add StridedLayout Signed-off-by: Kamil Tokarski <[email protected]>
* Support wrapping ptr in Buffer, create SMV from buffer and layout, dlpack export Signed-off-by: Kamil Tokarski <[email protected]>
* Documentation, linting, minor fixes Signed-off-by: Kamil Tokarski <[email protected]>
* Add NotImplemented copy_from/copy_to Signed-off-by: Kamil Tokarski <[email protected]>
* Adjust flattening scalars to numpy/cupy behavior, fix shape validation in reshape, fix to dense with sliced views Signed-off-by: Kamil Tokarski <[email protected]>
* Add StridedLayout tests Signed-off-by: Kamil Tokarski <[email protected]>
* Use explicit int32_t instead of int in integer fused type Signed-off-by: Kamil Tokarski <[email protected]>
* Disable (for now) exporting the SMV via dlpack Signed-off-by: Kamil Tokarski <[email protected]>
* Revert dlpack changes Signed-off-by: Kamil Tokarski <[email protected]>
* Support layouts up to 64 dims Signed-off-by: Kamil Tokarski <[email protected]>
* Use cydriver to query memory attributes, fix managed memory handling, add tests for the attributes Signed-off-by: Kamil Tokarski <[email protected]>
* Test owner and mr cannot be specified together Signed-off-by: Kamil Tokarski <[email protected]>
* Test Buffer.close with owner Signed-off-by: Kamil Tokarski <[email protected]>
* Add envelope checks (requires_size_in_bytes, offset_bounds) Signed-off-by: Kamil Tokarski <[email protected]>
* Docs, annotation fixes, remove dlpack export mentions Signed-off-by: Kamil Tokarski <[email protected]>
* Add SMV.from_buffer/view tests Signed-off-by: Kamil Tokarski <[email protected]>
* Layout tests for SMV created from CAI Signed-off-by: Kamil Tokarski <[email protected]>
* Fix missing host unregister call in buffer test Signed-off-by: Kamil Tokarski <[email protected]>
* Fix num attrib on re-try Signed-off-by: Kamil Tokarski <[email protected]>
* Call int on the buffer.handle Signed-off-by: Kamil Tokarski <[email protected]>
* Don't enforce Buffer having an owner when creating SMV Signed-off-by: Kamil Tokarski <[email protected]>
* Use np.s_ instead of a custom helper in the tests Signed-off-by: Kamil Tokarski <[email protected]>
* Take lanes into account when computing the itemsize Signed-off-by: Kamil Tokarski <[email protected]>
* Move layout validation out of get_data_ptr helper Signed-off-by: Kamil Tokarski <[email protected]>
* Disambiguate all_axes mask for layout flattening, add range flattening tests Signed-off-by: Kamil Tokarski <[email protected]>
* Bring back the intptr_t in SMV Signed-off-by: Kamil Tokarski <[email protected]>
* Reorder methods, adjust SMV tests to from_dlpack/from_cai methods Signed-off-by: Kamil Tokarski <[email protected]>
* Move the Device import to top-level imports Signed-off-by: Kamil Tokarski <[email protected]>
--------- Signed-off-by: Kamil Tokarski <[email protected]> Co-authored-by: Leo Fang <[email protected]>
1 parent 476cf1d commit 0eeabc1

File tree

15 files changed

+3896
-78
lines changed

15 files changed

+3896
-78
lines changed

cuda_core/cuda/core/experimental/_layout.pxd

Lines changed: 693 additions & 0 deletions
Large diffs are not rendered by default.

cuda_core/cuda/core/experimental/_layout.pyx

Lines changed: 1323 additions & 0 deletions
Large diffs are not rendered by default.

cuda_core/cuda/core/experimental/_memory/_buffer.pxd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,23 @@ from libc.stdint cimport uintptr_t
77
from cuda.core.experimental._stream cimport Stream
88

99

10+
cdef struct _MemAttrs:
    # Memory attributes cached on a Buffer that has no MemoryResource.
    # Device ordinal reported for the pointer; query_memory_attrs stores -1
    # for unregistered host memory.
    int device_id
    # Whether the memory can be accessed by the GPU.
    bint is_device_accessible
    # Whether the memory can be accessed by the CPU.
    bint is_host_accessible
14+
15+
1016
cdef class Buffer:
    # Address of the underlying memory as an integer.
    cdef uintptr_t _ptr
    # Memory size of the buffer.
    cdef size_t _size
    # Memory resource used to deallocate the memory on close (may be None).
    cdef MemoryResource _memory_resource
    # IPC bookkeeping; None unless the buffer was built from an IPC descriptor.
    cdef object _ipc_data
    # Object holding an external allocation that the pointer refers to.
    cdef object _owner
    # The pointer object originally passed in, kept as-is.
    cdef object _ptr_obj
    # Stream used for deallocation when close() is given no stream.
    cdef Stream _alloc_stream
    # Lazily queried memory attributes, valid once _mem_attrs_inited is set.
    cdef _MemAttrs _mem_attrs
    cdef bint _mem_attrs_inited
1827

1928

2029
cdef class MemoryResource:

cuda_core/cuda/core/experimental/_memory/_buffer.pyx

Lines changed: 100 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
cimport cython
78
from libc.stdint cimport uintptr_t, int64_t, uint64_t
89

910
from cuda.bindings cimport cydriver
@@ -18,6 +19,7 @@ from typing import TypeVar, Union
1819

1920
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
2021
from cuda.core.experimental._utils.cuda_utils import driver
22+
from cuda.core.experimental._device import Device
2123

2224
__all__ = ['Buffer', 'MemoryResource']
2325

@@ -47,6 +49,8 @@ cdef class Buffer:
4749
self._ipc_data = None
4850
self._ptr_obj = None
4951
self._alloc_stream = None
52+
self._owner = None
53+
self._mem_attrs_inited = False
5054

5155
def __init__(self, *args, **kwargs):
5256
raise RuntimeError("Buffer objects cannot be instantiated directly. "
@@ -55,15 +59,19 @@ cdef class Buffer:
5559
@classmethod
def _init(
    cls,
    ptr: DevicePointerT,
    size_t size,
    mr: MemoryResource | None = None,
    stream: Stream | None = None,
    ipc_descriptor: IPCBufferDescriptor | None = None,
    owner : object | None = None
):
    """Internal constructor: build a buffer around an existing pointer.

    Parameters
    ----------
    ptr : DevicePointerT
        Pointer object; ``int(ptr)`` is stored as the raw address and the
        object itself is kept alongside it.
    size : int
        Memory size of the buffer.
    mr : MemoryResource, optional
        Memory resource associated with the buffer.
    stream : Stream, optional
        Stream remembered as the allocation stream.
    ipc_descriptor : IPCBufferDescriptor, optional
        When given, it is wrapped in ``IPCDataForBuffer``.
    owner : object, optional
        Object holding the external allocation ``ptr`` points to.

    Raises
    ------
    ValueError
        If both ``mr`` and ``owner`` are specified.
    """
    cdef Buffer buf = Buffer.__new__(cls)
    buf._ptr = <uintptr_t>(int(ptr))
    buf._ptr_obj = ptr
    buf._size = size
    # Memory is released either by a memory resource or by an owner object,
    # never both.
    if owner is not None and mr is not None:
        raise ValueError("owner and memory resource cannot be both specified together")
    buf._memory_resource = mr
    if ipc_descriptor is None:
        buf._ipc_data = None
    else:
        buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True)
    if stream is None:
        buf._alloc_stream = None
    else:
        buf._alloc_stream = <Stream>(stream)
    buf._owner = owner
    return buf
6876

6977
def __dealloc__(self):
@@ -75,7 +83,8 @@ cdef class Buffer:
7583

7684
@staticmethod
7785
def from_handle(
78-
ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None
86+
ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
87+
owner: object | None = None,
7988
) -> Buffer:
8089
"""Create a new :class:`Buffer` object from a pointer.
8190

@@ -87,9 +96,13 @@ cdef class Buffer:
8796
Memory size of the buffer
8897
mr : :obj:`~_memory.MemoryResource`, optional
8998
Memory resource associated with the buffer
99+
owner : object, optional
100+
An object holding external allocation that the ``ptr`` points to.
101+
The reference is kept as long as the buffer is alive.
102+
The ``owner`` and ``mr`` cannot be specified together.
90103
"""
91104
# TODO: It is better to take a stream for later deallocation
92-
return Buffer._init(ptr, size, mr=mr)
105+
return Buffer._init(ptr, size, mr=mr, owner=owner)
93106

94107
@classmethod
95108
def from_ipc_descriptor(
@@ -297,7 +310,9 @@ cdef class Buffer:
297310
"""Return the device ordinal of this buffer."""
298311
if self._memory_resource is not None:
299312
return self._memory_resource.device_id
300-
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
313+
else:
314+
Buffer_init_mem_attrs(self)
315+
return self._mem_attrs.device_id
301316

302317
@property
303318
def handle(self) -> DevicePointerT:
@@ -321,14 +336,18 @@ cdef class Buffer:
321336
"""Return True if this buffer can be accessed by the GPU, otherwise False."""
322337
if self._memory_resource is not None:
323338
return self._memory_resource.is_device_accessible
324-
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
339+
else:
340+
Buffer_init_mem_attrs(self)
341+
return self._mem_attrs.is_device_accessible
325342

326343
@property
def is_host_accessible(self) -> bool:
    """Return True if this buffer can be accessed by the CPU, otherwise False."""
    if self._memory_resource is None:
        # No memory resource to ask: fall back to the (lazily queried and
        # cached) attributes of the pointer itself.
        Buffer_init_mem_attrs(self)
        return self._mem_attrs.is_host_accessible
    return self._memory_resource.is_host_accessible
332351

333352
@property
334353
def is_mapped(self) -> bool:
@@ -346,20 +365,92 @@ cdef class Buffer:
346365
"""Return the memory size of this buffer."""
347366
return self._size
348367

368+
@property
def owner(self) -> object:
    """Return the object holding the external allocation, if any.

    This is the ``owner`` the buffer was created with; it is ``None`` when
    no owner was given and after the buffer has been closed.
    """
    return self._owner
372+
349373

350374
# Buffer Implementation
351375
# ---------------------
352376
cdef inline void Buffer_close(Buffer self, stream):
    # Release the buffer's memory (when it has a memory resource) and drop
    # every reference the buffer holds. Does nothing if the pointer is
    # already null.
    cdef Stream s
    if not self._ptr:
        return
    if self._memory_resource is not None:
        # An explicitly passed stream wins; otherwise fall back to the
        # stream recorded at allocation time.
        if stream is None:
            s = self._alloc_stream
        else:
            s = Stream_accept(stream)
        self._memory_resource.deallocate(self._ptr, self._size, s)
    # Reset the state; clearing _owner drops the reference to an external
    # allocation holder.
    self._ptr = 0
    self._memory_resource = None
    self._owner = None
    self._ptr_obj = None
    self._alloc_stream = None
361387

362388

389+
cdef Buffer_init_mem_attrs(Buffer self):
    # Query the driver for the pointer's memory attributes on first use and
    # cache them on the buffer; later calls are no-ops.
    if self._mem_attrs_inited:
        return
    query_memory_attrs(self._mem_attrs, self._ptr)
    self._mem_attrs_inited = True
393+
394+
395+
cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil:
    # Fill `out` with the host/device accessibility and device ordinal of
    # `ptr`, based on the pointer attributes reported by the driver.
    # Returns 0 on success; raises ValueError for a memory type that is
    # neither 0, host, nor device (and not managed).
    cdef unsigned int memory_type = 0
    cdef int is_managed = 0
    cdef int device_id = 0
    _query_memory_attrs(memory_type, is_managed, device_id, <cydriver.CUdeviceptr>ptr)

    if memory_type == 0:
        # unregistered host pointer
        out.device_id = -1
        out.is_device_accessible = False
        out.is_host_accessible = True
        return 0

    # For managed memory the memory type can be CU_MEMORYTYPE_DEVICE, so the
    # managed flag must be tested before the plain device case; otherwise the
    # memory would be falsely reported as not host accessible.
    if is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST:
        # Pinned memory allocated with cudaMallocHost, or page-locked with
        # cudaHostRegister, reports cydriver.CUmemorytype.CU_MEMORYTYPE_HOST.
        # TODO(ktokarski): In some cases, the registered memory requires
        # using different ptr for device and host, we could check
        # cuMemHostGetDevicePointer and
        # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM
        # to double check the device accessibility.
        out.device_id = device_id
        out.is_device_accessible = True
        out.is_host_accessible = True
        return 0

    if memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
        out.device_id = device_id
        out.is_device_accessible = True
        out.is_host_accessible = False
        return 0

    raise ValueError(f"Unsupported memory type: {memory_type}")
431+
432+
433+
cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil:
    # Fetch the memory type, managed flag and device ordinal of `ptr` with a
    # single cuPointerGetAttributes call, writing the results through the
    # reference parameters. Returns 0 on success; driver errors are
    # reported through HANDLE_RETURN.
    cdef cydriver.CUpointer_attribute attrs[3]
    cdef uintptr_t vals[3]
    cdef cydriver.CUresult ret

    # Each requested attribute is paired with the address of the output
    # variable that receives its value.
    attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
    vals[0] = <uintptr_t><void*>&memory_type
    attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED
    vals[1] = <uintptr_t><void*>&is_managed
    attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
    vals[2] = <uintptr_t><void*>&device_id

    ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
    if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED:
        with cython.gil:
            # Device class handles the cuInit call internally
            Device()
        # NOTE(review): the scraped diff lost its indentation; the retry is
        # assumed to run inside this branch after the GIL block is left —
        # confirm against the repository.
        ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
    HANDLE_RETURN(ret)
    return 0
452+
453+
363454
cdef class MemoryResource:
364455
"""Abstract base class for memory resources that manage allocation and
365456
deallocation of buffers.

0 commit comments

Comments
 (0)