diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 27e76b817a..57eab540da 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -565,6 +565,44 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) { return DevicePtrHandle(box, &box->resource); } +// ============================================================================ +// MemoryResource-owned Device Pointer Handles +// ============================================================================ + +static MRDeallocCallback mr_dealloc_cb = nullptr; + +void register_mr_dealloc_callback(MRDeallocCallback cb) { + mr_dealloc_cb = cb; +} + +DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr) { + if (!mr) { + return deviceptr_create_ref(ptr); + } + // GIL required when mr is provided + GILAcquireGuard gil; + if (!gil.acquired()) { + return deviceptr_create_ref(ptr); + } + Py_INCREF(mr); + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [mr, size](DevicePtrBox* b) { + GILAcquireGuard gil; + if (gil.acquired() && mr_dealloc_cb) { + mr_dealloc_cb(mr, b->resource, size, b->h_stream); + } + // Decref mr even if callback was skipped (prevent leak when + // callback is not registered -- should not happen in practice). + if (gil.acquired()) { + Py_DECREF(mr); + } + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + // ============================================================================ // IPC Pointer Cache // ============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index cb66841172..3628e98de1 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -239,6 +239,23 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); // If owner is nullptr, equivalent to deviceptr_create_ref. DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner); +// Callback type for MemoryResource deallocation. +// Called from the shared_ptr deleter when a handle created via +// deviceptr_create_with_mr is destroyed. The implementation is responsible +// for converting raw C types to Python objects and calling +// mr.deallocate(ptr, size, stream). +using MRDeallocCallback = void (*)(PyObject* mr, CUdeviceptr ptr, + size_t size, const StreamHandle& stream); + +// Register the MR deallocation callback. +void register_mr_dealloc_callback(MRDeallocCallback cb); + +// Create a device pointer handle whose destructor calls mr.deallocate() +// via the registered callback. The mr's refcount is incremented and +// decremented when the handle is released. +// If mr is nullptr, equivalent to deviceptr_create_ref. +DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr); + // Import a device pointer from IPC via cuMemPoolImportPointer. // When the last reference is released, cuMemFreeAsync is called on the stored stream. // Note: Does not yet implement reference counting for nvbug 5570902. diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 03abd913a9..ece900dda8 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -17,12 +17,14 @@ from cuda.core._resource_handles cimport ( DevicePtrHandle, StreamHandle, deviceptr_create_with_owner, + deviceptr_create_with_mr, + register_mr_dealloc_callback, as_intptr, as_cu, set_deallocation_stream, ) -from cuda.core._stream cimport Stream_accept, Stream +from cuda.core._stream cimport Stream, Stream_accept from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import sys @@ -37,6 +39,29 @@ from cuda.core._dlpack import DLDeviceType, make_py_capsule from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device + +# ============================================================================= +# MR deallocation callback (invoked from C++ shared_ptr deleter) +# ============================================================================= + +cdef void _mr_dealloc_callback( + object mr, + cydriver.CUdeviceptr ptr, + size_t size, + const StreamHandle& h_stream, +) noexcept: + """Called by the C++ deleter to deallocate via MemoryResource.deallocate.""" + try: + stream = None + if h_stream: + stream = Stream._from_handle(Stream, h_stream) + mr.deallocate(int(ptr), size, stream) + except Exception: + pass # Cannot propagate exceptions from a C++ destructor + +register_mr_dealloc_callback(_mr_dealloc_callback) + + __all__ = ['Buffer', 'MemoryResource'] @@ -73,18 +98,23 @@ cdef class Buffer: @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, - stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, + ipc_descriptor: IPCBufferDescriptor | None = None, owner : object | None = None ): - """Legacy init for compatibility - creates a non-owning ref handle. + """Create a Buffer from a raw pointer. - Note: The stream parameter is accepted for API compatibility but is - ignored since non-owning refs are never freed by the handle. + When ``mr`` is provided, the buffer takes ownership: ``mr.deallocate()`` + is called when the buffer is closed or garbage collected. When ``owner`` + is provided, the owner is kept alive but no deallocation is performed. """ if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") cdef Buffer self = Buffer.__new__(cls) - self._h_ptr = deviceptr_create_with_owner((int(ptr)), owner) + cdef uintptr_t c_ptr = (int(ptr)) + if mr is not None: + self._h_ptr = deviceptr_create_with_mr(c_ptr, size, mr) + else: + self._h_ptr = deviceptr_create_with_owner(c_ptr, owner) self._size = size self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None @@ -92,8 +122,6 @@ cdef class Buffer: self._mem_attrs_inited = False return self - # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor - def __reduce__(self): # Must not serialize the parent's stream! return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) @@ -112,7 +140,9 @@ cdef class Buffer: size : int Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer + Memory resource associated with the buffer. When provided, + :meth:`MemoryResource.deallocate` is called when the buffer is + closed or garbage collected. owner : object, optional An object holding external allocation that the ``ptr`` points to. The reference is kept as long as the buffer is alive. @@ -120,8 +150,9 @@ cdef class Buffer: Note ---- - This creates a non-owning reference. The pointer will NOT be freed - when the Buffer is closed or garbage collected. + When neither ``mr`` nor ``owner`` is specified, this creates a + non-owning reference. The pointer will NOT be freed when the + :class:`Buffer` is closed or garbage collected. """ return Buffer._init(ptr, size, mr=mr, owner=owner) diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py index 7b8cd6dd56..919c37fc50 100644 --- a/cuda_core/cuda/core/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -51,7 +51,7 @@ def allocate(self, size, stream=None) -> Buffer: raise_if_driver_error(err) else: ptr = 0 - return Buffer._init(ptr, size, self, stream) + return Buffer._init(ptr, size, self) def deallocate(self, ptr: DevicePointerT, size, stream): """Deallocate a buffer previously allocated by this resource. @@ -106,7 +106,7 @@ def allocate(self, size, stream=None) -> Buffer: raise_if_driver_error(err) else: ptr = 0 - return Buffer._init(ptr, size, self, stream) + return Buffer._init(ptr, size, self) def deallocate(self, ptr, size, stream): if stream is not None: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index d573862d16..452568bba4 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -108,6 +108,15 @@ cdef DevicePtrHandle deviceptr_alloc(size_t size) except+ nogil cdef DevicePtrHandle deviceptr_alloc_host(size_t size) except+ nogil cdef DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except+ nogil cdef DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except+ nogil +cdef DevicePtrHandle deviceptr_create_with_mr( + cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil + +# MR deallocation callback type and registration +ctypedef void (*MRDeallocCallback)( + object mr, cydriver.CUdeviceptr ptr, size_t size, + const StreamHandle& stream) noexcept +cdef void register_mr_dealloc_callback(MRDeallocCallback cb) noexcept + cdef DevicePtrHandle deviceptr_import_ipc( const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 2652d4448e..3c9922fdbe 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -92,6 +92,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUdeviceptr ptr) except+ nogil DevicePtrHandle deviceptr_create_with_owner "cuda_core::deviceptr_create_with_owner" ( cydriver.CUdeviceptr ptr, object owner) except+ nogil + + # MR deallocation callback + ctypedef void (*MRDeallocCallback)( + object mr, cydriver.CUdeviceptr ptr, size_t size, + const StreamHandle& stream) noexcept + void register_mr_dealloc_callback "cuda_core::register_mr_dealloc_callback" ( + MRDeallocCallback cb) noexcept + DevicePtrHandle deviceptr_create_with_mr "cuda_core::deviceptr_create_with_mr" ( + cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil + DevicePtrHandle deviceptr_import_ipc "cuda_core::deviceptr_import_ipc" ( const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil StreamHandle deallocation_stream "cuda_core::deallocation_stream" ( diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index 3004cd0d00..68d3161fc3 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -14,6 +14,7 @@ "DummyUnifiedMemoryResource", "make_scratch_buffer", "PatternGen", + "TrackingMR", ] @@ -41,6 +42,37 @@ def device_id(self) -> int: return self.device +class TrackingMR(MemoryResource): + """A MemoryResource that tracks active allocations via a dict. + + Useful for verifying that deallocate is called at the expected time. + """ + + def __init__(self): + self.active = {} + + def allocate(self, size, stream=None): + ptr = handle_return(driver.cuMemAlloc(size)) + self.active[int(ptr)] = size + return Buffer.from_handle(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + del self.active[int(ptr)] + + @property + def is_device_accessible(self): + return True + + @property + def is_host_accessible(self): + return False + + @property + def device_id(self): + return 0 + + class PatternGen: """ Provides methods to fill a target buffer with known test patterns and diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9a88f5f483..181db3fb28 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -39,7 +39,7 @@ from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.utils import StridedMemoryView from helpers import IS_WINDOWS, supports_ipc_mempool -from helpers.buffers import DummyUnifiedMemoryResource +from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR from conftest import ( create_managed_memory_resource_or_skip, @@ -471,6 +471,31 @@ def test_buffer_external_managed(change_device): handle_return(driver.cuMemFree(ptr)) +def test_mr_deallocate_called_on_close(): + """Buffer.from_handle(mr=mr) calls mr.deallocate() on close (issue #1619).""" + device = Device() + device.set_current() + mr = TrackingMR() + buf = mr.allocate(1024) + assert len(mr.active) == 1 + buf.close() + assert len(mr.active) == 0 + + +def test_mr_deallocate_called_on_gc(): + """Buffer.from_handle(mr=mr) calls mr.deallocate() on GC (issue #1619).""" + import gc + + device = Device() + device.set_current() + mr = TrackingMR() + buf = mr.allocate(1024) + assert len(mr.active) == 1 + del buf + gc.collect() + assert len(mr.active) == 0 + + def test_memory_resource_and_owner_disallowed(): with pytest.raises(ValueError, match="cannot be both specified together"): a = (ctypes.c_byte * 20)()