Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,44 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
return DevicePtrHandle(box, &box->resource);
}

// ============================================================================
// MemoryResource-owned Device Pointer Handles
// ============================================================================

static MRDeallocCallback mr_dealloc_cb = nullptr;

void register_mr_dealloc_callback(MRDeallocCallback cb) {
mr_dealloc_cb = cb;
}

DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr) {
if (!mr) {
return deviceptr_create_ref(ptr);
}
// GIL required when mr is provided
GILAcquireGuard gil;
if (!gil.acquired()) {
return deviceptr_create_ref(ptr);
}
Py_INCREF(mr);
auto box = std::shared_ptr<DevicePtrBox>(
new DevicePtrBox{ptr, StreamHandle{}},
[mr, size](DevicePtrBox* b) {
GILAcquireGuard gil;
if (gil.acquired() && mr_dealloc_cb) {
mr_dealloc_cb(mr, b->resource, size, b->h_stream);
}
// Decref mr even if callback was skipped (prevent leak when
// callback is not registered -- should not happen in practice).
if (gil.acquired()) {
Py_DECREF(mr);
}
delete b;
}
);
return DevicePtrHandle(box, &box->resource);
}

// ============================================================================
// IPC Pointer Cache
// ============================================================================
Expand Down
17 changes: 17 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,23 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr);
// If owner is nullptr, equivalent to deviceptr_create_ref.
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner);

// Callback type for MemoryResource deallocation.
// Called from the shared_ptr deleter when a handle created via
// deviceptr_create_with_mr is destroyed. The implementation is responsible
// for converting raw C types to Python objects and calling
// mr.deallocate(ptr, size, stream).
using MRDeallocCallback = void (*)(PyObject* mr, CUdeviceptr ptr,
size_t size, const StreamHandle& stream);

// Register the MR deallocation callback.
void register_mr_dealloc_callback(MRDeallocCallback cb);

// Create a device pointer handle whose destructor calls mr.deallocate()
// via the registered callback. The mr's refcount is incremented and
// decremented when the handle is released.
// If mr is nullptr, equivalent to deviceptr_create_ref.
DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr);

// Import a device pointer from IPC via cuMemPoolImportPointer.
// When the last reference is released, cuMemFreeAsync is called on the stored stream.
// Note: Does not yet implement reference counting for nvbug 5570902.
Expand Down
53 changes: 42 additions & 11 deletions cuda_core/cuda/core/_memory/_buffer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@ from cuda.core._resource_handles cimport (
DevicePtrHandle,
StreamHandle,
deviceptr_create_with_owner,
deviceptr_create_with_mr,
register_mr_dealloc_callback,
as_intptr,
as_cu,
set_deallocation_stream,
)

from cuda.core._stream cimport Stream_accept, Stream
from cuda.core._stream cimport Stream, Stream_accept
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

import sys
Expand All @@ -37,6 +39,29 @@ from cuda.core._dlpack import DLDeviceType, make_py_capsule
from cuda.core._utils.cuda_utils import driver
from cuda.core._device import Device


# =============================================================================
# MR deallocation callback (invoked from C++ shared_ptr deleter)
# =============================================================================

cdef void _mr_dealloc_callback(
object mr,
cydriver.CUdeviceptr ptr,
size_t size,
const StreamHandle& h_stream,
) noexcept:
"""Called by the C++ deleter to deallocate via MemoryResource.deallocate."""
try:
stream = None
if h_stream:
stream = Stream._from_handle(Stream, h_stream)
mr.deallocate(int(ptr), size, stream)
except Exception:
pass # Cannot propagate exceptions from a C++ destructor

register_mr_dealloc_callback(_mr_dealloc_callback)


__all__ = ['Buffer', 'MemoryResource']


Expand Down Expand Up @@ -73,27 +98,30 @@ cdef class Buffer:
@classmethod
def _init(
cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None,
ipc_descriptor: IPCBufferDescriptor | None = None,
owner : object | None = None
):
"""Legacy init for compatibility - creates a non-owning ref handle.
"""Create a Buffer from a raw pointer.

Note: The stream parameter is accepted for API compatibility but is
ignored since non-owning refs are never freed by the handle.
When ``mr`` is provided, the buffer takes ownership: ``mr.deallocate()``
is called when the buffer is closed or garbage collected. When ``owner``
is provided, the owner is kept alive but no deallocation is performed.
"""
if mr is not None and owner is not None:
raise ValueError("owner and memory resource cannot be both specified together")
cdef Buffer self = Buffer.__new__(cls)
self._h_ptr = deviceptr_create_with_owner(<uintptr_t>(int(ptr)), owner)
cdef uintptr_t c_ptr = <uintptr_t>(int(ptr))
if mr is not None:
self._h_ptr = deviceptr_create_with_mr(c_ptr, size, mr)
else:
self._h_ptr = deviceptr_create_with_owner(c_ptr, owner)
self._size = size
self._memory_resource = mr
self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
self._owner = owner
self._mem_attrs_inited = False
return self

# No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor

def __reduce__(self):
# Must not serialize the parent's stream!
return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor())
Expand All @@ -112,16 +140,19 @@ cdef class Buffer:
size : int
Memory size of the buffer
mr : :obj:`~_memory.MemoryResource`, optional
Memory resource associated with the buffer
Memory resource associated with the buffer. When provided,
:meth:`MemoryResource.deallocate` is called when the buffer is
closed or garbage collected.
owner : object, optional
An object holding external allocation that the ``ptr`` points to.
The reference is kept as long as the buffer is alive.
The ``owner`` and ``mr`` cannot be specified together.

Note
----
This creates a non-owning reference. The pointer will NOT be freed
when the Buffer is closed or garbage collected.
When neither ``mr`` nor ``owner`` is specified, this creates a
non-owning reference. The pointer will NOT be freed when the
:class:`Buffer` is closed or garbage collected.
"""
return Buffer._init(ptr, size, mr=mr, owner=owner)

Expand Down
4 changes: 2 additions & 2 deletions cuda_core/cuda/core/_memory/_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def allocate(self, size, stream=None) -> Buffer:
raise_if_driver_error(err)
else:
ptr = 0
return Buffer._init(ptr, size, self, stream)
return Buffer._init(ptr, size, self)

def deallocate(self, ptr: DevicePointerT, size, stream):
"""Deallocate a buffer previously allocated by this resource.
Expand Down Expand Up @@ -106,7 +106,7 @@ def allocate(self, size, stream=None) -> Buffer:
raise_if_driver_error(err)
else:
ptr = 0
return Buffer._init(ptr, size, self, stream)
return Buffer._init(ptr, size, self)

def deallocate(self, ptr, size, stream):
if stream is not None:
Expand Down
9 changes: 9 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ cdef DevicePtrHandle deviceptr_alloc(size_t size) except+ nogil
cdef DevicePtrHandle deviceptr_alloc_host(size_t size) except+ nogil
cdef DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except+ nogil
cdef DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except+ nogil
cdef DevicePtrHandle deviceptr_create_with_mr(
cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil

# MR deallocation callback type and registration
ctypedef void (*MRDeallocCallback)(
object mr, cydriver.CUdeviceptr ptr, size_t size,
const StreamHandle& stream) noexcept
cdef void register_mr_dealloc_callback(MRDeallocCallback cb) noexcept

cdef DevicePtrHandle deviceptr_import_ipc(
const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil
cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil
Expand Down
10 changes: 10 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
cydriver.CUdeviceptr ptr) except+ nogil
DevicePtrHandle deviceptr_create_with_owner "cuda_core::deviceptr_create_with_owner" (
cydriver.CUdeviceptr ptr, object owner) except+ nogil

# MR deallocation callback
ctypedef void (*MRDeallocCallback)(
object mr, cydriver.CUdeviceptr ptr, size_t size,
const StreamHandle& stream) noexcept
void register_mr_dealloc_callback "cuda_core::register_mr_dealloc_callback" (
MRDeallocCallback cb) noexcept
DevicePtrHandle deviceptr_create_with_mr "cuda_core::deviceptr_create_with_mr" (
cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil

DevicePtrHandle deviceptr_import_ipc "cuda_core::deviceptr_import_ipc" (
const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil
StreamHandle deallocation_stream "cuda_core::deallocation_stream" (
Expand Down
32 changes: 32 additions & 0 deletions cuda_core/tests/helpers/buffers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"DummyUnifiedMemoryResource",
"make_scratch_buffer",
"PatternGen",
"TrackingMR",
]


Expand Down Expand Up @@ -41,6 +42,37 @@ def device_id(self) -> int:
return self.device


class TrackingMR(MemoryResource):
"""A MemoryResource that tracks active allocations via a dict.

Useful for verifying that deallocate is called at the expected time.
"""

def __init__(self):
self.active = {}

def allocate(self, size, stream=None):
ptr = handle_return(driver.cuMemAlloc(size))
self.active[int(ptr)] = size
return Buffer.from_handle(ptr=ptr, size=size, mr=self)

def deallocate(self, ptr, size, stream=None):
handle_return(driver.cuMemFree(ptr))
del self.active[int(ptr)]

@property
def is_device_accessible(self):
return True

@property
def is_host_accessible(self):
return False

@property
def device_id(self):
return 0


class PatternGen:
"""
Provides methods to fill a target buffer with known test patterns and
Expand Down
27 changes: 26 additions & 1 deletion cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from cuda.core._utils.cuda_utils import CUDAError, handle_return
from cuda.core.utils import StridedMemoryView
from helpers import IS_WINDOWS, supports_ipc_mempool
from helpers.buffers import DummyUnifiedMemoryResource
from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR

from conftest import (
create_managed_memory_resource_or_skip,
Expand Down Expand Up @@ -471,6 +471,31 @@ def test_buffer_external_managed(change_device):
handle_return(driver.cuMemFree(ptr))


def test_mr_deallocate_called_on_close():
"""Buffer.from_handle(mr=mr) calls mr.deallocate() on close (issue #1619)."""
device = Device()
device.set_current()
mr = TrackingMR()
buf = mr.allocate(1024)
assert len(mr.active) == 1
buf.close()
assert len(mr.active) == 0


def test_mr_deallocate_called_on_gc():
"""Buffer.from_handle(mr=mr) calls mr.deallocate() on GC (issue #1619)."""
import gc

device = Device()
device.set_current()
mr = TrackingMR()
buf = mr.allocate(1024)
assert len(mr.active) == 1
del buf
gc.collect()
assert len(mr.active) == 0


def test_memory_resource_and_owner_disallowed():
with pytest.raises(ValueError, match="cannot be both specified together"):
a = (ctypes.c_byte * 20)()
Expand Down
Loading