From 143b12880f4f1144a3ac6f4a0129df3daf13dfcf Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 12 Feb 2026 14:00:14 -0500 Subject: [PATCH 1/3] [PERF]: Faster void * conversion --- cuda_bindings/cuda/bindings/_lib/utils.pxd.in | 13 +- cuda_bindings/cuda/bindings/_lib/utils.pxi.in | 44 +- cuda_bindings/cuda/bindings/driver.pyx.in | 224 +++++---- cuda_bindings/cuda/bindings/nvrtc.pyx.in | 12 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 435 ++++++++++-------- 5 files changed, 426 insertions(+), 302 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxd.in b/cuda_bindings/cuda/bindings/_lib/utils.pxd.in index 7b4adb9624..1e4603e84b 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pxd.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pxd.in @@ -1,10 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE cimport cuda.bindings.driver as driver cimport cuda.bindings.cydriver as cydriver cimport cuda.bindings.cyruntime as cyruntime from libcpp.vector cimport vector +from cpython.buffer cimport PyBuffer_Release, Py_buffer cdef class _HelperKernelParams: cdef Py_buffer _pybuffer @@ -18,6 +19,16 @@ cdef class _HelperInputVoidPtr: cdef Py_buffer _pybuffer cdef void* _cptr cdef bint _pyobj_acquired + +cdef struct _HelperInputVoidPtrStruct: + Py_buffer _pybuffer + +cdef void * _helper_input_void_ptr(ptr, _HelperInputVoidPtrStruct *buffer) + +cdef inline void * _helper_input_void_ptr_free(_HelperInputVoidPtrStruct *helper): + if helper[0]._pybuffer.buf != NULL: + PyBuffer_Release(&helper[0]._pybuffer) + {{if 'CUmemPool_attribute_enum' in found_types}} cdef class _HelperCUmemPool_attribute: diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in index f4b75741bc..14da64f5f4 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in @@ -129,30 +129,42 @@ cdef class _HelperKernelParams: cdef class _HelperInputVoidPtr: def __cinit__(self, ptr): self._pyobj_acquired = False - if ptr is None: - self._cptr = NULL - elif isinstance(ptr, (int)): - # Easy run, user gave us an already configured void** address + try: self._cptr = ptr - elif isinstance(ptr, (_driver["CUdeviceptr"])): - self._cptr = int(ptr) + except: + if ptr is None: + self._cptr = NULL + elif PyObject_CheckBuffer(ptr): + # Easy run, get address from Python Buffer Protocol + err_buffer = PyObject_GetBuffer(ptr, &self._pybuffer, PyBUF_SIMPLE | PyBUF_ANY_CONTIGUOUS) + if err_buffer == -1: + raise RuntimeError("Failed to retrieve buffer through Buffer Protocol") + self._pyobj_acquired = True + self._cptr = self._pybuffer.buf + else: + raise TypeError("Provided argument is of type {} but expected Type {}, {} or object with Buffer Protocol".format(type(ptr), type(None), type(int))) + + def __dealloc__(self): + if self._pyobj_acquired is True: + PyBuffer_Release(&self._pybuffer) + + +cdef void * _helper_input_void_ptr(ptr, _HelperInputVoidPtrStruct *helper): + helper[0]._pybuffer.buf = NULL + try: + return ptr + except: + if ptr is None: + return NULL elif PyObject_CheckBuffer(ptr): # Easy run, get address from Python Buffer Protocol - err_buffer = PyObject_GetBuffer(ptr, &self._pybuffer, PyBUF_SIMPLE | PyBUF_ANY_CONTIGUOUS) + err_buffer = PyObject_GetBuffer(ptr, &helper[0]._pybuffer, PyBUF_SIMPLE | PyBUF_ANY_CONTIGUOUS) if err_buffer == -1: raise RuntimeError("Failed to retrieve buffer through Buffer Protocol") - self._pyobj_acquired = True - self._cptr = self._pybuffer.buf + return (helper[0]._pybuffer.buf) else: raise TypeError("Provided argument is of type {} but expected Type {}, {} or object with Buffer Protocol".format(type(ptr), type(None), type(int))) - def __dealloc__(self): - if self._pyobj_acquired is True: - PyBuffer_Release(&self._pybuffer) - - @property - def cptr(self): - return self._cptr {{if 'CUmemPool_attribute_enum' in found_types}} diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index a3d90f7223..5c78562da8 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -26209,10 +26209,11 @@ def cuDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, dev, int flags): else: pdev = int(CUdevice(dev)) cydev = pdev - cynvSciSyncAttrList = _HelperInputVoidPtr(nvSciSyncAttrList) - cdef void* cynvSciSyncAttrList_ptr = cynvSciSyncAttrList.cptr + cdef _HelperInputVoidPtrStruct cynvSciSyncAttrListHelper + cdef void* cynvSciSyncAttrList = _helper_input_void_ptr(nvSciSyncAttrList, &cynvSciSyncAttrListHelper) with nogil: - err = cydriver.cuDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, cydev, flags) + err = cydriver.cuDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList, cydev, flags) + _helper_input_void_ptr_free(&cynvSciSyncAttrListHelper) return (CUresult(err),) {{endif}} @@ -28315,10 +28316,11 @@ def cuModuleLoadData(image): :py:obj:`~.cuModuleGetFunction`, :py:obj:`~.cuModuleGetGlobal`, :py:obj:`~.cuModuleGetTexRef`, :py:obj:`~.cuModuleLoad`, :py:obj:`~.cuModuleLoadDataEx`, :py:obj:`~.cuModuleLoadFatBinary`, :py:obj:`~.cuModuleUnload` """ cdef CUmodule module = CUmodule() - cyimage = _HelperInputVoidPtr(image) - cdef void* cyimage_ptr = cyimage.cptr + cdef _HelperInputVoidPtrStruct cyimageHelper + cdef void* cyimage = _helper_input_void_ptr(image, &cyimageHelper) with nogil: - err = cydriver.cuModuleLoadData(module._pvt_ptr, cyimage_ptr) + err = cydriver.cuModuleLoadData(module._pvt_ptr, cyimage) + _helper_input_void_ptr_free(&cyimageHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), module) @@ -28362,8 +28364,8 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[ if not all(isinstance(_x, (CUjit_option)) for _x in options): raise TypeError("Argument 'options' is not instance of type (expected tuple[cydriver.CUjit_option] or list[cydriver.CUjit_option]") cdef CUmodule module = CUmodule() - cyimage = _HelperInputVoidPtr(image) - cdef void* cyimage_ptr = cyimage.cptr + cdef _HelperInputVoidPtrStruct cyimageHelper + cdef void* cyimage = _helper_input_void_ptr(image, &cyimageHelper) if numOptions > len(options): raise RuntimeError("List is too small: " + str(len(options)) + " < " + str(numOptions)) if numOptions > len(optionValues): raise RuntimeError("List is too small: " + str(len(optionValues)) + " < " + str(numOptions)) cdef vector[cydriver.CUjit_option] cyoptions = [int(pyoptions) for pyoptions in (options)] @@ -28371,7 +28373,8 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[ cdef _InputVoidPtrPtrHelper voidStarHelperoptionValues = _InputVoidPtrPtrHelper(pylist) cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr with nogil: - err = cydriver.cuModuleLoadDataEx(module._pvt_ptr, cyimage_ptr, numOptions, cyoptions.data(), cyoptionValues_ptr) + err = cydriver.cuModuleLoadDataEx(module._pvt_ptr, cyimage, numOptions, cyoptions.data(), cyoptionValues_ptr) + _helper_input_void_ptr_free(&cyimageHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), module) @@ -28411,10 +28414,11 @@ def cuModuleLoadFatBinary(fatCubin): :py:obj:`~.cuModuleGetFunction`, :py:obj:`~.cuModuleGetGlobal`, :py:obj:`~.cuModuleGetTexRef`, :py:obj:`~.cuModuleLoad`, :py:obj:`~.cuModuleLoadData`, :py:obj:`~.cuModuleLoadDataEx`, :py:obj:`~.cuModuleUnload` """ cdef CUmodule module = CUmodule() - cyfatCubin = _HelperInputVoidPtr(fatCubin) - cdef void* cyfatCubin_ptr = cyfatCubin.cptr + cdef _HelperInputVoidPtrStruct cyfatCubinHelper + cdef void* cyfatCubin = _helper_input_void_ptr(fatCubin, &cyfatCubinHelper) with nogil: - err = cydriver.cuModuleLoadFatBinary(module._pvt_ptr, cyfatCubin_ptr) + err = cydriver.cuModuleLoadFatBinary(module._pvt_ptr, cyfatCubin) + _helper_input_void_ptr_free(&cyfatCubinHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), module) @@ -28810,8 +28814,8 @@ def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size, pstate = int(CUlinkState(state)) cystate = pstate cdef cydriver.CUjitInputType cytypename = int(typename) - cydata = _HelperInputVoidPtr(data) - cdef void* cydata_ptr = cydata.cptr + cdef _HelperInputVoidPtrStruct cydataHelper + cdef void* cydata = _helper_input_void_ptr(data, &cydataHelper) if numOptions > len(options): raise RuntimeError("List is too small: " + str(len(options)) + " < " + str(numOptions)) if numOptions > len(optionValues): raise RuntimeError("List is too small: " + str(len(optionValues)) + " < " + str(numOptions)) cdef vector[cydriver.CUjit_option] cyoptions = [int(pyoptions) for pyoptions in (options)] @@ -28819,7 +28823,8 @@ def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size, cdef _InputVoidPtrPtrHelper voidStarHelperoptionValues = _InputVoidPtrPtrHelper(pylist) cdef void** cyoptionValues_ptr = voidStarHelperoptionValues.cptr with nogil: - err = cydriver.cuLinkAddData(cystate, cytypename, cydata_ptr, size, name, numOptions, cyoptions.data(), cyoptionValues_ptr) + err = cydriver.cuLinkAddData(cystate, cytypename, cydata, size, name, numOptions, cyoptions.data(), cyoptionValues_ptr) + _helper_input_void_ptr_free(&cydataHelper) return (CUresult(err),) {{endif}} @@ -29147,8 +29152,8 @@ def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUj if not all(isinstance(_x, (CUjit_option)) for _x in jitOptions): raise TypeError("Argument 'jitOptions' is not instance of type (expected tuple[cydriver.CUjit_option] or list[cydriver.CUjit_option]") cdef CUlibrary library = CUlibrary() - cycode = _HelperInputVoidPtr(code) - cdef void* cycode_ptr = cycode.cptr + cdef _HelperInputVoidPtrStruct cycodeHelper + cdef void* cycode = _helper_input_void_ptr(code, &cycodeHelper) cdef vector[cydriver.CUjit_option] cyjitOptions = [int(pyjitOptions) for pyjitOptions in (jitOptions)] pylist = [_HelperCUjit_option(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef _InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = _InputVoidPtrPtrHelper(pylist) @@ -29162,7 +29167,8 @@ def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUj if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " < " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) with nogil: - err = cydriver.cuLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) + err = cydriver.cuLibraryLoadData(library._pvt_ptr, cycode, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) + _helper_input_void_ptr_free(&cycodeHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), library) @@ -30460,10 +30466,11 @@ def cuMemFreeHost(p): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaFreeHost` """ - cyp = _HelperInputVoidPtr(p) - cdef void* cyp_ptr = cyp.cptr + cdef _HelperInputVoidPtrStruct cypHelper + cdef void* cyp = _helper_input_void_ptr(p, &cypHelper) with nogil: - err = cydriver.cuMemFreeHost(cyp_ptr) + err = cydriver.cuMemFreeHost(cyp) + _helper_input_void_ptr_free(&cypHelper) return (CUresult(err),) {{endif}} @@ -30614,10 +30621,11 @@ def cuMemHostGetDevicePointer(p, unsigned int Flags): :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaHostGetDevicePointer` """ cdef CUdeviceptr pdptr = CUdeviceptr() - cyp = _HelperInputVoidPtr(p) - cdef void* cyp_ptr = cyp.cptr + cdef _HelperInputVoidPtrStruct cypHelper + cdef void* cyp = _helper_input_void_ptr(p, &cypHelper) with nogil: - err = cydriver.cuMemHostGetDevicePointer(pdptr._pvt_ptr, cyp_ptr, Flags) + err = cydriver.cuMemHostGetDevicePointer(pdptr._pvt_ptr, cyp, Flags) + _helper_input_void_ptr_free(&cypHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), pdptr) @@ -30653,10 +30661,11 @@ def cuMemHostGetFlags(p): :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cudaHostGetFlags` """ cdef unsigned int pFlags = 0 - cyp = _HelperInputVoidPtr(p) - cdef void* cyp_ptr = cyp.cptr + cdef _HelperInputVoidPtrStruct cypHelper + cdef void* cyp = _helper_input_void_ptr(p, &cypHelper) with nogil: - err = cydriver.cuMemHostGetFlags(&pFlags, cyp_ptr) + err = cydriver.cuMemHostGetFlags(&pFlags, cyp) + _helper_input_void_ptr_free(&cypHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), pFlags) @@ -30872,15 +30881,15 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData): else: pdevice = int(CUdevice(device)) cydevice = pdevice - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cuAsyncCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (CUresult.CUDA_ERROR_OUT_OF_MEMORY, None) cbData.callback = cycallbackFunc - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData cdef CUasyncCallbackHandle callback = CUasyncCallbackHandle() with nogil: @@ -30889,6 +30898,7 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData): free(cbData) else: m_global._allocated[int(callback)] = cbData + _helper_input_void_ptr_free(&cyuserDataHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), callback) @@ -31408,10 +31418,11 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags): -------- :py:obj:`~.cuMemHostUnregister`, :py:obj:`~.cuMemHostGetFlags`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cudaHostRegister` """ - cyp = _HelperInputVoidPtr(p) - cdef void* cyp_ptr = cyp.cptr + cdef _HelperInputVoidPtrStruct cypHelper + cdef void* cyp = _helper_input_void_ptr(p, &cypHelper) with nogil: - err = cydriver.cuMemHostRegister(cyp_ptr, bytesize, Flags) + err = cydriver.cuMemHostRegister(cyp, bytesize, Flags) + _helper_input_void_ptr_free(&cypHelper) return (CUresult(err),) {{endif}} @@ -31441,10 +31452,11 @@ def cuMemHostUnregister(p): -------- :py:obj:`~.cuMemHostRegister`, :py:obj:`~.cudaHostUnregister` """ - cyp = _HelperInputVoidPtr(p) - cdef void* cyp_ptr = cyp.cptr + cdef _HelperInputVoidPtrStruct cypHelper + cdef void* cyp = _helper_input_void_ptr(p, &cypHelper) with nogil: - err = cydriver.cuMemHostUnregister(cyp_ptr) + err = cydriver.cuMemHostUnregister(cyp) + _helper_input_void_ptr_free(&cypHelper) return (CUresult(err),) {{endif}} @@ -31607,10 +31619,11 @@ def cuMemcpyHtoD(dstDevice, srcHost, size_t ByteCount): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - cysrcHost = _HelperInputVoidPtr(srcHost) - cdef void* cysrcHost_ptr = cysrcHost.cptr + cdef _HelperInputVoidPtrStruct cysrcHostHelper + cdef void* cysrcHost = _helper_input_void_ptr(srcHost, &cysrcHostHelper) with nogil: - err = cydriver.cuMemcpyHtoD(cydstDevice, cysrcHost_ptr, ByteCount) + err = cydriver.cuMemcpyHtoD(cydstDevice, cysrcHost, ByteCount) + _helper_input_void_ptr_free(&cysrcHostHelper) return (CUresult(err),) {{endif}} @@ -31650,10 +31663,11 @@ def cuMemcpyDtoH(dstHost, srcDevice, size_t ByteCount): else: psrcDevice = int(CUdeviceptr(srcDevice)) cysrcDevice = psrcDevice - cydstHost = _HelperInputVoidPtr(dstHost) - cdef void* cydstHost_ptr = cydstHost.cptr + cdef _HelperInputVoidPtrStruct cydstHostHelper + cdef void* cydstHost = _helper_input_void_ptr(dstHost, &cydstHostHelper) with nogil: - err = cydriver.cuMemcpyDtoH(cydstHost_ptr, cysrcDevice, ByteCount) + err = cydriver.cuMemcpyDtoH(cydstHost, cysrcDevice, ByteCount) + _helper_input_void_ptr_free(&cydstHostHelper) return (CUresult(err),) {{endif}} @@ -31851,10 +31865,11 @@ def cuMemcpyHtoA(dstArray, size_t dstOffset, srcHost, size_t ByteCount): else: pdstArray = int(CUarray(dstArray)) cydstArray = pdstArray - cysrcHost = _HelperInputVoidPtr(srcHost) - cdef void* cysrcHost_ptr = cysrcHost.cptr + cdef _HelperInputVoidPtrStruct cysrcHostHelper + cdef void* cysrcHost = _helper_input_void_ptr(srcHost, &cysrcHostHelper) with nogil: - err = cydriver.cuMemcpyHtoA(cydstArray, dstOffset, cysrcHost_ptr, ByteCount) + err = cydriver.cuMemcpyHtoA(cydstArray, dstOffset, cysrcHost, ByteCount) + _helper_input_void_ptr_free(&cysrcHostHelper) return (CUresult(err),) {{endif}} @@ -31897,10 +31912,11 @@ def cuMemcpyAtoH(dstHost, srcArray, size_t srcOffset, size_t ByteCount): else: psrcArray = int(CUarray(srcArray)) cysrcArray = psrcArray - cydstHost = _HelperInputVoidPtr(dstHost) - cdef void* cydstHost_ptr = cydstHost.cptr + cdef _HelperInputVoidPtrStruct cydstHostHelper + cdef void* cydstHost = _helper_input_void_ptr(dstHost, &cydstHostHelper) with nogil: - err = cydriver.cuMemcpyAtoH(cydstHost_ptr, cysrcArray, srcOffset, ByteCount) + err = cydriver.cuMemcpyAtoH(cydstHost, cysrcArray, srcOffset, ByteCount) + _helper_input_void_ptr_free(&cydstHostHelper) return (CUresult(err),) {{endif}} @@ -32564,10 +32580,11 @@ def cuMemcpyHtoDAsync(dstDevice, srcHost, size_t ByteCount, hStream): else: pdstDevice = int(CUdeviceptr(dstDevice)) cydstDevice = pdstDevice - cysrcHost = _HelperInputVoidPtr(srcHost) - cdef void* cysrcHost_ptr = cysrcHost.cptr + cdef _HelperInputVoidPtrStruct cysrcHostHelper + cdef void* cysrcHost = _helper_input_void_ptr(srcHost, &cysrcHostHelper) with nogil: - err = cydriver.cuMemcpyHtoDAsync(cydstDevice, cysrcHost_ptr, ByteCount, cyhStream) + err = cydriver.cuMemcpyHtoDAsync(cydstDevice, cysrcHost, ByteCount, cyhStream) + _helper_input_void_ptr_free(&cysrcHostHelper) return (CUresult(err),) {{endif}} @@ -32617,10 +32634,11 @@ def cuMemcpyDtoHAsync(dstHost, srcDevice, size_t ByteCount, hStream): else: psrcDevice = int(CUdeviceptr(srcDevice)) cysrcDevice = psrcDevice - cydstHost = _HelperInputVoidPtr(dstHost) - cdef void* cydstHost_ptr = cydstHost.cptr + cdef _HelperInputVoidPtrStruct cydstHostHelper + cdef void* cydstHost = _helper_input_void_ptr(dstHost, &cydstHostHelper) with nogil: - err = cydriver.cuMemcpyDtoHAsync(cydstHost_ptr, cysrcDevice, ByteCount, cyhStream) + err = cydriver.cuMemcpyDtoHAsync(cydstHost, cysrcDevice, ByteCount, cyhStream) + _helper_input_void_ptr_free(&cydstHostHelper) return (CUresult(err),) {{endif}} @@ -32732,10 +32750,11 @@ def cuMemcpyHtoAAsync(dstArray, size_t dstOffset, srcHost, size_t ByteCount, hSt else: pdstArray = int(CUarray(dstArray)) cydstArray = pdstArray - cysrcHost = _HelperInputVoidPtr(srcHost) - cdef void* cysrcHost_ptr = cysrcHost.cptr + cdef _HelperInputVoidPtrStruct cysrcHostHelper + cdef void* cysrcHost = _helper_input_void_ptr(srcHost, &cysrcHostHelper) with nogil: - err = cydriver.cuMemcpyHtoAAsync(cydstArray, dstOffset, cysrcHost_ptr, ByteCount, cyhStream) + err = cydriver.cuMemcpyHtoAAsync(cydstArray, dstOffset, cysrcHost, ByteCount, cyhStream) + _helper_input_void_ptr_free(&cysrcHostHelper) return (CUresult(err),) {{endif}} @@ -32788,10 +32807,11 @@ def cuMemcpyAtoHAsync(dstHost, srcArray, size_t srcOffset, size_t ByteCount, hSt else: psrcArray = int(CUarray(srcArray)) cysrcArray = psrcArray - cydstHost = _HelperInputVoidPtr(dstHost) - cdef void* cydstHost_ptr = cydstHost.cptr + cdef _HelperInputVoidPtrStruct cydstHostHelper + cdef void* cydstHost = _helper_input_void_ptr(dstHost, &cydstHostHelper) with nogil: - err = cydriver.cuMemcpyAtoHAsync(cydstHost_ptr, cysrcArray, srcOffset, ByteCount, cyhStream) + err = cydriver.cuMemcpyAtoHAsync(cydstHost, cysrcArray, srcOffset, ByteCount, cyhStream) + _helper_input_void_ptr_free(&cydstHostHelper) return (CUresult(err),) {{endif}} @@ -35743,11 +35763,12 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) created on devices under an SLI group may not be supported, and thus this API will return CUDA_ERROR_NOT_SUPPORTED. There is no guarantee that the contents of `handle` will be the same CUDA memory handle for the same given OS shareable handle, or the same underlying allocation. """ cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() - cyosHandle = _HelperInputVoidPtr(osHandle) - cdef void* cyosHandle_ptr = cyosHandle.cptr + cdef _HelperInputVoidPtrStruct cyosHandleHelper + cdef void* cyosHandle = _helper_input_void_ptr(osHandle, &cyosHandleHelper) cdef cydriver.CUmemAllocationHandleType cyshHandleType = int(shHandleType) with nogil: - err = cydriver.cuMemImportFromShareableHandle(handle._pvt_ptr, cyosHandle_ptr, cyshHandleType) + err = cydriver.cuMemImportFromShareableHandle(handle._pvt_ptr, cyosHandle, cyshHandleType) + _helper_input_void_ptr_free(&cyosHandleHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), handle) @@ -35863,10 +35884,11 @@ def cuMemRetainAllocationHandle(addr): The address `addr`, can be any address in a range previously mapped by :py:obj:`~.cuMemMap`, and not necessarily the start address. """ cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle() - cyaddr = _HelperInputVoidPtr(addr) - cdef void* cyaddr_ptr = cyaddr.cptr + cdef _HelperInputVoidPtrStruct cyaddrHelper + cdef void* cyaddr = _helper_input_void_ptr(addr, &cyaddrHelper) with nogil: - err = cydriver.cuMemRetainAllocationHandle(handle._pvt_ptr, cyaddr_ptr) + err = cydriver.cuMemRetainAllocationHandle(handle._pvt_ptr, cyaddr) + _helper_input_void_ptr_free(&cyaddrHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), handle) @@ -36744,11 +36766,12 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca Imported memory pools do not support creating new allocations. As such imported memory pools may not be used in cuDeviceSetMemPool or :py:obj:`~.cuMemAllocFromPoolAsync` calls. """ cdef CUmemoryPool pool_out = CUmemoryPool() - cyhandle = _HelperInputVoidPtr(handle) - cdef void* cyhandle_ptr = cyhandle.cptr + cdef _HelperInputVoidPtrStruct cyhandleHelper + cdef void* cyhandle = _helper_input_void_ptr(handle, &cyhandleHelper) cdef cydriver.CUmemAllocationHandleType cyhandleType = int(handleType) with nogil: - err = cydriver.cuMemPoolImportFromShareableHandle(pool_out._pvt_ptr, cyhandle_ptr, cyhandleType, flags) + err = cydriver.cuMemPoolImportFromShareableHandle(pool_out._pvt_ptr, cyhandle, cyhandleType, flags) + _helper_input_void_ptr_free(&cyhandleHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), pool_out) @@ -39286,20 +39309,21 @@ def cuStreamAddCallback(hStream, callback, userData, unsigned int flags): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cuStreamCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (CUresult.CUDA_ERROR_OUT_OF_MEMORY,) cbData.callback = cycallback - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData with nogil: err = cydriver.cuStreamAddCallback(cyhStream, cuStreamCallbackWrapper, cbData, flags) if err != cydriver.CUDA_SUCCESS: free(cbData) + _helper_input_void_ptr_free(&cyuserDataHelper) return (CUresult(err),) {{endif}} @@ -43021,20 +43045,21 @@ def cuLaunchHostFunc(hStream, fn, userData): else: phStream = int(CUstream(hStream)) cyhStream = phStream - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cuHostCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (CUresult.CUDA_ERROR_OUT_OF_MEMORY,) cbData.callback = cyfn - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData with nogil: err = cydriver.cuLaunchHostFunc(cyhStream, cuHostCallbackWrapper, cbData) if err != cydriver.CUDA_SUCCESS: free(cbData) + _helper_input_void_ptr_free(&cyuserDataHelper) return (CUresult(err),) {{endif}} @@ -43289,10 +43314,11 @@ def cuParamSetv(hfunc, int offset, ptr, unsigned int numbytes): else: phfunc = int(CUfunction(hfunc)) cyhfunc = phfunc - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cydriver.cuParamSetv(cyhfunc, offset, cyptr_ptr, numbytes) + err = cydriver.cuParamSetv(cyhfunc, offset, cyptr, numbytes) + _helper_input_void_ptr_free(&cyptrHelper) return (CUresult(err),) {{endif}} @@ -48378,10 +48404,11 @@ def cuUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned int pdestroy = int(CUhostFn(destroy)) cydestroy = pdestroy cdef CUuserObject object_out = CUuserObject() - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cydriver.cuUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) + err = cydriver.cuUserObjectCreate(object_out._pvt_ptr, cyptr, cydestroy, initialRefcount, flags) + _helper_input_void_ptr_free(&cyptrHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), object_out) @@ -51515,14 +51542,15 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor cytensorRank = ptensorRank cdef CUtensorMap tensorMap = CUtensorMap() cdef cydriver.CUtensorMapDataType cytensorDataType = int(tensorDataType) - cyglobalAddress = _HelperInputVoidPtr(globalAddress) - cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr + cdef _HelperInputVoidPtrStruct cyglobalAddressHelper + cdef void* cyglobalAddress = _helper_input_void_ptr(globalAddress, &cyglobalAddressHelper) cdef cydriver.CUtensorMapInterleave cyinterleave = int(interleave) cdef cydriver.CUtensorMapSwizzle cyswizzle = int(swizzle) cdef cydriver.CUtensorMapL2promotion cyl2Promotion = int(l2Promotion) cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = int(oobFill) with nogil: - err = cydriver.cuTensorMapEncodeTiled(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, cyboxDim, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + err = cydriver.cuTensorMapEncodeTiled(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress, cyglobalDim, cyglobalStrides, cyboxDim, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + _helper_input_void_ptr_free(&cyglobalAddressHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), tensorMap) @@ -51857,8 +51885,8 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso cytensorRank = ptensorRank cdef CUtensorMap tensorMap = CUtensorMap() cdef cydriver.CUtensorMapDataType cytensorDataType = int(tensorDataType) - cyglobalAddress = _HelperInputVoidPtr(globalAddress) - cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr + cdef _HelperInputVoidPtrStruct cyglobalAddressHelper + cdef void* cyglobalAddress = _helper_input_void_ptr(globalAddress, &cyglobalAddressHelper) cdef vector[int] cypixelBoxLowerCorner = pixelBoxLowerCorner cdef vector[int] cypixelBoxUpperCorner = pixelBoxUpperCorner cdef cydriver.CUtensorMapInterleave cyinterleave = int(interleave) @@ -51866,7 +51894,8 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso cdef cydriver.CUtensorMapL2promotion cyl2Promotion = int(l2Promotion) cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = int(oobFill) with nogil: - err = cydriver.cuTensorMapEncodeIm2col(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, cypixelBoxLowerCorner.data(), cypixelBoxUpperCorner.data(), cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + err = cydriver.cuTensorMapEncodeIm2col(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress, cyglobalDim, cyglobalStrides, cypixelBoxLowerCorner.data(), cypixelBoxUpperCorner.data(), cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cyswizzle, cyl2Promotion, cyoobFill) + _helper_input_void_ptr_free(&cyglobalAddressHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), tensorMap) @@ -52182,15 +52211,16 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t cytensorRank = ptensorRank cdef CUtensorMap tensorMap = CUtensorMap() cdef cydriver.CUtensorMapDataType cytensorDataType = int(tensorDataType) - cyglobalAddress = _HelperInputVoidPtr(globalAddress) - cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr + cdef _HelperInputVoidPtrStruct cyglobalAddressHelper + cdef void* cyglobalAddress = _helper_input_void_ptr(globalAddress, &cyglobalAddressHelper) cdef cydriver.CUtensorMapInterleave cyinterleave = int(interleave) cdef cydriver.CUtensorMapIm2ColWideMode cymode = int(mode) cdef cydriver.CUtensorMapSwizzle cyswizzle = int(swizzle) cdef cydriver.CUtensorMapL2promotion cyl2Promotion = int(l2Promotion) cdef cydriver.CUtensorMapFloatOOBfill cyoobFill = int(oobFill) with nogil: - err = cydriver.cuTensorMapEncodeIm2colWide(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress_ptr, cyglobalDim, cyglobalStrides, pixelBoxLowerCornerWidth, pixelBoxUpperCornerWidth, cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cymode, cyswizzle, cyl2Promotion, cyoobFill) + err = cydriver.cuTensorMapEncodeIm2colWide(tensorMap._pvt_ptr, cytensorDataType, cytensorRank, cyglobalAddress, cyglobalDim, cyglobalStrides, pixelBoxLowerCornerWidth, pixelBoxUpperCornerWidth, cychannelsPerPixel, cypixelsPerColumn, cyelementStrides, cyinterleave, cymode, cyswizzle, cyl2Promotion, cyoobFill) + _helper_input_void_ptr_free(&cyglobalAddressHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), tensorMap) @@ -52227,10 +52257,11 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress): :py:obj:`~.cuTensorMapEncodeTiled`, :py:obj:`~.cuTensorMapEncodeIm2col`, :py:obj:`~.cuTensorMapEncodeIm2colWide` """ cdef cydriver.CUtensorMap* cytensorMap_ptr = tensorMap._pvt_ptr if tensorMap is not None else NULL - cyglobalAddress = _HelperInputVoidPtr(globalAddress) - cdef void* cyglobalAddress_ptr = cyglobalAddress.cptr + cdef _HelperInputVoidPtrStruct cyglobalAddressHelper + cdef void* cyglobalAddress = _helper_input_void_ptr(globalAddress, &cyglobalAddressHelper) with nogil: - err = cydriver.cuTensorMapReplaceAddress(cytensorMap_ptr, cyglobalAddress_ptr) + err = cydriver.cuTensorMapReplaceAddress(cytensorMap_ptr, cyglobalAddress) + _helper_input_void_ptr_free(&cyglobalAddressHelper) return (CUresult(err),) {{endif}} @@ -54594,15 +54625,15 @@ def cuLogsRegisterCallback(callbackFunc, userData): else: pcallbackFunc = int(CUlogsCallback(callbackFunc)) cycallbackFunc = pcallbackFunc - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cuLogsCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (CUresult.CUDA_ERROR_OUT_OF_MEMORY, None) cbData.callback = cycallbackFunc - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData cdef CUlogsCallbackHandle callback_out = CUlogsCallbackHandle() with nogil: @@ -54611,6 +54642,7 @@ def cuLogsRegisterCallback(callbackFunc, userData): free(cbData) else: m_global._allocated[int(callback_out)] = cbData + _helper_input_void_ptr_free(&cyuserDataHelper) if err != cydriver.CUDA_SUCCESS: return (CUresult(err), None) return (CUresult(err), callback_out) diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx.in b/cuda_bindings/cuda/bindings/nvrtc.pyx.in index 9f5949f4b5..b3ecbedd53 100644 --- a/cuda_bindings/cuda/bindings/nvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/nvrtc.pyx.in @@ -1029,12 +1029,14 @@ def nvrtcSetFlowCallback(prog, callback, payload): else: pprog = int(nvrtcProgram(prog)) cyprog = pprog - cycallback = _HelperInputVoidPtr(callback) - cdef void* cycallback_ptr = cycallback.cptr - cypayload = _HelperInputVoidPtr(payload) - cdef void* cypayload_ptr = cypayload.cptr + cdef _HelperInputVoidPtrStruct cycallbackHelper + cdef void* cycallback = _helper_input_void_ptr(callback, &cycallbackHelper) + cdef _HelperInputVoidPtrStruct cypayloadHelper + cdef void* cypayload = _helper_input_void_ptr(payload, &cypayloadHelper) with nogil: - err = cynvrtc.nvrtcSetFlowCallback(cyprog, cycallback_ptr, cypayload_ptr) + err = cynvrtc.nvrtcSetFlowCallback(cyprog, cycallback, cypayload) + _helper_input_void_ptr_free(&cycallbackHelper) + _helper_input_void_ptr_free(&cypayloadHelper) return (nvrtcResult(err),) {{endif}} diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index d47832656e..f9b388b9dd 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -21203,10 +21203,11 @@ def cudaIpcGetMemHandle(devPtr): :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cuIpcGetMemHandle` """ cdef cudaIpcMemHandle_t handle = cudaIpcMemHandle_t() - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaIpcGetMemHandle(handle._pvt_ptr, cydevPtr_ptr) + err = cyruntime.cudaIpcGetMemHandle(handle._pvt_ptr, cydevPtr) + _helper_input_void_ptr_free(&cydevPtrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), handle) @@ -21321,10 +21322,11 @@ def cudaIpcCloseMemHandle(devPtr): -------- :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle` """ - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaIpcCloseMemHandle(cydevPtr_ptr) + err = cyruntime.cudaIpcCloseMemHandle(cydevPtr) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -21436,15 +21438,15 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): else: pcallbackFunc = int(cudaAsyncCallback(callbackFunc)) cycallbackFunc = pcallbackFunc - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &userDataHelper) cdef cudaAsyncCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (cudaError_t.cudaErrorMemoryAllocation, None) cbData.callback = cycallbackFunc - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData cdef cudaAsyncCallbackHandle_t callback = cudaAsyncCallbackHandle_t() with nogil: @@ -21453,6 +21455,8 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): free(cbData) else: m_global._allocated[int(callback)] = cbData + _helper_input_void_ptr_free(&userDataHelper) + if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), callback) @@ -22075,10 +22079,11 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags): -------- :py:obj:`~.cudaImportExternalSemaphore`, :py:obj:`~.cudaDestroyExternalSemaphore`, :py:obj:`~.cudaSignalExternalSemaphoresAsync`, :py:obj:`~.cudaWaitExternalSemaphoresAsync` """ - cynvSciSyncAttrList = _HelperInputVoidPtr(nvSciSyncAttrList) - cdef void* cynvSciSyncAttrList_ptr = cynvSciSyncAttrList.cptr + cdef _HelperInputVoidPtrStruct cynvSciSyncAttrListHelper + cdef void* cynvSciSyncAttrList = _helper_input_void_ptr(nvSciSyncAttrList, &cynvSciSyncAttrListHelper) with nogil: - err = cyruntime.cudaDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList_ptr, device, flags) + err = cyruntime.cudaDeviceGetNvSciSyncAttributes(cynvSciSyncAttrList, device, flags) + _helper_input_void_ptr_free(&cynvSciSyncAttrListHelper) return (cudaError_t(err),) {{endif}} @@ -23185,20 +23190,22 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cudaStreamCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (cudaError_t.cudaErrorMemoryAllocation,) cbData.callback = cycallback - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData with nogil: err = cyruntime.cudaStreamAddCallback(cystream, cudaStreamRtCallbackWrapper, cbData, flags) if err != cyruntime.cudaSuccess: free(cbData) + _helper_input_void_ptr_free(&userDataHelper) + return (cudaError_t(err),) {{endif}} @@ -23384,10 +23391,11 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaStreamAttachMemAsync(cystream, cydevPtr_ptr, length, flags) + err = cyruntime.cudaStreamAttachMemAsync(cystream, cydevPtr, length, flags) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -25196,11 +25204,12 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache): ----- This API does not accept a :py:obj:`~.cudaKernel_t` casted as void*. If cache config modification is required for a :py:obj:`~.cudaKernel_t` (or a global function), it can be replaced with a call to :py:obj:`~.cudaFuncSetAttributes` with the attribute :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout` to specify a more granular L1 cache and shared memory split configuration. """ - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) cdef cyruntime.cudaFuncCache cycacheConfig = int(cacheConfig) with nogil: - err = cyruntime.cudaFuncSetCacheConfig(cyfunc_ptr, cycacheConfig) + err = cyruntime.cudaFuncSetCacheConfig(cyfunc, cycacheConfig) + _helper_input_void_ptr_free(&cyfuncHelper) return (cudaError_t(err),) {{endif}} @@ -25239,10 +25248,11 @@ def cudaFuncGetAttributes(func): :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncGetAttributes (C++ API), :py:obj:`~.cudaLaunchKernel (C API)`, :py:obj:`~.cuFuncGetAttribute` """ cdef cudaFuncAttributes attr = cudaFuncAttributes() - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) with nogil: - err = cyruntime.cudaFuncGetAttributes(attr._pvt_ptr, cyfunc_ptr) + err = cyruntime.cudaFuncGetAttributes(attr._pvt_ptr, cyfunc) + _helper_input_void_ptr_free(&cyfuncHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), attr) @@ -25326,11 +25336,12 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value): cudaError_t :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue` """ - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) cdef cyruntime.cudaFuncAttribute cyattr = int(attr) with nogil: - err = cyruntime.cudaFuncSetAttribute(cyfunc_ptr, cyattr, value) + err = cyruntime.cudaFuncSetAttribute(cyfunc, cyattr, value) + _helper_input_void_ptr_free(&cyfuncHelper) return (cudaError_t(err),) {{endif}} @@ -25427,20 +25438,22 @@ def cudaLaunchHostFunc(stream, fn, userData): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cudaStreamHostCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) if cbData == NULL: return (cudaError_t.cudaErrorMemoryAllocation,) cbData.callback = cyfn - cbData.userData = cyuserData_ptr + cbData.userData = cyuserData with nogil: err = cyruntime.cudaLaunchHostFunc(cystream, cudaStreamRtHostCallbackWrapper, cbData) if err != cyruntime.cudaSuccess: free(cbData) + _helper_input_void_ptr_free(&userDataHelper) + return (cudaError_t(err),) {{endif}} @@ -25504,11 +25517,12 @@ def cudaFuncSetSharedMemConfig(func, config not None : cudaSharedMemConfig): -------- :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuFuncSetSharedMemConfig` """ - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) cdef cyruntime.cudaSharedMemConfig cyconfig = int(config) with nogil: - err = cyruntime.cudaFuncSetSharedMemConfig(cyfunc_ptr, cyconfig) + err = cyruntime.cudaFuncSetSharedMemConfig(cyfunc, cyconfig) + _helper_input_void_ptr_free(&cyfuncHelper) return (cudaError_t(err),) {{endif}} @@ -25542,10 +25556,11 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dy :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor` """ cdef int numBlocks = 0 - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) with nogil: - err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize) + err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, cyfunc, blockSize, dynamicSMemSize) + _helper_input_void_ptr_free(&cyfuncHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), numBlocks) @@ -25581,10 +25596,11 @@ def cudaOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), :py:obj:`~.cudaOccupancyAvailableDynamicSMemPerBlock` """ cdef size_t dynamicSmemSize = 0 - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) with nogil: - err = cyruntime.cudaOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc_ptr, numBlocks, blockSize) + err = cyruntime.cudaOccupancyAvailableDynamicSMemPerBlock(&dynamicSmemSize, cyfunc, numBlocks, blockSize) + _helper_input_void_ptr_free(&cyfuncHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), dynamicSmemSize) @@ -25637,10 +25653,11 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` """ cdef int numBlocks = 0 - cyfunc = _HelperInputVoidPtr(func) - cdef void* cyfunc_ptr = cyfunc.cptr + cdef _HelperInputVoidPtrStruct cyfuncHelper + cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper) with nogil: - err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc_ptr, blockSize, dynamicSMemSize, flags) + err = cyruntime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, cyfunc, blockSize, dynamicSMemSize, flags) + _helper_input_void_ptr_free(&cyfuncHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), numBlocks) @@ -26043,10 +26060,11 @@ def cudaFree(devPtr): -------- :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaMallocFromPoolAsync` :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaFreeAsync` :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemFree` """ - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaFree(cydevPtr_ptr) + err = cyruntime.cudaFree(cydevPtr) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -26074,10 +26092,11 @@ def cudaFreeHost(ptr): -------- :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemFreeHost` """ - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaFreeHost(cyptr_ptr) + err = cyruntime.cudaFreeHost(cyptr) + _helper_input_void_ptr_free(&cyptrHelper) return (cudaError_t(err),) {{endif}} @@ -26350,10 +26369,11 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags): -------- :py:obj:`~.cudaHostUnregister`, :py:obj:`~.cudaHostGetFlags`, :py:obj:`~.cudaHostGetDevicePointer`, :py:obj:`~.cuMemHostRegister` """ - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaHostRegister(cyptr_ptr, size, flags) + err = cyruntime.cudaHostRegister(cyptr, size, flags) + _helper_input_void_ptr_free(&cyptrHelper) return (cudaError_t(err),) {{endif}} @@ -26383,10 +26403,11 @@ def cudaHostUnregister(ptr): -------- :py:obj:`~.cudaHostUnregister`, :py:obj:`~.cuMemHostUnregister` """ - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaHostUnregister(cyptr_ptr) + err = cyruntime.cudaHostUnregister(cyptr) + _helper_input_void_ptr_free(&cyptrHelper) return (cudaError_t(err),) {{endif}} @@ -26443,10 +26464,11 @@ def cudaHostGetDevicePointer(pHost, unsigned int flags): :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer` """ cdef void_ptr pDevice = 0 - cypHost = _HelperInputVoidPtr(pHost) - cdef void* cypHost_ptr = cypHost.cptr + cdef _HelperInputVoidPtrStruct cypHostHelper + cdef void* cypHost = _helper_input_void_ptr(pHost, &cypHostHelper) with nogil: - err = cyruntime.cudaHostGetDevicePointer(&pDevice, cypHost_ptr, flags) + err = cyruntime.cudaHostGetDevicePointer(&pDevice, cypHost, flags) + _helper_input_void_ptr_free(&cypHostHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), pDevice) @@ -26478,10 +26500,11 @@ def cudaHostGetFlags(pHost): :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemHostGetFlags` """ cdef unsigned int pFlags = 0 - cypHost = _HelperInputVoidPtr(pHost) - cdef void* cypHost_ptr = cypHost.cptr + cdef _HelperInputVoidPtrStruct cypHostHelper + cdef void* cypHost = _helper_input_void_ptr(pHost, &cypHostHelper) with nogil: - err = cyruntime.cudaHostGetFlags(&pFlags, cypHost_ptr) + err = cyruntime.cudaHostGetFlags(&pFlags, cypHost) + _helper_input_void_ptr_free(&cypHostHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), pFlags) @@ -27502,13 +27525,15 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind): -------- :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy` """ - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy(cydst_ptr, cysrc_ptr, count, cykind) + err = cyruntime.cudaMemcpy(cydst, cysrc, count, cykind) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -27551,12 +27576,14 @@ def cudaMemcpyPeer(dst, int dstDevice, src, int srcDevice, size_t count): -------- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpyPeer` """ - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) with nogil: - err = cyruntime.cudaMemcpyPeer(cydst_ptr, dstDevice, cysrc_ptr, srcDevice, count) + err = cyruntime.cudaMemcpyPeer(cydst, dstDevice, cysrc, srcDevice, count) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -27610,13 +27637,15 @@ def cudaMemcpy2D(dst, size_t dpitch, src, size_t spitch, size_t width, size_t he -------- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned` """ - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2D(cydst_ptr, dpitch, cysrc_ptr, spitch, width, height, cykind) + err = cyruntime.cudaMemcpy2D(cydst, dpitch, cysrc, spitch, width, height, cykind) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -27679,11 +27708,12 @@ def cudaMemcpy2DToArray(dst, size_t wOffset, size_t hOffset, src, size_t spitch, else: pdst = int(cudaArray_t(dst)) cydst = pdst - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2DToArray(cydst, wOffset, hOffset, cysrc_ptr, spitch, width, height, cykind) + err = cyruntime.cudaMemcpy2DToArray(cydst, wOffset, hOffset, cysrc, spitch, width, height, cykind) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -27746,11 +27776,12 @@ def cudaMemcpy2DFromArray(dst, size_t dpitch, src, size_t wOffset, size_t hOffse else: psrc = int(cudaArray_const_t(src)) cysrc = psrc - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2DFromArray(cydst_ptr, dpitch, cysrc, wOffset, hOffset, width, height, cykind) + err = cyruntime.cudaMemcpy2DFromArray(cydst, dpitch, cysrc, wOffset, hOffset, width, height, cykind) + _helper_input_void_ptr_free(&cydstHelper) return (cudaError_t(err),) {{endif}} @@ -27887,13 +27918,15 @@ def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stre else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpyAsync(cydst_ptr, cysrc_ptr, count, cykind, cystream) + err = cyruntime.cudaMemcpyAsync(cydst, cysrc, count, cykind, cystream) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -27944,12 +27977,14 @@ def cudaMemcpyPeerAsync(dst, int dstDevice, src, int srcDevice, size_t count, st else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr - with nogil: - err = cyruntime.cudaMemcpyPeerAsync(cydst_ptr, dstDevice, cysrc_ptr, srcDevice, count, cystream) + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) + with nogil: + err = cyruntime.cudaMemcpyPeerAsync(cydst, dstDevice, cysrc, srcDevice, count, cystream) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -28304,13 +28339,15 @@ def cudaMemcpy2DAsync(dst, size_t dpitch, src, size_t spitch, size_t width, size else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2DAsync(cydst_ptr, dpitch, cysrc_ptr, spitch, width, height, cykind, cystream) + err = cyruntime.cudaMemcpy2DAsync(cydst, dpitch, cysrc, spitch, width, height, cykind, cystream) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -28394,11 +28431,12 @@ def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t sp else: pdst = int(cudaArray_t(dst)) cydst = pdst - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2DToArrayAsync(cydst, wOffset, hOffset, cysrc_ptr, spitch, width, height, cykind, cystream) + err = cyruntime.cudaMemcpy2DToArrayAsync(cydst, wOffset, hOffset, cysrc, spitch, width, height, cykind, cystream) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -28481,11 +28519,12 @@ def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t h else: psrc = int(cudaArray_const_t(src)) cysrc = psrc - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpy2DFromArrayAsync(cydst_ptr, dpitch, cysrc, wOffset, hOffset, width, height, cykind, cystream) + err = cyruntime.cudaMemcpy2DFromArrayAsync(cydst, dpitch, cysrc, wOffset, hOffset, width, height, cykind, cystream) + _helper_input_void_ptr_free(&cydstHelper) return (cudaError_t(err),) {{endif}} @@ -28519,10 +28558,11 @@ def cudaMemset(devPtr, int value, size_t count): -------- :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32` """ - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemset(cydevPtr_ptr, value, count) + err = cyruntime.cudaMemset(cydevPtr, value, count) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -28563,10 +28603,11 @@ def cudaMemset2D(devPtr, size_t pitch, int value, size_t width, size_t height): -------- :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32` """ - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemset2D(cydevPtr_ptr, pitch, value, width, height) + err = cyruntime.cudaMemset2D(cydevPtr, pitch, value, width, height) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -28670,10 +28711,11 @@ def cudaMemsetAsync(devPtr, int value, size_t count, stream): else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemsetAsync(cydevPtr_ptr, value, count, cystream) + err = cyruntime.cudaMemsetAsync(cydevPtr, value, count, cystream) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -28730,10 +28772,11 @@ def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t heig else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemset2DAsync(cydevPtr_ptr, pitch, value, width, height, cystream) + err = cyruntime.cudaMemset2DAsync(cydevPtr, pitch, value, width, height, cystream) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -28918,10 +28961,11 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati else: pstream = int(cudaStream_t(stream)) cystream = pstream - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemPrefetchAsync(cydevPtr_ptr, count, location._pvt_ptr[0], flags, cystream) + err = cyruntime.cudaMemPrefetchAsync(cydevPtr, count, location._pvt_ptr[0], flags, cystream) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -29414,11 +29458,12 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca -------- :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemAdvise` """ - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) cdef cyruntime.cudaMemoryAdvise cyadvice = int(advice) with nogil: - err = cyruntime.cudaMemAdvise(cydevPtr_ptr, count, cyadvice, location._pvt_ptr[0]) + err = cyruntime.cudaMemAdvise(cydevPtr, count, cyadvice, location._pvt_ptr[0]) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -29563,10 +29608,11 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA cdef _HelperCUmem_range_attribute cydata = _HelperCUmem_range_attribute(attribute, dataSize) cdef void* cydata_ptr = cydata.cptr cdef cyruntime.cudaMemRangeAttribute cyattribute = int(attribute) - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr_ptr, count) + err = cyruntime.cudaMemRangeGetAttribute(cydata_ptr, dataSize, cyattribute, cydevPtr, count) + _helper_input_void_ptr_free(&cydevPtrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), cydata.pyObj()) @@ -29644,10 +29690,11 @@ def cudaMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : O cdef vector[cyruntime.cudaMemRangeAttribute] cyattributes = [int(pyattributes) for pyattributes in (attributes)] if numAttributes > len(dataSizes): raise RuntimeError("List is too small: " + str(len(dataSizes)) + " < " + str(numAttributes)) if numAttributes > len(attributes): raise RuntimeError("List is too small: " + str(len(attributes)) + " < " + str(numAttributes)) - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr_ptr, count) + err = cyruntime.cudaMemRangeGetAttributes(cyvoidStarHelper_ptr, cydataSizes.data(), cyattributes.data(), numAttributes, cydevPtr, count) + _helper_input_void_ptr_free(&cydevPtrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), [obj.pyObj() for obj in pylist]) @@ -29704,11 +29751,12 @@ def cudaMemcpyToArray(dst, size_t wOffset, size_t hOffset, src, size_t count, ki else: pdst = int(cudaArray_t(dst)) cydst = pdst - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpyToArray(cydst, wOffset, hOffset, cysrc_ptr, count, cykind) + err = cyruntime.cudaMemcpyToArray(cydst, wOffset, hOffset, cysrc, count, cykind) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -29763,11 +29811,12 @@ def cudaMemcpyFromArray(dst, src, size_t wOffset, size_t hOffset, size_t count, else: psrc = int(cudaArray_const_t(src)) cysrc = psrc - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpyFromArray(cydst_ptr, cysrc, wOffset, hOffset, count, cykind) + err = cyruntime.cudaMemcpyFromArray(cydst, cysrc, wOffset, hOffset, count, cykind) + _helper_input_void_ptr_free(&cydstHelper) return (cudaError_t(err),) {{endif}} @@ -29909,11 +29958,12 @@ def cudaMemcpyToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t coun else: pdst = int(cudaArray_t(dst)) cydst = pdst - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpyToArrayAsync(cydst, wOffset, hOffset, cysrc_ptr, count, cykind, cystream) + err = cyruntime.cudaMemcpyToArrayAsync(cydst, wOffset, hOffset, cysrc, count, cykind, cystream) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -29985,11 +30035,12 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co else: psrc = int(cudaArray_const_t(src)) cysrc = psrc - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaMemcpyFromArrayAsync(cydst_ptr, cysrc, wOffset, hOffset, count, cykind, cystream) + err = cyruntime.cudaMemcpyFromArrayAsync(cydst, cysrc, wOffset, hOffset, count, cykind, cystream) + _helper_input_void_ptr_free(&cydstHelper) return (cudaError_t(err),) {{endif}} @@ -30087,10 +30138,11 @@ def cudaFreeAsync(devPtr, hStream): else: phStream = int(cudaStream_t(hStream)) cyhStream = phStream - cydevPtr = _HelperInputVoidPtr(devPtr) - cdef void* cydevPtr_ptr = cydevPtr.cptr + cdef _HelperInputVoidPtrStruct cydevPtrHelper + cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper) with nogil: - err = cyruntime.cudaFreeAsync(cydevPtr_ptr, cyhStream) + err = cyruntime.cudaFreeAsync(cydevPtr, cyhStream) + _helper_input_void_ptr_free(&cydevPtrHelper) return (cudaError_t(err),) {{endif}} @@ -30863,11 +30915,12 @@ def cudaMemPoolImportFromShareableHandle(shareableHandle, handleType not None : Imported memory pools do not support creating new allocations. As such imported memory pools may not be used in :py:obj:`~.cudaDeviceSetMemPool` or :py:obj:`~.cudaMallocFromPoolAsync` calls. """ cdef cudaMemPool_t memPool = cudaMemPool_t() - cyshareableHandle = _HelperInputVoidPtr(shareableHandle) - cdef void* cyshareableHandle_ptr = cyshareableHandle.cptr + cdef _HelperInputVoidPtrStruct cyshareableHandleHelper + cdef void* cyshareableHandle = _helper_input_void_ptr(shareableHandle, &cyshareableHandleHelper) cdef cyruntime.cudaMemAllocationHandleType cyhandleType = int(handleType) with nogil: - err = cyruntime.cudaMemPoolImportFromShareableHandle(memPool._pvt_ptr, cyshareableHandle_ptr, cyhandleType, flags) + err = cyruntime.cudaMemPoolImportFromShareableHandle(memPool._pvt_ptr, cyshareableHandle, cyhandleType, flags) + _helper_input_void_ptr_free(&cyshareableHandleHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), memPool) @@ -30901,10 +30954,11 @@ def cudaMemPoolExportPointer(ptr): :py:obj:`~.cuMemPoolExportPointer`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolImportPointer` """ cdef cudaMemPoolPtrExportData exportData = cudaMemPoolPtrExportData() - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaMemPoolExportPointer(exportData._pvt_ptr, cyptr_ptr) + err = cyruntime.cudaMemPoolExportPointer(exportData._pvt_ptr, cyptr) + _helper_input_void_ptr_free(&cyptrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), exportData) @@ -31026,10 +31080,11 @@ def cudaPointerGetAttributes(ptr): In CUDA 11.0 forward passing host pointer will return :py:obj:`~.cudaMemoryTypeUnregistered` in :py:obj:`~.cudaPointerAttributes.type` and call will return :py:obj:`~.cudaSuccess`. """ cdef cudaPointerAttributes attributes = cudaPointerAttributes() - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaPointerGetAttributes(attributes._pvt_ptr, cyptr_ptr) + err = cyruntime.cudaPointerGetAttributes(attributes._pvt_ptr, cyptr) + _helper_input_void_ptr_free(&cyptrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), attributes) @@ -32229,11 +32284,12 @@ def cudaLogsRegisterCallback(callbackFunc, userData): else: pcallbackFunc = int(cudaLogsCallback_t(callbackFunc)) cycallbackFunc = pcallbackFunc - cyuserData = _HelperInputVoidPtr(userData) - cdef void* cyuserData_ptr = cyuserData.cptr + cdef _HelperInputVoidPtrStruct cyuserDataHelper + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cudaLogsCallbackHandle callback_out = cudaLogsCallbackHandle() with nogil: - err = cyruntime.cudaLogsRegisterCallback(cycallbackFunc, cyuserData_ptr, callback_out._pvt_ptr) + err = cyruntime.cudaLogsRegisterCallback(cycallbackFunc, cyuserData, callback_out._pvt_ptr) + _helper_input_void_ptr_free(&cyuserDataHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), callback_out) @@ -32927,15 +32983,17 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode cypDependencies[idx] = (pDependencies[idx])._pvt_ptr[0] elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaGraphAddMemcpyNode1D(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydst_ptr, cysrc_ptr, count, cykind) + err = cyruntime.cudaGraphAddMemcpyNode1D(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydst, cysrc, count, cykind) if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), pGraphNode) @@ -33071,13 +33129,15 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : else: pnode = int(cudaGraphNode_t(node)) cynode = pnode - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaGraphMemcpyNodeSetParams1D(cynode, cydst_ptr, cysrc_ptr, count, cykind) + err = cyruntime.cudaGraphMemcpyNodeSetParams1D(cynode, cydst, cysrc, count, cykind) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -34455,12 +34515,13 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_ elif len(pDependencies) == 1: cypDependencies = (pDependencies[0])._pvt_ptr if numDependencies > len(pDependencies): raise RuntimeError("List is too small: " + str(len(pDependencies)) + " < " + str(numDependencies)) - cydptr = _HelperInputVoidPtr(dptr) - cdef void* cydptr_ptr = cydptr.cptr + cdef _HelperInputVoidPtrStruct cydptrHelper + cdef void* cydptr = _helper_input_void_ptr(dptr, &cydptrHelper) with nogil: - err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr_ptr) + err = cyruntime.cudaGraphAddMemFreeNode(pGraphNode._pvt_ptr, cygraph, cypDependencies, numDependencies, cydptr) if len(pDependencies) > 1 and cypDependencies is not NULL: free(cypDependencies) + _helper_input_void_ptr_free(&cydptrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), pGraphNode) @@ -36179,13 +36240,15 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, else: phGraphExec = int(cudaGraphExec_t(hGraphExec)) cyhGraphExec = phGraphExec - cydst = _HelperInputVoidPtr(dst) - cdef void* cydst_ptr = cydst.cptr - cysrc = _HelperInputVoidPtr(src) - cdef void* cysrc_ptr = cysrc.cptr + cdef _HelperInputVoidPtrStruct cydstHelper + cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper) + cdef _HelperInputVoidPtrStruct cysrcHelper + cdef void* cysrc = _helper_input_void_ptr(src, &cysrcHelper) cdef cyruntime.cudaMemcpyKind cykind = int(kind) with nogil: - err = cyruntime.cudaGraphExecMemcpyNodeSetParams1D(cyhGraphExec, cynode, cydst_ptr, cysrc_ptr, count, cykind) + err = cyruntime.cudaGraphExecMemcpyNodeSetParams1D(cyhGraphExec, cynode, cydst, cysrc, count, cykind) + _helper_input_void_ptr_free(&cydstHelper) + _helper_input_void_ptr_free(&cysrcHelper) return (cudaError_t(err),) {{endif}} @@ -37194,10 +37257,11 @@ def cudaUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned in pdestroy = int(cudaHostFn_t(destroy)) cydestroy = pdestroy cdef cudaUserObject_t object_out = cudaUserObject_t() - cyptr = _HelperInputVoidPtr(ptr) - cdef void* cyptr_ptr = cyptr.cptr + cdef _HelperInputVoidPtrStruct cyptrHelper + cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper) with nogil: - err = cyruntime.cudaUserObjectCreate(object_out._pvt_ptr, cyptr_ptr, cydestroy, initialRefcount, flags) + err = cyruntime.cudaUserObjectCreate(object_out._pvt_ptr, cyptr, cydestroy, initialRefcount, flags) + _helper_input_void_ptr_free(&cyptrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), object_out) @@ -38002,8 +38066,8 @@ def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[ if not all(isinstance(_x, (cudaJitOption)) for _x in jitOptions): raise TypeError("Argument 'jitOptions' is not instance of type (expected tuple[cyruntime.cudaJitOption] or list[cyruntime.cudaJitOption]") cdef cudaLibrary_t library = cudaLibrary_t() - cycode = _HelperInputVoidPtr(code) - cdef void* cycode_ptr = cycode.cptr + cdef _HelperInputVoidPtrStruct cycodeHelper + cdef void* cycode = _helper_input_void_ptr(code, &cycodeHelper) cdef vector[cyruntime.cudaJitOption] cyjitOptions = [int(pyjitOptions) for pyjitOptions in (jitOptions)] pylist = [_HelperCudaJitOption(pyoptions, pyoptionValues) for pyoptions, pyoptionValues in zip(jitOptions, jitOptionsValues)] cdef _InputVoidPtrPtrHelper voidStarHelperjitOptionsValues = _InputVoidPtrPtrHelper(pylist) @@ -38017,7 +38081,8 @@ def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[ if numLibraryOptions > len(libraryOptions): raise RuntimeError("List is too small: " + str(len(libraryOptions)) + " < " + str(numLibraryOptions)) if numLibraryOptions > len(libraryOptionValues): raise RuntimeError("List is too small: " + str(len(libraryOptionValues)) + " < " + str(numLibraryOptions)) with nogil: - err = cyruntime.cudaLibraryLoadData(library._pvt_ptr, cycode_ptr, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) + err = cyruntime.cudaLibraryLoadData(library._pvt_ptr, cycode, cyjitOptions.data(), cyjitOptionsValues_ptr, numJitOptions, cylibraryOptions.data(), cylibraryOptionValues_ptr, numLibraryOptions) + _helper_input_void_ptr_free(&cycodeHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), library) @@ -39560,10 +39625,11 @@ def cudaGetKernel(entryFuncAddr): cudaGetKernel (C++ API) """ cdef cudaKernel_t kernelPtr = cudaKernel_t() - cyentryFuncAddr = _HelperInputVoidPtr(entryFuncAddr) - cdef void* cyentryFuncAddr_ptr = cyentryFuncAddr.cptr + cdef _HelperInputVoidPtrStruct cyentryFuncAddrHelper + cdef void* cyentryFuncAddr = _helper_input_void_ptr(entryFuncAddr, &cyentryFuncAddrHelper) with nogil: - err = cyruntime.cudaGetKernel(kernelPtr._pvt_ptr, cyentryFuncAddr_ptr) + err = cyruntime.cudaGetKernel(kernelPtr._pvt_ptr, cyentryFuncAddr) + _helper_input_void_ptr_free(&cyentryFuncAddrHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) return (cudaError_t(err), kernelPtr) @@ -39600,10 +39666,11 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz): -------- make_cudaExtent, make_cudaPos """ - cyd = _HelperInputVoidPtr(d) - cdef void* cyd_ptr = cyd.cptr + cdef _HelperInputVoidPtrStruct cydHelper + cdef void* cyd = _helper_input_void_ptr(d, &cydHelper) with nogil: - err = cyruntime.make_cudaPitchedPtr(cyd_ptr, p, xsz, ysz) + err = cyruntime.make_cudaPitchedPtr(cyd, p, xsz, ysz) + _helper_input_void_ptr_free(&cydHelper) cdef cudaPitchedPtr wrapper = cudaPitchedPtr() wrapper._pvt_ptr[0] = err return wrapper From 2350e36f37b210e4a49c31d1d9ec9ef08a0a0c83 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 12 Feb 2026 14:11:50 -0500 Subject: [PATCH 2/3] Restore cptr property --- cuda_bindings/cuda/bindings/_lib/utils.pxi.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in index 14da64f5f4..a6bd249604 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in @@ -148,6 +148,10 @@ cdef class _HelperInputVoidPtr: if self._pyobj_acquired is True: PyBuffer_Release(&self._pybuffer) + @property + def cptr(self): + return self._cptr + cdef void * _helper_input_void_ptr(ptr, _HelperInputVoidPtrStruct *helper): helper[0]._pybuffer.buf = NULL From a992e5964b22f05b0eac46be9d11dacb2a4fd1e8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 12 Feb 2026 14:18:53 -0500 Subject: [PATCH 3/3] Fixes in runtime.pyx.in --- cuda_bindings/cuda/bindings/runtime.pyx.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index f9b388b9dd..9f19b0c4fc 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -21439,7 +21439,7 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): pcallbackFunc = int(cudaAsyncCallback(callbackFunc)) cycallbackFunc = pcallbackFunc cdef _HelperInputVoidPtrStruct cyuserDataHelper - cdef void* cyuserData = _helper_input_void_ptr(userData, &userDataHelper) + cdef void* cyuserData = _helper_input_void_ptr(userData, &cyuserDataHelper) cdef cudaAsyncCallbackData *cbData = NULL cbData = malloc(sizeof(cbData[0])) @@ -21455,7 +21455,7 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData): free(cbData) else: m_global._allocated[int(callback)] = cbData - _helper_input_void_ptr_free(&userDataHelper) + _helper_input_void_ptr_free(&cyuserDataHelper) if err != cyruntime.cudaSuccess: return (cudaError_t(err), None) @@ -23204,7 +23204,7 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags): err = cyruntime.cudaStreamAddCallback(cystream, cudaStreamRtCallbackWrapper, cbData, flags) if err != cyruntime.cudaSuccess: free(cbData) - _helper_input_void_ptr_free(&userDataHelper) + _helper_input_void_ptr_free(&cyuserDataHelper) return (cudaError_t(err),) {{endif}} @@ -25452,7 +25452,7 @@ def cudaLaunchHostFunc(stream, fn, userData): err = cyruntime.cudaLaunchHostFunc(cystream, cudaStreamRtHostCallbackWrapper, cbData) if err != cyruntime.cudaSuccess: free(cbData) - _helper_input_void_ptr_free(&userDataHelper) + _helper_input_void_ptr_free(&cyuserDataHelper) return (cudaError_t(err),) {{endif}}