diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 27e76b817a..739b11f4d9 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -56,12 +56,18 @@ decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr; decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr; decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; +// Linker +decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr; + // NVRTC function pointers decltype(&nvrtcDestroyProgram) p_nvrtcDestroyProgram = nullptr; // NVVM function pointers (may be null if NVVM is not available) NvvmDestroyProgramFn p_nvvmDestroyProgram = nullptr; +// nvJitLink function pointers (may be null if nvJitLink is not available) +NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr; + // ============================================================================ // GIL management helpers // ============================================================================ @@ -805,19 +811,19 @@ NvrtcProgramHandle create_nvrtc_program_handle_ref(nvrtcProgram prog) { namespace { struct NvvmProgramBox { - nvvmProgram resource; + NvvmProgramValue resource; }; } // namespace NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog) { auto box = std::shared_ptr( - new NvvmProgramBox{prog}, + new NvvmProgramBox{{prog}}, [](NvvmProgramBox* b) { // Note: nvvmDestroyProgram takes nvvmProgram* and nulls it, // but we're deleting the box anyway so nulling is harmless. // If NVVM is not available, the function pointer is null. if (p_nvvmDestroyProgram) { - p_nvvmDestroyProgram(&b->resource); + p_nvvmDestroyProgram(&b->resource.raw); } delete b; } @@ -826,8 +832,69 @@ NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog) { } NvvmProgramHandle create_nvvm_program_handle_ref(nvvmProgram prog) { - auto box = std::make_shared(NvvmProgramBox{prog}); + auto box = std::make_shared(NvvmProgramBox{{prog}}); return NvvmProgramHandle(box, &box->resource); } +// ============================================================================ +// nvJitLink Handles +// ============================================================================ + +namespace { +struct NvJitLinkBox { + NvJitLinkValue resource; +}; +} // namespace + +NvJitLinkHandle create_nvjitlink_handle(nvJitLink_t handle) { + auto box = std::shared_ptr( + new NvJitLinkBox{{handle}}, + [](NvJitLinkBox* b) { + // Note: nvJitLinkDestroy takes nvJitLinkHandle* and nulls it, + // but we're deleting the box anyway so nulling is harmless. + // If nvJitLink is not available, the function pointer is null. + if (p_nvJitLinkDestroy) { + p_nvJitLinkDestroy(&b->resource.raw); + } + delete b; + } + ); + return NvJitLinkHandle(box, &box->resource); +} + +NvJitLinkHandle create_nvjitlink_handle_ref(nvJitLink_t handle) { + auto box = std::make_shared(NvJitLinkBox{{handle}}); + return NvJitLinkHandle(box, &box->resource); +} + +// ============================================================================ +// cuLink Handles +// ============================================================================ + +namespace { +struct CuLinkBox { + CUlinkState resource; +}; +} // namespace + +CuLinkHandle create_culink_handle(CUlinkState state) { + auto box = std::shared_ptr( + new CuLinkBox{state}, + [](CuLinkBox* b) { + // cuLinkDestroy takes CUlinkState by value (not pointer). + // Errors are ignored (standard destructor practice). + if (p_cuLinkDestroy) { + p_cuLinkDestroy(b->resource); + } + delete b; + } + ); + return CuLinkHandle(box, &box->resource); +} + +CuLinkHandle create_culink_handle_ref(CUlinkState state) { + auto box = std::make_shared(CuLinkBox{state}); + return CuLinkHandle(box, &box->resource); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index cb66841172..1576476dc4 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -14,8 +14,28 @@ // Use void* to match cuda.bindings.cynvvm's typedef using nvvmProgram = void*; +// Forward declaration for nvJitLink - avoids nvJitLink.h dependency +// Use void* to match cuda.bindings.cynvjitlink's typedef +using nvJitLink_t = void*; + namespace cuda_core { +// ============================================================================ +// TaggedHandle - make void*-based handle types distinct for overloading +// +// Both nvvmProgram and nvJitLink_t are void*, so shared_ptr +// would be the same C++ type for both. TaggedHandle wraps the raw +// value with a unique tag type, making each shared_ptr type distinct. +// ============================================================================ + +template +struct TaggedHandle { + T raw; +}; + +using NvvmProgramValue = TaggedHandle; +using NvJitLinkValue = TaggedHandle; + // ============================================================================ // Thread-local error handling // ============================================================================ @@ -72,6 +92,9 @@ extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData; extern decltype(&cuLibraryUnload) p_cuLibraryUnload; extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; +// Linker +extern decltype(&cuLinkDestroy) p_cuLinkDestroy; + // ============================================================================ // NVRTC function pointers // @@ -94,6 +117,19 @@ extern decltype(&nvrtcDestroyProgram) p_nvrtcDestroyProgram; using NvvmDestroyProgramFn = int (*)(nvvmProgram*); extern NvvmDestroyProgramFn p_nvvmDestroyProgram; +// ============================================================================ +// nvJitLink function pointers +// +// These are populated by _resource_handles.pyx at module import time using +// function pointers extracted from cuda.bindings.cynvjitlink.__pyx_capi__. +// Note: May be null if nvJitLink is not available at runtime. +// ============================================================================ + +// Function pointer type for nvJitLinkDestroy (avoids nvJitLink.h dependency) +// Signature: nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle *handle) +using NvJitLinkDestroyFn = int (*)(nvJitLink_t*); +extern NvJitLinkDestroyFn p_nvJitLinkDestroy; + // ============================================================================ // Handle type aliases - expose only the raw CUDA resource // ============================================================================ @@ -105,7 +141,9 @@ using MemoryPoolHandle = std::shared_ptr; using LibraryHandle = std::shared_ptr; using KernelHandle = std::shared_ptr; using NvrtcProgramHandle = std::shared_ptr; -using NvvmProgramHandle = std::shared_ptr; +using NvvmProgramHandle = std::shared_ptr; +using NvJitLinkHandle = std::shared_ptr; +using CuLinkHandle = std::shared_ptr; // ============================================================================ // Context handle functions @@ -316,6 +354,33 @@ NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog); // The program will NOT be destroyed when the handle is released. NvvmProgramHandle create_nvvm_program_handle_ref(nvvmProgram prog); +// ============================================================================ +// nvJitLink handle functions +// ============================================================================ + +// Create an owning nvJitLink handle. +// When the last reference is released, nvJitLinkDestroy is called. +// Use this to wrap a handle created via nvJitLinkCreate. +// Note: If nvJitLink is not available (p_nvJitLinkDestroy is null), the deleter is a no-op. +NvJitLinkHandle create_nvjitlink_handle(nvJitLink_t handle); + +// Create a non-owning nvJitLink handle (references existing handle). +// The handle will NOT be destroyed when the last reference is released. +NvJitLinkHandle create_nvjitlink_handle_ref(nvJitLink_t handle); + +// ============================================================================ +// cuLink handle functions +// ============================================================================ + +// Create an owning cuLink handle. +// When the last reference is released, cuLinkDestroy is called. +// Use this to wrap a CUlinkState created via cuLinkCreate. +CuLinkHandle create_culink_handle(CUlinkState state); + +// Create a non-owning cuLink handle (references existing CUlinkState). +// The handle will NOT be destroyed when the last reference is released. +CuLinkHandle create_culink_handle_ref(CUlinkState state); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -354,6 +419,14 @@ inline nvrtcProgram as_cu(const NvrtcProgramHandle& h) noexcept { } inline nvvmProgram as_cu(const NvvmProgramHandle& h) noexcept { + return h ? h->raw : nullptr; +} + +inline nvJitLink_t as_cu(const NvJitLinkHandle& h) noexcept { + return h ? h->raw : nullptr; +} + +inline CUlinkState as_cu(const CuLinkHandle& h) noexcept { return h ? *h : nullptr; } @@ -395,6 +468,14 @@ inline std::intptr_t as_intptr(const NvvmProgramHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } +inline std::intptr_t as_intptr(const NvJitLinkHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + +inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + // as_py() - convert handle to Python wrapper object (returns new reference) namespace detail { // n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md @@ -447,4 +528,13 @@ inline PyObject* as_py(const NvvmProgramHandle& h) noexcept { return PyLong_FromSsize_t(as_intptr(h)); } +inline PyObject* as_py(const NvJitLinkHandle& h) noexcept { + // nvJitLink bindings use raw integers, not wrapper classes + return PyLong_FromSsize_t(as_intptr(h)); +} + +inline PyObject* as_py(const CuLinkHandle& h) noexcept { + return detail::make_py("cuda.bindings.driver", "CUlinkState", as_intptr(h)); +} + } // namespace cuda_core diff --git a/cuda_core/cuda/core/_linker.pxd b/cuda_core/cuda/core/_linker.pxd new file mode 100644 index 0000000000..e50ebb9770 --- /dev/null +++ b/cuda_core/cuda/core/_linker.pxd @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from ._resource_handles cimport NvJitLinkHandle, CuLinkHandle + + +cdef class Linker: + cdef: + NvJitLinkHandle _nvjitlink_handle + CuLinkHandle _culink_handle + bint _use_nvjitlink + object _drv_log_bufs # formatted_options list (driver); None for nvjitlink; cleared in link() + str _info_log # decoded log; None until link() or pre-link get_*_log() + str _error_log # decoded log; None until link() or pre-link get_*_log() + object _options # LinkerOptions + object __weakref__ diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.pyx similarity index 52% rename from cuda_core/cuda/core/_linker.py rename to cuda_core/cuda/core/_linker.pyx index 6490e87b07..316c46178a 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.pyx @@ -1,114 +1,187 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 +"""Linking machinery for combining object codes. + +This module provides :class:`Linker` for linking one or more +:class:`~cuda.core.ObjectCode` objects, with :class:`LinkerOptions` for +configuration. +""" from __future__ import annotations -import ctypes +from cpython.bytearray cimport PyByteArray_AS_STRING +from libc.stdint cimport intptr_t, uint32_t +from libcpp.vector cimport vector +from cuda.bindings cimport cydriver +from cuda.bindings cimport cynvjitlink + +from ._resource_handles cimport ( + as_cu, + as_py, + create_culink_handle, + create_nvjitlink_handle, +) +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, HANDLE_RETURN_NVJITLINK + import sys -import weakref -from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Union +from typing import Union from warnings import warn -if TYPE_CHECKING: - import cuda.bindings - from cuda.core._device import Device from cuda.core._module import ObjectCode from cuda.core._utils.clear_error_support import assert_type -from cuda.core._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence +from cuda.core._utils.cuda_utils import ( + CUDAError, + check_or_create_options, + driver, + handle_return, + is_sequence, +) -# TODO: revisit this treatment for py313t builds -_driver = None # populated if nvJitLink cannot be used -_driver_input_types = None # populated if nvJitLink cannot be used -_driver_ver = None -_inited = False -_nvjitlink = None # populated if nvJitLink can be used -_nvjitlink_input_types = None # populated if nvJitLink cannot be used +ctypedef const char* const_char_ptr +ctypedef void* void_ptr +__all__ = ["Linker", "LinkerOptions"] -def _nvjitlink_has_version_symbol(inner_nvjitlink) -> bool: - # This condition is equivalent to testing for version >= 12.3 - return bool(inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion")) +LinkerHandleT = Union["cuda.bindings.nvjitlink.nvJitLinkHandle", "cuda.bindings.driver.CUlinkState"] -# Note: this function is reused in the tests -def _decide_nvjitlink_or_driver() -> bool: - """Returns True if falling back to the cuLink* driver APIs.""" - global _driver_ver, _driver, _nvjitlink - if _driver or _nvjitlink: - return _driver is not None +# ============================================================================= +# Principal class +# ============================================================================= - _driver_ver = handle_return(driver.cuDriverGetVersion()) - _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) +cdef class Linker: + """Represent a linking machinery to link one or more object codes into + :class:`~cuda.core.ObjectCode`. - warn_txt_common = ( - "the driver APIs will be used instead, which do not support" - " minor version compatibility or linking LTO IRs." - " For best results, consider upgrading to a recent version of" - ) + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from the CUDA driver). - try: - import cuda.bindings.nvjitlink as _nvjitlink - except ModuleNotFoundError: - warn_txt = f"cuda.bindings.nvjitlink is not available, therefore {warn_txt_common} cuda-bindings." - else: - from cuda.bindings._internal import nvjitlink as inner_nvjitlink + Parameters + ---------- + object_codes : :class:`~cuda.core.ObjectCode` + One or more ObjectCode objects to be linked. + options : :class:`LinkerOptions`, optional + Options for the linker. If not provided, default options will be used. + """ - try: - if _nvjitlink_has_version_symbol(inner_nvjitlink): - return False # Use nvjitlink - except RuntimeError: - warn_detail = "not available" + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): + Linker_init(self, object_codes, options) + + def link(self, target_type) -> ObjectCode: + """Link the provided object codes into a single output of the specified target type. + + Parameters + ---------- + target_type : str + The type of the target output. Must be either "cubin" or "ptx". + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The linked object code of the specified target type. + + .. note:: + + Ensure that input object codes were compiled with appropriate + flags for linking (e.g., relocatable device code enabled). + """ + return Linker_link(self, target_type) + + def get_error_log(self) -> str: + """Get the error log generated by the linker. + + Returns + ------- + str + The error log. + """ + # After link(), the decoded log is cached here. + if self._error_log is not None: + return self._error_log + cdef cynvjitlink.nvJitLinkHandle c_h + cdef size_t c_log_size = 0 + cdef char* c_log_ptr + if self._use_nvjitlink: + c_h = as_cu(self._nvjitlink_handle) + cynvjitlink.nvJitLinkGetErrorLogSize(c_h, &c_log_size) + log = bytearray(c_log_size) + if c_log_size > 0: + c_log_ptr = (log) + cynvjitlink.nvJitLinkGetErrorLog(c_h, c_log_ptr) + return log.decode("utf-8", errors="backslashreplace") else: - warn_detail = "too old (<12.3)" - warn_txt = ( - f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is {warn_detail}." - f" Therefore cuda.bindings.nvjitlink is not usable and {warn_txt_common} nvJitLink." - ) - _nvjitlink = None + return (self._drv_log_bufs[2]).decode( + "utf-8", errors="backslashreplace").rstrip('\x00') - warn(warn_txt, stacklevel=2, category=RuntimeWarning) - _driver = driver - return True + def get_info_log(self) -> str: + """Get the info log generated by the linker. + Returns + ------- + str + The info log. + """ + # After link(), the decoded log is cached here. + if self._info_log is not None: + return self._info_log + cdef cynvjitlink.nvJitLinkHandle c_h + cdef size_t c_log_size = 0 + cdef char* c_log_ptr + if self._use_nvjitlink: + c_h = as_cu(self._nvjitlink_handle) + cynvjitlink.nvJitLinkGetInfoLogSize(c_h, &c_log_size) + log = bytearray(c_log_size) + if c_log_size > 0: + c_log_ptr = (log) + cynvjitlink.nvJitLinkGetInfoLog(c_h, c_log_ptr) + return log.decode("utf-8", errors="backslashreplace") + else: + return (self._drv_log_bufs[0]).decode( + "utf-8", errors="backslashreplace").rstrip('\x00') -def _lazy_init(): - global _inited, _nvjitlink_input_types, _driver_input_types - if _inited: - return + def close(self): + """Destroy this linker.""" + if self._use_nvjitlink: + self._nvjitlink_handle.reset() + else: + self._culink_handle.reset() - _decide_nvjitlink_or_driver() - if _nvjitlink: - if _driver_ver > _nvjitlink.version(): - # TODO: nvJitLink is not new enough, warn? - pass - _nvjitlink_input_types = { - "ptx": _nvjitlink.InputType.PTX, - "cubin": _nvjitlink.InputType.CUBIN, - "fatbin": _nvjitlink.InputType.FATBIN, - "ltoir": _nvjitlink.InputType.LTOIR, - "object": _nvjitlink.InputType.OBJECT, - "library": _nvjitlink.InputType.LIBRARY, - } - else: - _driver_input_types = { - "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX, - "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN, - "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY, - "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT, - "library": _driver.CUjitInputType.CU_JIT_INPUT_LIBRARY, - } - _inited = True + @property + def handle(self) -> LinkerHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Linker.handle)``. + """ + if self._use_nvjitlink: + return as_py(self._nvjitlink_handle) + else: + return as_py(self._culink_handle) + + @property + def backend(self) -> str: + """Return this Linker instance's underlying backend.""" + return "nvJitLink" if self._use_nvjitlink else "driver" + + +# ============================================================================= +# Supporting classes +# ============================================================================= @dataclass class LinkerOptions: - """Customizable :obj:`Linker` options. + """Customizable options for configuring :class:`Linker`. - Since the linker would choose to use nvJitLink or the driver APIs as the linking backed, + Since the linker may choose to use nvJitLink or the driver APIs as the linking backend, not all options are applicable. When the system's installed nvJitLink is too old (<12.3), or not installed, the driver APIs (cuLink) will be used instead. @@ -154,14 +227,14 @@ class LinkerOptions: fma : bool, optional Use fast multiply-add. Default: True. - kernels_used : [Union[str, tuple[str], list[str]]], optional + kernels_used : [str | tuple[str] | list[str]], optional Pass a kernel or sequence of kernels that are used; any not in the list can be removed. - variables_used : [Union[str, tuple[str], list[str]]], optional + variables_used : [str | tuple[str] | list[str]], optional Pass a variable or sequence of variables that are used; any not in the list can be removed. optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. Default: False. - ptxas_options : [Union[str, tuple[str], list[str]]], optional + ptxas_options : [str | tuple[str] | list[str]], optional Pass options to PTXAS. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split @@ -357,241 +430,271 @@ def as_bytes(self, backend: str = "nvjitlink") -> list[bytes]: backend = backend.lower() if backend != "nvjitlink": raise ValueError(f"as_bytes() only supports 'nvjitlink' backend, got '{backend}'") - if not _nvjitlink: + if not _use_nvjitlink_backend: raise RuntimeError("nvJitLink backend is not available") return self._prepare_nvjitlink_options(as_bytes=True) -# This needs to be a free function not a method, as it's disallowed by contextmanager. -@contextmanager -def _exception_manager(self): - """ - A helper function to improve the error message of exceptions raised by the linker backend. - """ - try: - yield - except Exception as e: - error_log = "" - if hasattr(self, "_mnff"): - # our constructor could raise, in which case there's no handle available - error_log = self.get_error_log() - # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but - # unfortunately we are still supporting Python 3.10... - # Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0]. - e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:]) - raise e - - -nvJitLinkHandleT = int -LinkerHandleT = Union[nvJitLinkHandleT, "cuda.bindings.driver.CUlinkState"] - - -class Linker: - """Represent a linking machinery to link one or multiple object codes into - :obj:`~cuda.core._module.ObjectCode` with the specified options. - - This object provides a unified interface to multiple underlying - linker libraries (such as nvJitLink or cuLink* from CUDA driver). - - Parameters - ---------- - object_codes : ObjectCode - One or more ObjectCode objects to be linked. - options : LinkerOptions, optional - Options for the linker. If not provided, default options will be used. - """ - - class _MembersNeededForFinalize: - __slots__ = ("handle", "use_nvjitlink", "const_char_keep_alive", "formatted_options", "option_keys") - - def __init__(self, program_obj, handle, use_nvjitlink): - self.handle = handle - self.use_nvjitlink = use_nvjitlink - self.const_char_keep_alive = [] - weakref.finalize(program_obj, self.close) - - def close(self): - if self.handle is not None: - if self.use_nvjitlink: - _nvjitlink.destroy(self.handle) - else: - handle_return(_driver.cuLinkDestroy(self.handle)) - self.handle = None - - __slots__ = ("__weakref__", "_mnff", "_options") - - def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): - if len(object_codes) == 0: - raise ValueError("At least one ObjectCode object must be provided") - - self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") - with _exception_manager(self): - if _nvjitlink: - formatted_options = options._prepare_nvjitlink_options(as_bytes=False) - handle = _nvjitlink.create(len(formatted_options), formatted_options) - use_nvjitlink = True +# ============================================================================= +# Private implementation: cdef inline helpers +# ============================================================================= + +cdef inline int Linker_init(Linker self, tuple object_codes, object options) except -1: + """Initialize a Linker instance.""" + if len(object_codes) == 0: + raise ValueError("At least one ObjectCode object must be provided") + + cdef cynvjitlink.nvJitLinkHandle c_raw_nvjitlink + cdef cydriver.CUlinkState c_raw_culink + cdef Py_ssize_t c_num_opts, i + cdef vector[const_char_ptr] c_str_opts + cdef vector[cydriver.CUjit_option] c_jit_keys + cdef vector[void_ptr] c_jit_values + + self._options = options = check_or_create_options(LinkerOptions, options, "Linker options") + + if _use_nvjitlink_backend: + self._use_nvjitlink = True + options_bytes = options._prepare_nvjitlink_options(as_bytes=True) + c_num_opts = len(options_bytes) + c_str_opts.resize(c_num_opts) + for i in range(c_num_opts): + c_str_opts[i] = (options_bytes[i]) + with nogil: + HANDLE_RETURN_NVJITLINK(NULL, cynvjitlink.nvJitLinkCreate( + &c_raw_nvjitlink, c_num_opts, c_str_opts.data())) + self._nvjitlink_handle = create_nvjitlink_handle(c_raw_nvjitlink) + else: + self._use_nvjitlink = False + formatted_options, option_keys = options._prepare_driver_options() + # Keep the formatted_options list alive: it contains bytearrays that + # the driver writes into via raw pointers during linking operations. + self._drv_log_bufs = formatted_options + c_num_opts = len(option_keys) + c_jit_keys.resize(c_num_opts) + c_jit_values.resize(c_num_opts) + for i in range(c_num_opts): + c_jit_keys[i] = option_keys[i] + val = formatted_options[i] + if isinstance(val, bytearray): + c_jit_values[i] = PyByteArray_AS_STRING(val) else: - formatted_options, option_keys = options._prepare_driver_options() - handle = handle_return(_driver.cuLinkCreate(len(formatted_options), option_keys, formatted_options)) - use_nvjitlink = False - self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink) - self._mnff.formatted_options = formatted_options # Store for log access - if not _nvjitlink: - self._mnff.option_keys = option_keys - - for code in object_codes: - assert_type(code, ObjectCode) - self._add_code_object(code) - - def _add_code_object(self, object_code: ObjectCode): - data = object_code.code - with _exception_manager(self): - name_str = f"{object_code.name}" - if _nvjitlink and isinstance(data, bytes): - _nvjitlink.add_data( - self._mnff.handle, - self._input_type_from_code_type(object_code.code_type), - data, - len(data), - name_str, - ) - elif _nvjitlink and isinstance(data, str): - _nvjitlink.add_file( - self._mnff.handle, - self._input_type_from_code_type(object_code.code_type), - data, - ) - elif (not _nvjitlink) and isinstance(data, bytes): - name_bytes = name_str.encode() - handle_return( - _driver.cuLinkAddData( - self._mnff.handle, - self._input_type_from_code_type(object_code.code_type), - data, - len(data), - name_bytes, - 0, - None, - None, - ) - ) - self._mnff.const_char_keep_alive.append(name_bytes) - elif (not _nvjitlink) and isinstance(data, str): - name_bytes = name_str.encode() - handle_return( - _driver.cuLinkAddFile( - self._mnff.handle, - self._input_type_from_code_type(object_code.code_type), - data.encode(), - 0, - None, - None, - ) - ) - self._mnff.const_char_keep_alive.append(name_bytes) + c_jit_values[i] = int(val) + try: + with nogil: + HANDLE_RETURN(cydriver.cuLinkCreate( + c_num_opts, c_jit_keys.data(), c_jit_values.data(), &c_raw_culink)) + except CUDAError as e: + Linker_annotate_error_log(self, e) + raise + self._culink_handle = create_culink_handle(c_raw_culink) + + for code in object_codes: + assert_type(code, ObjectCode) + Linker_add_code_object(self, code) + return 0 + + +cdef inline void Linker_add_code_object(Linker self, object object_code) except *: + """Add a single ObjectCode to the linker.""" + data = object_code.code + cdef cynvjitlink.nvJitLinkHandle c_nvjitlink_h + cdef cydriver.CUlinkState c_culink_state + cdef cynvjitlink.nvJitLinkInputType c_nv_input_type + cdef cydriver.CUjitInputType c_drv_input_type + cdef const char* c_data_ptr + cdef size_t c_data_size + cdef const char* c_name_ptr + cdef const char* c_file_ptr + + name_bytes = f"{object_code.name}".encode() + c_name_ptr = name_bytes + + input_types = _nvjitlink_input_types if self._use_nvjitlink else _driver_input_types + py_input_type = input_types.get(object_code.code_type) + if py_input_type is None: + raise ValueError(f"Unknown code_type associated with ObjectCode: {object_code.code_type}") + + if self._use_nvjitlink: + c_nvjitlink_h = as_cu(self._nvjitlink_handle) + c_nv_input_type = py_input_type + if isinstance(data, bytes): + c_data_ptr = (data) + c_data_size = len(data) + with nogil: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkAddData( + c_nvjitlink_h, c_nv_input_type, c_data_ptr, c_data_size, c_name_ptr)) + elif isinstance(data, str): + file_bytes = data.encode() + c_file_ptr = file_bytes + with nogil: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkAddFile( + c_nvjitlink_h, c_nv_input_type, c_file_ptr)) + else: + raise TypeError(f"Expected bytes or str, but got {type(data).__name__}") + else: + c_culink_state = as_cu(self._culink_handle) + c_drv_input_type = py_input_type + try: + if isinstance(data, bytes): + c_data_ptr = (data) + c_data_size = len(data) + with nogil: + HANDLE_RETURN(cydriver.cuLinkAddData( + c_culink_state, c_drv_input_type, c_data_ptr, c_data_size, c_name_ptr, + 0, NULL, NULL)) + elif isinstance(data, str): + file_bytes = data.encode() + c_file_ptr = file_bytes + with nogil: + HANDLE_RETURN(cydriver.cuLinkAddFile( + c_culink_state, c_drv_input_type, c_file_ptr, 0, NULL, NULL)) else: raise TypeError(f"Expected bytes or str, but got {type(data).__name__}") + except CUDAError as e: + Linker_annotate_error_log(self, e) + raise + + +cdef inline object Linker_link(Linker self, str target_type): + """Complete linking and return the result as ObjectCode.""" + if target_type not in ("cubin", "ptx"): + raise ValueError(f"Unsupported target type: {target_type}") + + cdef cynvjitlink.nvJitLinkHandle c_nvjitlink_h + cdef cydriver.CUlinkState c_culink_state + cdef size_t c_output_size = 0 + cdef char* c_code_ptr + cdef void* c_cubin_out = NULL + + if self._use_nvjitlink: + c_nvjitlink_h = as_cu(self._nvjitlink_handle) + with nogil: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkComplete(c_nvjitlink_h)) + if target_type == "cubin": + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, + cynvjitlink.nvJitLinkGetLinkedCubinSize(c_nvjitlink_h, &c_output_size)) + code = bytearray(c_output_size) + c_code_ptr = (code) + with nogil: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, + cynvjitlink.nvJitLinkGetLinkedCubin(c_nvjitlink_h, c_code_ptr)) + else: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, + cynvjitlink.nvJitLinkGetLinkedPtxSize(c_nvjitlink_h, &c_output_size)) + code = bytearray(c_output_size) + c_code_ptr = (code) + with nogil: + HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, + cynvjitlink.nvJitLinkGetLinkedPtx(c_nvjitlink_h, c_code_ptr)) + else: + c_culink_state = as_cu(self._culink_handle) + try: + with nogil: + HANDLE_RETURN(cydriver.cuLinkComplete(c_culink_state, &c_cubin_out, &c_output_size)) + except CUDAError as e: + Linker_annotate_error_log(self, e) + raise + code = (c_cubin_out)[:c_output_size] - def link(self, target_type) -> ObjectCode: - """ - Links the provided object codes into a single output of the specified target type. + # Linking is complete; cache the decoded log strings and release + # the driver's raw bytearray buffers (no longer written to). + self._info_log = self.get_info_log() + self._error_log = self.get_error_log() + self._drv_log_bufs = None - Parameters - ---------- - target_type : str - The type of the target output. Must be either "cubin" or "ptx". + return ObjectCode._init(bytes(code), target_type, name=self._options.name) - Returns - ------- - ObjectCode - The linked object code of the specified target type. - Note - ------ - See nvrtc compiler options documnetation to ensure the input object codes are - correctly compiled for linking. - """ - if target_type not in ("cubin", "ptx"): - raise ValueError(f"Unsupported target type: {target_type}") - with _exception_manager(self): - if _nvjitlink: - _nvjitlink.complete(self._mnff.handle) - if target_type == "cubin": - get_size = _nvjitlink.get_linked_cubin_size - get_code = _nvjitlink.get_linked_cubin - else: - get_size = _nvjitlink.get_linked_ptx_size - get_code = _nvjitlink.get_linked_ptx - size = get_size(self._mnff.handle) - code = bytearray(size) - get_code(self._mnff.handle, code) - else: - addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) - code = (ctypes.c_char * size).from_address(addr) +cdef inline void Linker_annotate_error_log(Linker self, object e): + """Annotate a CUDAError with the driver linker error log.""" + error_log = self.get_error_log() + if error_log: + e.args = (e.args[0] + f"\nLinker error log: {error_log}", *e.args[1:]) - return ObjectCode._init(bytes(code), target_type, name=self._options.name) - def get_error_log(self) -> str: - """Get the error log generated by the linker. +# ============================================================================= +# Private implementation: module-level state and initialization +# ============================================================================= - Returns - ------- - str - The error log. - """ - if _nvjitlink: - log_size = _nvjitlink.get_error_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_error_log(self._mnff.handle, log) - else: - log = self._mnff.formatted_options[2] - return log.decode("utf-8", errors="backslashreplace") +# TODO: revisit this treatment for py313t builds +_driver = None # populated if nvJitLink cannot be used +_driver_ver = None +_inited = False +_use_nvjitlink_backend = False # set by _decide_nvjitlink_or_driver() - def get_info_log(self) -> str: - """Get the info log generated by the linker. +# Input type mappings populated by _lazy_init() with C-level enum ints. +_nvjitlink_input_types = None +_driver_input_types = None - Returns - ------- - str - The info log. - """ - if _nvjitlink: - log_size = _nvjitlink.get_info_log_size(self._mnff.handle) - log = bytearray(log_size) - _nvjitlink.get_info_log(self._mnff.handle, log) - else: - log = self._mnff.formatted_options[0] - return log.decode("utf-8", errors="backslashreplace") - def _input_type_from_code_type(self, code_type: str): - # this list is based on the supported values for code_type in the ObjectCode class definition. - # nvJitLink/driver support other options for input type - input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type) +def _nvjitlink_has_version_symbol(nvjitlink) -> bool: + # This condition is equivalent to testing for version >= 12.3 + return bool(nvjitlink._inspect_function_pointer("__nvJitLinkVersion")) - if input_type is None: - raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}") - return input_type - @property - def handle(self) -> LinkerHandleT: - """Return the underlying handle object. +# Note: this function is reused in the tests +def _decide_nvjitlink_or_driver() -> bool: + """Return True if falling back to the cuLink* driver APIs.""" + global _driver_ver, _driver, _use_nvjitlink_backend + if _driver_ver is not None: + return not _use_nvjitlink_backend - .. note:: + _driver_ver = handle_return(driver.cuDriverGetVersion()) + _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) - The type of the returned object depends on the backend. + warn_txt_common = ( + "the driver APIs will be used instead, which do not support" + " minor version compatibility or linking LTO IRs." + " For best results, consider upgrading to a recent version of" + ) - .. caution:: + try: + __import__("cuda.bindings.nvjitlink") # availability check + except ModuleNotFoundError: + warn_txt = f"cuda.bindings.nvjitlink is not available, therefore {warn_txt_common} cuda-bindings." + else: + from cuda.bindings._internal import nvjitlink - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Linker.handle)``. - """ - return self._mnff.handle + try: + if _nvjitlink_has_version_symbol(nvjitlink): + _use_nvjitlink_backend = True + return False # Use nvjitlink + except RuntimeError: + warn_detail = "not available" + else: + warn_detail = "too old (<12.3)" + warn_txt = ( + f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is {warn_detail}." + f" Therefore cuda.bindings.nvjitlink is not usable and {warn_txt_common} nvJitLink." + ) - @property - def backend(self) -> str: - """Return this Linker instance's underlying backend.""" - return "nvJitLink" if self._mnff.use_nvjitlink else "driver" + warn(warn_txt, stacklevel=2, category=RuntimeWarning) + _driver = driver + return True - def close(self): - """Destroy this linker.""" - self._mnff.close() + +def _lazy_init(): + global _inited, _nvjitlink_input_types, _driver_input_types + if _inited: + return + + _decide_nvjitlink_or_driver() + if _use_nvjitlink_backend: + _nvjitlink_input_types = { + "ptx": cynvjitlink.NVJITLINK_INPUT_PTX, + "cubin": cynvjitlink.NVJITLINK_INPUT_CUBIN, + "fatbin": cynvjitlink.NVJITLINK_INPUT_FATBIN, + "ltoir": cynvjitlink.NVJITLINK_INPUT_LTOIR, + "object": cynvjitlink.NVJITLINK_INPUT_OBJECT, + "library": cynvjitlink.NVJITLINK_INPUT_LIBRARY, + } + else: + _driver_input_types = { + "ptx": cydriver.CU_JIT_INPUT_PTX, + "cubin": cydriver.CU_JIT_INPUT_CUBIN, + "fatbin": cydriver.CU_JIT_INPUT_FATBINARY, + "object": cydriver.CU_JIT_INPUT_OBJECT, + "library": cydriver.CU_JIT_INPUT_LIBRARY, + } + _inited = True diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd index 56618ffe42..d766ebe84d 100644 --- a/cuda_core/cuda/core/_program.pxd +++ b/cuda_core/cuda/core/_program.pxd @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from ._linker cimport Linker from ._resource_handles cimport NvrtcProgramHandle, NvvmProgramHandle @@ -10,6 +11,6 @@ cdef class Program: NvrtcProgramHandle _h_nvrtc NvvmProgramHandle _h_nvvm str _backend - object _linker # Linker + Linker _linker object _options # ProgramOptions object __weakref__ diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index d573862d16..f897a688b3 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -10,6 +10,7 @@ from libcpp.memory cimport shared_ptr from cuda.bindings cimport cydriver from cuda.bindings cimport cynvrtc from cuda.bindings cimport cynvvm +from cuda.bindings cimport cynvjitlink # ============================================================================= @@ -26,7 +27,17 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle - ctypedef shared_ptr[const cynvvm.nvvmProgram] NvvmProgramHandle + + # NvvmProgramValue and NvJitLinkValue are TaggedHandle + # instantiations that make each shared_ptr type distinct for overloading. + cppclass NvvmProgramValue "cuda_core::NvvmProgramValue": + pass + cppclass NvJitLinkValue "cuda_core::NvJitLinkValue": + pass + ctypedef shared_ptr[const NvvmProgramValue] NvvmProgramHandle + ctypedef shared_ptr[const NvJitLinkValue] NvJitLinkHandle + + ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil @@ -38,6 +49,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) noexcept nogil cynvvm.nvvmProgram as_cu(NvvmProgramHandle h) noexcept nogil + cynvjitlink.nvJitLinkHandle as_cu(NvJitLinkHandle h) noexcept nogil + cydriver.CUlinkState as_cu(CuLinkHandle h) noexcept nogil # as_intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t as_intptr(ContextHandle h) noexcept nogil @@ -49,6 +62,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(KernelHandle h) noexcept nogil intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil intptr_t as_intptr(NvvmProgramHandle h) noexcept nogil + intptr_t as_intptr(NvJitLinkHandle h) noexcept nogil + intptr_t as_intptr(CuLinkHandle h) noexcept nogil # as_py() - convert handle to Python wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) @@ -60,6 +75,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(KernelHandle h) object as_py(NvrtcProgramHandle h) object as_py(NvvmProgramHandle h) + object as_py(NvJitLinkHandle h) + object as_py(CuLinkHandle h) # ============================================================================= @@ -130,3 +147,11 @@ cdef NvrtcProgramHandle create_nvrtc_program_handle_ref(cynvrtc.nvrtcProgram pro # NVVM Program handles cdef NvvmProgramHandle create_nvvm_program_handle(cynvvm.nvvmProgram prog) except+ nogil cdef NvvmProgramHandle create_nvvm_program_handle_ref(cynvvm.nvvmProgram prog) except+ nogil + +# nvJitLink handles +cdef NvJitLinkHandle create_nvjitlink_handle(cynvjitlink.nvJitLinkHandle handle) except+ nogil +cdef NvJitLinkHandle create_nvjitlink_handle_ref(cynvjitlink.nvJitLinkHandle handle) except+ nogil + +# cuLink handles +cdef CuLinkHandle create_culink_handle(cydriver.CUlinkState state) except+ nogil +cdef CuLinkHandle create_culink_handle_ref(cydriver.CUlinkState state) except+ nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 2652d4448e..ff2c568d03 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -16,6 +16,7 @@ from libc.stddef cimport size_t from cuda.bindings cimport cydriver from cuda.bindings cimport cynvrtc from cuda.bindings cimport cynvvm +from cuda.bindings cimport cynvjitlink from ._resource_handles cimport ( ContextHandle, @@ -27,11 +28,14 @@ from ._resource_handles cimport ( KernelHandle, NvrtcProgramHandle, NvvmProgramHandle, + NvJitLinkHandle, + CuLinkHandle, ) import cuda.bindings.cydriver as cydriver import cuda.bindings.cynvrtc as cynvrtc import cuda.bindings.cynvvm as cynvvm +import cuda.bindings.cynvjitlink as cynvjitlink # ============================================================================= # C++ function declarations (non-inline, implemented in resource_handles.cpp) @@ -125,6 +129,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": NvvmProgramHandle create_nvvm_program_handle_ref "cuda_core::create_nvvm_program_handle_ref" ( cynvvm.nvvmProgram prog) except+ nogil + # nvJitLink handles + NvJitLinkHandle create_nvjitlink_handle "cuda_core::create_nvjitlink_handle" ( + cynvjitlink.nvJitLinkHandle handle) except+ nogil + NvJitLinkHandle create_nvjitlink_handle_ref "cuda_core::create_nvjitlink_handle_ref" ( + cynvjitlink.nvJitLinkHandle handle) except+ nogil + + # cuLink handles + CuLinkHandle create_culink_handle "cuda_core::create_culink_handle" ( + cydriver.CUlinkState state) except+ nogil + CuLinkHandle create_culink_handle_ref "cuda_core::create_culink_handle_ref" ( + cydriver.CUlinkState state) except+ nogil + # ============================================================================= # CUDA Driver API capsule @@ -192,12 +208,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void* p_cuLibraryUnload "reinterpret_cast(cuda_core::p_cuLibraryUnload)" void* p_cuLibraryGetKernel "reinterpret_cast(cuda_core::p_cuLibraryGetKernel)" + # Linker + void* p_cuLinkDestroy "reinterpret_cast(cuda_core::p_cuLinkDestroy)" + # NVRTC void* p_nvrtcDestroyProgram "reinterpret_cast(cuda_core::p_nvrtcDestroyProgram)" # NVVM void* p_nvvmDestroyProgram "reinterpret_cast(cuda_core::p_nvvmDestroyProgram)" + # nvJitLink + void* p_nvJitLinkDestroy "reinterpret_cast(cuda_core::p_nvJitLinkDestroy)" + # Initialize driver function pointers from cydriver.__pyx_capi__ at module load cdef void* _get_driver_fn(str name): @@ -248,6 +270,9 @@ p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") +# Linker +p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") + # ============================================================================= # NVRTC function pointer initialization # ============================================================================= @@ -270,3 +295,16 @@ cdef void* _get_nvvm_fn(str name): return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram") + +# ============================================================================= +# nvJitLink function pointer initialization +# +# nvJitLink may not be available at runtime, so we handle missing function +# pointers gracefully. The C++ deleter checks for null before calling. +# ============================================================================= + +cdef void* _get_nvjitlink_fn(str name): + capsule = cynvjitlink.__pyx_capi__[name] + return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) + +p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy") diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd index a42bbf2dd0..478ce705af 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd @@ -6,7 +6,7 @@ cimport cpython from cpython.object cimport PyObject from libc.stdint cimport int64_t, int32_t -from cuda.bindings cimport cydriver, cynvrtc, cynvvm +from cuda.bindings cimport cydriver, cynvrtc, cynvvm, cynvjitlink ctypedef fused integer_t: @@ -21,6 +21,8 @@ cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil cdef int HANDLE_RETURN_NVVM(cynvvm.nvvmProgram prog, cynvvm.nvvmResult err) except?-1 nogil +cdef int HANDLE_RETURN_NVJITLINK( + cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil # TODO: stop exposing these within the codebase? diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 734ae32f79..3134308b55 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -21,8 +21,9 @@ except ImportError: from cuda import nvrtc from cuda.bindings.nvvm import nvvmError +from cuda.bindings.nvjitlink import nvJitLinkError -from cuda.bindings cimport cynvrtc, cynvvm +from cuda.bindings cimport cynvrtc, cynvvm, cynvjitlink from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS from cuda.core._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS @@ -119,6 +120,34 @@ cdef int _raise_nvvm_error(cynvvm.nvvmProgram prog, cynvvm.nvvmResult err) excep raise exc +cdef int HANDLE_RETURN_NVJITLINK( + cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil: + """Handle nvJitLink result codes, raising nvJitLinkError with error log on failure.""" + if err == cynvjitlink.nvJitLinkResult.NVJITLINK_SUCCESS: + return 0 + with gil: + _raise_nvjitlink_error(handle, err) + + +cdef int _raise_nvjitlink_error( + cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except -1: + """Raise nvJitLinkError annotated with the error log.""" + cdef size_t logsize = 0 + if handle != NULL: + cynvjitlink.nvJitLinkGetErrorLogSize(handle, &logsize) + cdef bytes log_bytes + cdef str log_str = "" + if logsize > 1 and handle != NULL: + log_bytes = b" " * logsize + if cynvjitlink.nvJitLinkGetErrorLog(handle, log_bytes) == \ + cynvjitlink.nvJitLinkResult.NVJITLINK_SUCCESS: + log_str = log_bytes.decode("utf-8", errors="backslashreplace") + cdef object exc = nvJitLinkError(err) + if log_str: + exc.args = (exc.args[0] + f"\nnvJitLink error log: {log_str}", *exc.args[1:]) + raise exc + + cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS diff --git a/cuda_core/docs/source/release/0.6.x-notes.rst b/cuda_core/docs/source/release/0.6.x-notes.rst index 49ea8b3be9..69b62a700b 100644 --- a/cuda_core/docs/source/release/0.6.x-notes.rst +++ b/cuda_core/docs/source/release/0.6.x-notes.rst @@ -17,8 +17,8 @@ Breaking Changes ---------------- - Building ``cuda.core`` from source now requires ``cuda-bindings`` >= 12.9.0, due to Cython-level - dependencies on the NVVM bindings (``cynvvm``). Pre-built wheels are unaffected. The previous - minimum was 12.8.0. + dependencies on the NVVM and nvJitLink bindings (``cynvvm``, ``cynvjitlink``). Pre-built wheels + are unaffected. The previous minimum was 12.8.0. New features @@ -36,6 +36,10 @@ None. Fixes and enhancements ---------------------- +- Reduced Python overhead in :class:`Program` and :class:`Linker` by moving compilation and + linking operations to the C level and releasing the GIL during backend calls. This benefits + workloads that create many programs or linkers, and enables concurrent compilation in + multithreaded applications. - Wheel and installed package sizes significantly reduced (e.g., on a typical Linux x86_64 build, wheel from ~4.6 MB to ~1.6 MB and installed from ~26 MB to ~4.4 MB) by excluding Cython source files, generated C++ files, and other build artifacts from distribution