diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index 27e76b817a..739b11f4d9 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -56,12 +56,18 @@ decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr;
 decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr;
 decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr;
 
+// Linker (driver cuLink* API); called by the owning CuLinkHandle deleter
+decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr;
+
 // NVRTC function pointers
 decltype(&nvrtcDestroyProgram) p_nvrtcDestroyProgram = nullptr;
 
 // NVVM function pointers (may be null if NVVM is not available)
 NvvmDestroyProgramFn p_nvvmDestroyProgram = nullptr;
 
+// nvJitLink function pointers (may be null if nvJitLink is not available)
+NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr;
+
 // ============================================================================
 // GIL management helpers
 // ============================================================================
@@ -805,19 +811,19 @@ NvrtcProgramHandle create_nvrtc_program_handle_ref(nvrtcProgram prog) {
 
 namespace {
 struct NvvmProgramBox {
-    nvvmProgram resource;
+    NvvmProgramValue resource;  // TaggedHandle wrapper; the raw nvvmProgram is resource.raw
 };
 }  // namespace
 
 NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog) {
     auto box = std::shared_ptr<NvvmProgramBox>(
-        new NvvmProgramBox{prog},
+        new NvvmProgramBox{{prog}},
         [](NvvmProgramBox* b) {
             // Note: nvvmDestroyProgram takes nvvmProgram* and nulls it,
             // but we're deleting the box anyway so nulling is harmless.
             // If NVVM is not available, the function pointer is null.
             if (p_nvvmDestroyProgram) {
-                p_nvvmDestroyProgram(&b->resource);
+                p_nvvmDestroyProgram(&b->resource.raw);
             }
             delete b;
         }
@@ -826,8 +832,69 @@ NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog) {
 }
 
 NvvmProgramHandle create_nvvm_program_handle_ref(nvvmProgram prog) {
-    auto box = std::make_shared<NvvmProgramBox>(NvvmProgramBox{prog});
+    auto box = std::make_shared<NvvmProgramBox>(NvvmProgramBox{{prog}});  // inner braces initialize the TaggedHandle member
     return NvvmProgramHandle(box, &box->resource);
 }
 
+// ============================================================================
+// nvJitLink Handles
+// ============================================================================
+
+namespace {
+struct NvJitLinkBox {
+    NvJitLinkValue resource;  // tagged nvJitLink_t; handles point at this member
+};
+}  // namespace
+
+NvJitLinkHandle create_nvjitlink_handle(nvJitLink_t handle) {
+    // Owning handle: the custom deleter destroys the linker, then frees the box.
+    const auto destroy_and_free = [](NvJitLinkBox* owned) {
+        // nvJitLinkDestroy receives a pointer to the handle and nulls it; that is
+        // harmless because the box is deleted right after. The function pointer
+        // stays null when nvJitLink is unavailable, so destruction is skipped.
+        if (p_nvJitLinkDestroy != nullptr) {
+            p_nvJitLinkDestroy(&owned->resource.raw);
+        }
+        delete owned;
+    };
+    std::shared_ptr<NvJitLinkBox> holder(new NvJitLinkBox{{handle}}, destroy_and_free);
+    // Aliasing constructor: shares ownership of the box, exposes only `resource`.
+    return NvJitLinkHandle(holder, &holder->resource);
+}
+
+NvJitLinkHandle create_nvjitlink_handle_ref(nvJitLink_t handle) {
+    const auto holder = std::make_shared<NvJitLinkBox>(NvJitLinkBox{{handle}});
+    return NvJitLinkHandle(holder, &holder->resource);  // only the box is freed; the linker is left alone
+}
+
+// ============================================================================
+// cuLink Handles
+// ============================================================================
+
+namespace {
+struct CuLinkBox {
+    CUlinkState resource;  // raw driver link state; handles point at this member
+};
+}  // namespace
+
+CuLinkHandle create_culink_handle(CUlinkState state) {
+    // Owning handle: the custom deleter destroys the link state, then frees the box.
+    const auto destroy_and_free = [](CuLinkBox* owned) {
+        // cuLinkDestroy takes the CUlinkState by value (not by pointer).
+        // Its status is deliberately dropped, as is usual for destructor-like code.
+        if (p_cuLinkDestroy != nullptr) {
+            p_cuLinkDestroy(owned->resource);
+        }
+        delete owned;
+    };
+    std::shared_ptr<CuLinkBox> holder(new CuLinkBox{state}, destroy_and_free);
+    // Aliasing constructor: shares ownership of the box, exposes only `resource`.
+    return CuLinkHandle(holder, &holder->resource);
+}
+
+CuLinkHandle create_culink_handle_ref(CUlinkState state) {
+    const auto holder = std::make_shared<CuLinkBox>(CuLinkBox{state});
+    return CuLinkHandle(holder, &holder->resource);  // only the box is freed; the link state is left alone
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp
index cb66841172..1576476dc4 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp
@@ -14,8 +14,28 @@
 // Use void* to match cuda.bindings.cynvvm's typedef
 using nvvmProgram = void*;
 
+// Forward declaration for nvJitLink - avoids nvJitLink.h dependency
+// Use void* to match cuda.bindings.cynvjitlink's typedef
+using nvJitLink_t = void*;
+
 namespace cuda_core {
 
+// ============================================================================
+// TaggedHandle - make void*-based handle types distinct for overloading
+//
+// Both nvvmProgram and nvJitLink_t are void*, so shared_ptr<const void*>
+// would be the same C++ type for both. TaggedHandle<T, Tag> wraps the raw
+// value with a unique tag type, making each shared_ptr type distinct.
+// ============================================================================
+
+template<typename T, int Tag>
+struct TaggedHandle {
+    T raw;  // the wrapped handle value; Tag is not stored and only makes the type unique
+};
+
+using NvvmProgramValue = TaggedHandle<nvvmProgram, 0>;
+using NvJitLinkValue = TaggedHandle<nvJitLink_t, 1>;
+
 // ============================================================================
 // Thread-local error handling
 // ============================================================================
@@ -72,6 +92,9 @@ extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData;
 extern decltype(&cuLibraryUnload) p_cuLibraryUnload;
 extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel;
 
+// Linker
+extern decltype(&cuLinkDestroy) p_cuLinkDestroy;
+
 // ============================================================================
 // NVRTC function pointers
 //
@@ -94,6 +117,19 @@ extern decltype(&nvrtcDestroyProgram) p_nvrtcDestroyProgram;
 using NvvmDestroyProgramFn = int (*)(nvvmProgram*);
 extern NvvmDestroyProgramFn p_nvvmDestroyProgram;
 
+// ============================================================================
+// nvJitLink function pointers
+//
+// These are populated by _resource_handles.pyx at module import time using
+// function pointers extracted from cuda.bindings.cynvjitlink.__pyx_capi__.
+// Note: May be null if nvJitLink is not available at runtime.
+// ============================================================================
+
+// Function pointer type for nvJitLinkDestroy (avoids nvJitLink.h dependency)
+// Signature: nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle *handle)
+using NvJitLinkDestroyFn = int (*)(nvJitLink_t*);
+extern NvJitLinkDestroyFn p_nvJitLinkDestroy;
+
 // ============================================================================
 // Handle type aliases - expose only the raw CUDA resource
 // ============================================================================
@@ -105,7 +141,9 @@ using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
 using LibraryHandle = std::shared_ptr<const CUlibrary>;
 using KernelHandle = std::shared_ptr<const CUkernel>;
 using NvrtcProgramHandle = std::shared_ptr<const nvrtcProgram>;
-using NvvmProgramHandle = std::shared_ptr<const nvvmProgram>;
+using NvvmProgramHandle = std::shared_ptr<const NvvmProgramValue>;
+using NvJitLinkHandle = std::shared_ptr<const NvJitLinkValue>;
+using CuLinkHandle = std::shared_ptr<const CUlinkState>;
 
 // ============================================================================
 // Context handle functions
@@ -316,6 +354,33 @@ NvvmProgramHandle create_nvvm_program_handle(nvvmProgram prog);
 // The program will NOT be destroyed when the handle is released.
 NvvmProgramHandle create_nvvm_program_handle_ref(nvvmProgram prog);
 
+// ============================================================================
+// nvJitLink handle functions
+// ============================================================================
+
+// Create an owning nvJitLink handle.
+// When the last reference is released, nvJitLinkDestroy is called.
+// Use this to wrap a handle created via nvJitLinkCreate.
+// Note: If nvJitLink is not available (p_nvJitLinkDestroy is null), the deleter is a no-op.
+NvJitLinkHandle create_nvjitlink_handle(nvJitLink_t handle);
+
+// Create a non-owning nvJitLink handle (references existing handle).
+// The handle will NOT be destroyed when the last reference is released.
+NvJitLinkHandle create_nvjitlink_handle_ref(nvJitLink_t handle);
+
+// ============================================================================
+// cuLink handle functions
+// ============================================================================
+
+// Create an owning cuLink handle.
+// When the last reference is released, cuLinkDestroy is called.
+// Use this to wrap a CUlinkState created via cuLinkCreate.
+CuLinkHandle create_culink_handle(CUlinkState state);
+
+// Create a non-owning cuLink handle (references existing CUlinkState).
+// The handle will NOT be destroyed when the last reference is released.
+CuLinkHandle create_culink_handle_ref(CUlinkState state);
+
 // ============================================================================
 // Overloaded helper functions to extract raw resources from handles
 // ============================================================================
@@ -354,6 +419,14 @@ inline nvrtcProgram as_cu(const NvrtcProgramHandle& h) noexcept {
 }
 
 inline nvvmProgram as_cu(const NvvmProgramHandle& h) noexcept {
+    return h ? h->raw : nullptr;  // unwrap the TaggedHandle; an empty handle yields nullptr
+}
+
+inline nvJitLink_t as_cu(const NvJitLinkHandle& h) noexcept {
+    return (h == nullptr) ? nullptr : h->raw;  // empty handle maps to a null nvJitLink_t
+}
+
+inline CUlinkState as_cu(const CuLinkHandle& h) noexcept {  // an empty handle yields a null CUlinkState
     return h ? *h : nullptr;
 }
 
@@ -395,6 +468,14 @@ inline std::intptr_t as_intptr(const NvvmProgramHandle& h) noexcept {
     return reinterpret_cast<std::intptr_t>(as_cu(h));
 }
 
+inline std::intptr_t as_intptr(const NvJitLinkHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));  // raw handle address as an integer (0 for an empty handle)
+}
+
+inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));  // raw CUlinkState address as an integer (0 for an empty handle)
+}
+
 // as_py() - convert handle to Python wrapper object (returns new reference)
 namespace detail {
 // n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
@@ -447,4 +528,13 @@ inline PyObject* as_py(const NvvmProgramHandle& h) noexcept {
     return PyLong_FromSsize_t(as_intptr(h));
 }
 
+inline PyObject* as_py(const NvJitLinkHandle& h) noexcept {
+    // nvJitLink bindings use raw integers, not wrapper classes: return the address as a Python int
+    return PyLong_FromSsize_t(as_intptr(h));
+}
+
+inline PyObject* as_py(const CuLinkHandle& h) noexcept {
+    return detail::make_py("cuda.bindings.driver", "CUlinkState", as_intptr(h));  // presumably builds driver.CUlinkState from the address — confirm in detail::make_py
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/_linker.pxd b/cuda_core/cuda/core/_linker.pxd
new file mode 100644
index 0000000000..e50ebb9770
--- /dev/null
+++ b/cuda_core/cuda/core/_linker.pxd
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from ._resource_handles cimport NvJitLinkHandle, CuLinkHandle
+
+
+cdef class Linker:
+    cdef:
+        NvJitLinkHandle _nvjitlink_handle  # assigned only on the nvJitLink backend
+        CuLinkHandle _culink_handle        # assigned only on the driver (cuLink) backend
+        bint _use_nvjitlink                # backend selector: True = nvJitLink, False = driver
+        object _drv_log_bufs  # formatted_options list (driver); None for nvjitlink; cleared in link()
+        str _info_log         # decoded info log cached after link(); while None, get_info_log() reads the backend
+        str _error_log        # decoded error log cached after link(); while None, get_error_log() reads the backend
+        object _options       # LinkerOptions
+        object __weakref__    # allows weak references to Linker instances
diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.pyx
similarity index 52%
rename from cuda_core/cuda/core/_linker.py
rename to cuda_core/cuda/core/_linker.pyx
index 6490e87b07..316c46178a 100644
--- a/cuda_core/cuda/core/_linker.py
+++ b/cuda_core/cuda/core/_linker.pyx
@@ -1,114 +1,187 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
+"""Linking machinery for combining object codes.
+
+This module provides :class:`Linker` for linking one or more
+:class:`~cuda.core.ObjectCode` objects, with :class:`LinkerOptions` for
+configuration.
+"""
 
 from __future__ import annotations
 
-import ctypes
+from cpython.bytearray cimport PyByteArray_AS_STRING
+from libc.stdint cimport intptr_t, uint32_t
+from libcpp.vector cimport vector
+from cuda.bindings cimport cydriver
+from cuda.bindings cimport cynvjitlink
+
+from ._resource_handles cimport (
+    as_cu,
+    as_py,
+    create_culink_handle,
+    create_nvjitlink_handle,
+)
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, HANDLE_RETURN_NVJITLINK
+
 import sys
-import weakref
-from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import Union
 from warnings import warn
 
-if TYPE_CHECKING:
-    import cuda.bindings
-
 from cuda.core._device import Device
 from cuda.core._module import ObjectCode
 from cuda.core._utils.clear_error_support import assert_type
-from cuda.core._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence
+from cuda.core._utils.cuda_utils import (
+    CUDAError,
+    check_or_create_options,
+    driver,
+    handle_return,
+    is_sequence,
+)
 
-# TODO: revisit this treatment for py313t builds
-_driver = None  # populated if nvJitLink cannot be used
-_driver_input_types = None  # populated if nvJitLink cannot be used
-_driver_ver = None
-_inited = False
-_nvjitlink = None  # populated if nvJitLink can be used
-_nvjitlink_input_types = None  # populated if nvJitLink cannot be used
+ctypedef const char* const_char_ptr  # single-name alias used as the vector[...] element type for nvJitLink options
+ctypedef void* void_ptr  # single-name alias used as the vector[...] element type for driver option values
 
+__all__ = ["Linker", "LinkerOptions"]
 
-def _nvjitlink_has_version_symbol(inner_nvjitlink) -> bool:
-    # This condition is equivalent to testing for version >= 12.3
-    return bool(inner_nvjitlink._inspect_function_pointer("__nvJitLinkVersion"))
+LinkerHandleT = Union["cuda.bindings.nvjitlink.nvJitLinkHandle", "cuda.bindings.driver.CUlinkState"]
 
 
-# Note: this function is reused in the tests
-def _decide_nvjitlink_or_driver() -> bool:
-    """Returns True if falling back to the cuLink* driver APIs."""
-    global _driver_ver, _driver, _nvjitlink
-    if _driver or _nvjitlink:
-        return _driver is not None
+# =============================================================================
+# Principal class
+# =============================================================================
 
-    _driver_ver = handle_return(driver.cuDriverGetVersion())
-    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
+cdef class Linker:
+    """Represent a linking machinery to link one or more object codes into
+    :class:`~cuda.core.ObjectCode`.
 
-    warn_txt_common = (
-        "the driver APIs will be used instead, which do not support"
-        " minor version compatibility or linking LTO IRs."
-        " For best results, consider upgrading to a recent version of"
-    )
+    This object provides a unified interface to multiple underlying
+    linker libraries (such as nvJitLink or cuLink* from the CUDA driver).
 
-    try:
-        import cuda.bindings.nvjitlink as _nvjitlink
-    except ModuleNotFoundError:
-        warn_txt = f"cuda.bindings.nvjitlink is not available, therefore {warn_txt_common} cuda-bindings."
-    else:
-        from cuda.bindings._internal import nvjitlink as inner_nvjitlink
+    Parameters
+    ----------
+    object_codes : :class:`~cuda.core.ObjectCode`
+        One or more ObjectCode objects to be linked.
+    options : :class:`LinkerOptions`, optional
+        Options for the linker. If not provided, default options will be used.
+    """
 
-        try:
-            if _nvjitlink_has_version_symbol(inner_nvjitlink):
-                return False  # Use nvjitlink
-        except RuntimeError:
-            warn_detail = "not available"
+    def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
+        Linker_init(self, object_codes, options)  # all construction logic lives in the module-level helper
+
+    def link(self, target_type) -> ObjectCode:
+        """Link the provided object codes into a single output of the specified target type.
+
+        Parameters
+        ----------
+        target_type : str
+            The type of the target output. Must be either "cubin" or "ptx".
+
+        Returns
+        -------
+        :class:`~cuda.core.ObjectCode`
+            The linked object code of the specified target type.
+
+        .. note::
+
+            Ensure that input object codes were compiled with appropriate
+            flags for linking (e.g., relocatable device code enabled).
+        """
+        return Linker_link(self, target_type)  # implementation lives in the module-level helper
+
+    def get_error_log(self) -> str:
+        """Get the error log generated by the linker.
+
+        Returns
+        -------
+        str
+            The error log.
+        """
+        # Fast path: once link() has cached the decoded log, return it; otherwise query the backend.
+        if self._error_log is not None:
+            return self._error_log
+        cdef cynvjitlink.nvJitLinkHandle c_h
+        cdef size_t c_log_size = 0
+        cdef char* c_log_ptr
+        if self._use_nvjitlink:
+            c_h = as_cu(self._nvjitlink_handle)
+            cynvjitlink.nvJitLinkGetErrorLogSize(c_h, &c_log_size)  # NOTE(review): return status is not checked
+            log = bytearray(c_log_size)
+            if c_log_size > 0:
+                c_log_ptr = <char*>(<bytearray>log)
+                cynvjitlink.nvJitLinkGetErrorLog(c_h, c_log_ptr)
+            return log.decode("utf-8", errors="backslashreplace")
         else:
-            warn_detail = "too old (<12.3)"
-        warn_txt = (
-            f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is {warn_detail}."
-            f" Therefore cuda.bindings.nvjitlink is not usable and {warn_txt_common} nvJitLink."
-        )
-        _nvjitlink = None
+            return (<bytearray>self._drv_log_bufs[2]).decode(  # [2]: presumably the error-log buffer from _prepare_driver_options — confirm
+                "utf-8", errors="backslashreplace").rstrip('\x00')
 
-    warn(warn_txt, stacklevel=2, category=RuntimeWarning)
-    _driver = driver
-    return True
+    def get_info_log(self) -> str:
+        """Get the info log generated by the linker.
 
+        Returns
+        -------
+        str
+            The info log.
+        """
+        # Fast path: once link() has cached the decoded log, return it; otherwise query the backend.
+        if self._info_log is not None:
+            return self._info_log
+        cdef cynvjitlink.nvJitLinkHandle c_h
+        cdef size_t c_log_size = 0
+        cdef char* c_log_ptr
+        if self._use_nvjitlink:
+            c_h = as_cu(self._nvjitlink_handle)
+            cynvjitlink.nvJitLinkGetInfoLogSize(c_h, &c_log_size)  # NOTE(review): return status is not checked
+            log = bytearray(c_log_size)
+            if c_log_size > 0:
+                c_log_ptr = <char*>(<bytearray>log)
+                cynvjitlink.nvJitLinkGetInfoLog(c_h, c_log_ptr)
+            return log.decode("utf-8", errors="backslashreplace")
+        else:
+            return (<bytearray>self._drv_log_bufs[0]).decode(  # [0]: presumably the info-log buffer from _prepare_driver_options — confirm
+                "utf-8", errors="backslashreplace").rstrip('\x00')
 
-def _lazy_init():
-    global _inited, _nvjitlink_input_types, _driver_input_types
-    if _inited:
-        return
+    def close(self):
+        """Release this linker's reference to its backend handle."""
+        if not self._use_nvjitlink:
+            self._culink_handle.reset()
+        else:
+            self._nvjitlink_handle.reset()
 
-    _decide_nvjitlink_or_driver()
-    if _nvjitlink:
-        if _driver_ver > _nvjitlink.version():
-            # TODO: nvJitLink is not new enough, warn?
-            pass
-        _nvjitlink_input_types = {
-            "ptx": _nvjitlink.InputType.PTX,
-            "cubin": _nvjitlink.InputType.CUBIN,
-            "fatbin": _nvjitlink.InputType.FATBIN,
-            "ltoir": _nvjitlink.InputType.LTOIR,
-            "object": _nvjitlink.InputType.OBJECT,
-            "library": _nvjitlink.InputType.LIBRARY,
-        }
-    else:
-        _driver_input_types = {
-            "ptx": _driver.CUjitInputType.CU_JIT_INPUT_PTX,
-            "cubin": _driver.CUjitInputType.CU_JIT_INPUT_CUBIN,
-            "fatbin": _driver.CUjitInputType.CU_JIT_INPUT_FATBINARY,
-            "object": _driver.CUjitInputType.CU_JIT_INPUT_OBJECT,
-            "library": _driver.CUjitInputType.CU_JIT_INPUT_LIBRARY,
-        }
-    _inited = True
+    @property
+    def handle(self) -> LinkerHandleT:
+        """The backend's native linker handle, wrapped as a Python object.
+
+        .. note::
+
+           Its concrete type is backend-dependent (see :attr:`backend`).
 
+        .. caution::
+
+            This is a Python object, not an address. Call ``int(Linker.handle)`` to
+            obtain the memory address of the underlying C handle.
+        """
+        if not self._use_nvjitlink:
+            return as_py(self._culink_handle)
+        else:
+            return as_py(self._nvjitlink_handle)
+
+    @property
+    def backend(self) -> str:
+        """Name of the backend in use: ``"nvJitLink"`` or ``"driver"``."""
+        return "driver" if not self._use_nvjitlink else "nvJitLink"
+
+
+# =============================================================================
+# Supporting classes
+# =============================================================================
 
 @dataclass
 class LinkerOptions:
-    """Customizable :obj:`Linker` options.
+    """Customizable options for configuring :class:`Linker`.
 
-    Since the linker would choose to use nvJitLink or the driver APIs as the linking backed,
+    Since the linker may choose to use nvJitLink or the driver APIs as the linking backend,
     not all options are applicable. When the system's installed nvJitLink is too old (<12.3),
     or not installed, the driver APIs (cuLink) will be used instead.
 
@@ -154,14 +227,14 @@ class LinkerOptions:
     fma : bool, optional
         Use fast multiply-add.
         Default: True.
-    kernels_used : [Union[str, tuple[str], list[str]]], optional
+    kernels_used : [str | tuple[str] | list[str]], optional
         Pass a kernel or sequence of kernels that are used; any not in the list can be removed.
-    variables_used : [Union[str, tuple[str], list[str]]], optional
+    variables_used : [str | tuple[str] | list[str]], optional
         Pass a variable or sequence of variables that are used; any not in the list can be removed.
     optimize_unused_variables : bool, optional
         Assume that if a variable is not referenced in device code, it can be removed.
         Default: False.
-    ptxas_options : [Union[str, tuple[str], list[str]]], optional
+    ptxas_options : [str | tuple[str] | list[str]], optional
         Pass options to PTXAS.
     split_compile : int, optional
         Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
@@ -357,241 +430,271 @@ def as_bytes(self, backend: str = "nvjitlink") -> list[bytes]:
         backend = backend.lower()
         if backend != "nvjitlink":
             raise ValueError(f"as_bytes() only supports 'nvjitlink' backend, got '{backend}'")
-        if not _nvjitlink:
+        if not _use_nvjitlink_backend:
             raise RuntimeError("nvJitLink backend is not available")
         return self._prepare_nvjitlink_options(as_bytes=True)
 
 
-# This needs to be a free function not a method, as it's disallowed by contextmanager.
-@contextmanager
-def _exception_manager(self):
-    """
-    A helper function to improve the error message of exceptions raised by the linker backend.
-    """
-    try:
-        yield
-    except Exception as e:
-        error_log = ""
-        if hasattr(self, "_mnff"):
-            # our constructor could raise, in which case there's no handle available
-            error_log = self.get_error_log()
-        # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
-        # unfortunately we are still supporting Python 3.10...
-        # Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0].
-        e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:])
-        raise e
-
-
-nvJitLinkHandleT = int
-LinkerHandleT = Union[nvJitLinkHandleT, "cuda.bindings.driver.CUlinkState"]
-
-
-class Linker:
-    """Represent a linking machinery to link one or multiple object codes into
-    :obj:`~cuda.core._module.ObjectCode` with the specified options.
-
-    This object provides a unified interface to multiple underlying
-    linker libraries (such as nvJitLink or cuLink* from CUDA driver).
-
-    Parameters
-    ----------
-    object_codes : ObjectCode
-        One or more ObjectCode objects to be linked.
-    options : LinkerOptions, optional
-        Options for the linker. If not provided, default options will be used.
-    """
-
-    class _MembersNeededForFinalize:
-        __slots__ = ("handle", "use_nvjitlink", "const_char_keep_alive", "formatted_options", "option_keys")
-
-        def __init__(self, program_obj, handle, use_nvjitlink):
-            self.handle = handle
-            self.use_nvjitlink = use_nvjitlink
-            self.const_char_keep_alive = []
-            weakref.finalize(program_obj, self.close)
-
-        def close(self):
-            if self.handle is not None:
-                if self.use_nvjitlink:
-                    _nvjitlink.destroy(self.handle)
-                else:
-                    handle_return(_driver.cuLinkDestroy(self.handle))
-                self.handle = None
-
-    __slots__ = ("__weakref__", "_mnff", "_options")
-
-    def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
-        if len(object_codes) == 0:
-            raise ValueError("At least one ObjectCode object must be provided")
-
-        self._options = options = check_or_create_options(LinkerOptions, options, "Linker options")
-        with _exception_manager(self):
-            if _nvjitlink:
-                formatted_options = options._prepare_nvjitlink_options(as_bytes=False)
-                handle = _nvjitlink.create(len(formatted_options), formatted_options)
-                use_nvjitlink = True
+# =============================================================================
+# Private implementation: cdef inline helpers
+# =============================================================================
+
+cdef inline int Linker_init(Linker self, tuple object_codes, object options) except -1:
+    """Initialize a Linker: create the backend handle, then add every ObjectCode."""
+    if len(object_codes) == 0:
+        raise ValueError("At least one ObjectCode object must be provided")
+
+    cdef cynvjitlink.nvJitLinkHandle c_raw_nvjitlink
+    cdef cydriver.CUlinkState c_raw_culink
+    cdef Py_ssize_t c_num_opts, i
+    cdef vector[const_char_ptr] c_str_opts
+    cdef vector[cydriver.CUjit_option] c_jit_keys
+    cdef vector[void_ptr] c_jit_values
+
+    self._options = options = check_or_create_options(LinkerOptions, options, "Linker options")
+
+    if _use_nvjitlink_backend:
+        self._use_nvjitlink = True
+        options_bytes = options._prepare_nvjitlink_options(as_bytes=True)
+        c_num_opts = len(options_bytes)
+        c_str_opts.resize(c_num_opts)
+        for i in range(c_num_opts):
+            c_str_opts[i] = <const char*>(<bytes>options_bytes[i])  # borrowed pointer; options_bytes owns the bytes
+        with nogil:  # first macro argument is NULL because no handle has been created yet
+            HANDLE_RETURN_NVJITLINK(NULL, cynvjitlink.nvJitLinkCreate(
+                &c_raw_nvjitlink, <uint32_t>c_num_opts, c_str_opts.data()))
+        self._nvjitlink_handle = create_nvjitlink_handle(c_raw_nvjitlink)
+    else:
+        self._use_nvjitlink = False
+        formatted_options, option_keys = options._prepare_driver_options()
+        # Keep the formatted_options list alive: it contains bytearrays that
+        # the driver writes into via raw pointers during linking operations.
+        self._drv_log_bufs = formatted_options
+        c_num_opts = len(option_keys)
+        c_jit_keys.resize(c_num_opts)
+        c_jit_values.resize(c_num_opts)
+        for i in range(c_num_opts):
+            c_jit_keys[i] = <cydriver.CUjit_option><int>option_keys[i]
+            val = formatted_options[i]
+            if isinstance(val, bytearray):
+                c_jit_values[i] = <void*>PyByteArray_AS_STRING(val)
             else:
-                formatted_options, option_keys = options._prepare_driver_options()
-                handle = handle_return(_driver.cuLinkCreate(len(formatted_options), option_keys, formatted_options))
-                use_nvjitlink = False
-        self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)
-        self._mnff.formatted_options = formatted_options  # Store for log access
-        if not _nvjitlink:
-            self._mnff.option_keys = option_keys
-
-        for code in object_codes:
-            assert_type(code, ObjectCode)
-            self._add_code_object(code)
-
-    def _add_code_object(self, object_code: ObjectCode):
-        data = object_code.code
-        with _exception_manager(self):
-            name_str = f"{object_code.name}"
-            if _nvjitlink and isinstance(data, bytes):
-                _nvjitlink.add_data(
-                    self._mnff.handle,
-                    self._input_type_from_code_type(object_code.code_type),
-                    data,
-                    len(data),
-                    name_str,
-                )
-            elif _nvjitlink and isinstance(data, str):
-                _nvjitlink.add_file(
-                    self._mnff.handle,
-                    self._input_type_from_code_type(object_code.code_type),
-                    data,
-                )
-            elif (not _nvjitlink) and isinstance(data, bytes):
-                name_bytes = name_str.encode()
-                handle_return(
-                    _driver.cuLinkAddData(
-                        self._mnff.handle,
-                        self._input_type_from_code_type(object_code.code_type),
-                        data,
-                        len(data),
-                        name_bytes,
-                        0,
-                        None,
-                        None,
-                    )
-                )
-                self._mnff.const_char_keep_alive.append(name_bytes)
-            elif (not _nvjitlink) and isinstance(data, str):
-                name_bytes = name_str.encode()
-                handle_return(
-                    _driver.cuLinkAddFile(
-                        self._mnff.handle,
-                        self._input_type_from_code_type(object_code.code_type),
-                        data.encode(),
-                        0,
-                        None,
-                        None,
-                    )
-                )
-                self._mnff.const_char_keep_alive.append(name_bytes)
+                c_jit_values[i] = <void*><intptr_t>int(val)  # scalar option: the value itself goes in the pointer-sized slot
+        try:
+            with nogil:
+                HANDLE_RETURN(cydriver.cuLinkCreate(
+                    <unsigned int>c_num_opts, c_jit_keys.data(), c_jit_values.data(), &c_raw_culink))
+        except CUDAError as e:
+            Linker_annotate_error_log(self, e)  # presumably appends the driver error log to the exception — confirm
+            raise
+        self._culink_handle = create_culink_handle(c_raw_culink)
+
+    for code in object_codes:
+        assert_type(code, ObjectCode)
+        Linker_add_code_object(self, code)
+    return 0  # success; failures propagate as exceptions (the "except -1" convention)
+
+
+cdef inline void Linker_add_code_object(Linker self, object object_code) except *:
+    """Add one ObjectCode to the linker: bytes go through *AddData, a str is handed to *AddFile."""
+    data = object_code.code
+    cdef cynvjitlink.nvJitLinkHandle c_nvjitlink_h
+    cdef cydriver.CUlinkState c_culink_state
+    cdef cynvjitlink.nvJitLinkInputType c_nv_input_type
+    cdef cydriver.CUjitInputType c_drv_input_type
+    cdef const char* c_data_ptr
+    cdef size_t c_data_size
+    cdef const char* c_name_ptr
+    cdef const char* c_file_ptr
+
+    name_bytes = f"{object_code.name}".encode()
+    c_name_ptr = <const char*>name_bytes  # borrowed pointer; name_bytes stays referenced until return
+
+    input_types = _nvjitlink_input_types if self._use_nvjitlink else _driver_input_types
+    py_input_type = input_types.get(object_code.code_type)
+    if py_input_type is None:
+        raise ValueError(f"Unknown code_type associated with ObjectCode: {object_code.code_type}")
+
+    if self._use_nvjitlink:
+        c_nvjitlink_h = as_cu(self._nvjitlink_handle)
+        c_nv_input_type = <cynvjitlink.nvJitLinkInputType><int>py_input_type
+        if isinstance(data, bytes):
+            c_data_ptr = <const char*>(<bytes>data)
+            c_data_size = len(data)
+            with nogil:
+                HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkAddData(
+                    c_nvjitlink_h, c_nv_input_type, <const void*>c_data_ptr, c_data_size, c_name_ptr))
+        elif isinstance(data, str):
+            file_bytes = data.encode()
+            c_file_ptr = <const char*>file_bytes
+            with nogil:
+                HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkAddFile(
+                    c_nvjitlink_h, c_nv_input_type, c_file_ptr))
+        else:
+            raise TypeError(f"Expected bytes or str, but got {type(data).__name__}")
+    else:
+        c_culink_state = as_cu(self._culink_handle)
+        c_drv_input_type = <cydriver.CUjitInputType><int>py_input_type
+        try:
+            if isinstance(data, bytes):
+                c_data_ptr = <const char*>(<bytes>data)
+                c_data_size = len(data)
+                with nogil:
+                    HANDLE_RETURN(cydriver.cuLinkAddData(
+                        c_culink_state, c_drv_input_type, <void*>c_data_ptr, c_data_size, c_name_ptr,
+                        0, NULL, NULL))
+            elif isinstance(data, str):
+                file_bytes = data.encode()
+                c_file_ptr = <const char*>file_bytes
+                with nogil:
+                    HANDLE_RETURN(cydriver.cuLinkAddFile(
+                        c_culink_state, c_drv_input_type, c_file_ptr, 0, NULL, NULL))
             else:
                 raise TypeError(f"Expected bytes or str, but got {type(data).__name__}")
+        except CUDAError as e:  # driver errors get the linker error log appended before re-raising
+            Linker_annotate_error_log(self, e)
+            raise
+
+
+cdef inline object Linker_link(Linker self, str target_type):
+    """Complete linking and return the result as an ObjectCode tagged with ``target_type``."""
+    if target_type not in ("cubin", "ptx"):
+        raise ValueError(f"Unsupported target type: {target_type}")
+
+    cdef cynvjitlink.nvJitLinkHandle c_nvjitlink_h
+    cdef cydriver.CUlinkState c_culink_state
+    cdef size_t c_output_size = 0
+    cdef char* c_code_ptr
+    cdef void* c_cubin_out = NULL
+
+    if self._use_nvjitlink:
+        c_nvjitlink_h = as_cu(self._nvjitlink_handle)
+        with nogil:
+            HANDLE_RETURN_NVJITLINK(c_nvjitlink_h, cynvjitlink.nvJitLinkComplete(c_nvjitlink_h))
+        if target_type == "cubin":
+            HANDLE_RETURN_NVJITLINK(c_nvjitlink_h,
+                cynvjitlink.nvJitLinkGetLinkedCubinSize(c_nvjitlink_h, &c_output_size))
+            code = bytearray(c_output_size)  # writable buffer filled in place by nvJitLink below
+            c_code_ptr = <char*>(<bytearray>code)
+            with nogil:
+                HANDLE_RETURN_NVJITLINK(c_nvjitlink_h,
+                    cynvjitlink.nvJitLinkGetLinkedCubin(c_nvjitlink_h, c_code_ptr))
+        else:
+            HANDLE_RETURN_NVJITLINK(c_nvjitlink_h,
+                cynvjitlink.nvJitLinkGetLinkedPtxSize(c_nvjitlink_h, &c_output_size))
+            code = bytearray(c_output_size)
+            c_code_ptr = <char*>(<bytearray>code)
+            with nogil:
+                HANDLE_RETURN_NVJITLINK(c_nvjitlink_h,
+                    cynvjitlink.nvJitLinkGetLinkedPtx(c_nvjitlink_h, c_code_ptr))
+    else:
+        c_culink_state = as_cu(self._culink_handle)
+        try:
+            with nogil:
+                HANDLE_RETURN(cydriver.cuLinkComplete(c_culink_state, &c_cubin_out, &c_output_size))
+        except CUDAError as e:
+            Linker_annotate_error_log(self, e)
+            raise
+        code = (<char*>c_cubin_out)[:c_output_size]  # char* slice yields a new Python bytes copy
 
-    def link(self, target_type) -> ObjectCode:
-        """
-        Links the provided object codes into a single output of the specified target type.
+    # Linking is complete; cache the decoded log strings and release
+    # the driver's raw bytearray buffers (no longer written to).
+    self._info_log = self.get_info_log()
+    self._error_log = self.get_error_log()
+    self._drv_log_bufs = None
 
-        Parameters
-        ----------
-        target_type : str
-            The type of the target output. Must be either "cubin" or "ptx".
+    return ObjectCode._init(bytes(code), target_type, name=self._options.name)
 
-        Returns
-        -------
-        ObjectCode
-            The linked object code of the specified target type.
 
-        Note
-        ------
-        See nvrtc compiler options documnetation to ensure the input object codes are
-        correctly compiled for linking.
-        """
-        if target_type not in ("cubin", "ptx"):
-            raise ValueError(f"Unsupported target type: {target_type}")
-        with _exception_manager(self):
-            if _nvjitlink:
-                _nvjitlink.complete(self._mnff.handle)
-                if target_type == "cubin":
-                    get_size = _nvjitlink.get_linked_cubin_size
-                    get_code = _nvjitlink.get_linked_cubin
-                else:
-                    get_size = _nvjitlink.get_linked_ptx_size
-                    get_code = _nvjitlink.get_linked_ptx
-                size = get_size(self._mnff.handle)
-                code = bytearray(size)
-                get_code(self._mnff.handle, code)
-            else:
-                addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle))
-                code = (ctypes.c_char * size).from_address(addr)
+cdef inline void Linker_annotate_error_log(Linker self, object e):
+    """Append the linker error log (when non-empty) to the message of exception ``e``."""
+    log_text = self.get_error_log()
+    if log_text:
+        e.args = (e.args[0] + f"\nLinker error log: {log_text}", *e.args[1:])
 
-        return ObjectCode._init(bytes(code), target_type, name=self._options.name)
 
-    def get_error_log(self) -> str:
-        """Get the error log generated by the linker.
+# =============================================================================
+# Private implementation: module-level state and initialization
+# =============================================================================
 
-        Returns
-        -------
-        str
-            The error log.
-        """
-        if _nvjitlink:
-            log_size = _nvjitlink.get_error_log_size(self._mnff.handle)
-            log = bytearray(log_size)
-            _nvjitlink.get_error_log(self._mnff.handle, log)
-        else:
-            log = self._mnff.formatted_options[2]
-        return log.decode("utf-8", errors="backslashreplace")
+# TODO: revisit this treatment for py313t builds
+_driver = None  # populated if nvJitLink cannot be used
+_driver_ver = None  # (major, minor) tuple once _decide_nvjitlink_or_driver() has run
+_inited = False  # set to True at the end of _lazy_init()
+_use_nvjitlink_backend = False  # set by _decide_nvjitlink_or_driver()
 
-    def get_info_log(self) -> str:
-        """Get the info log generated by the linker.
+# Input type mappings populated by _lazy_init() with C-level enum ints.
+_nvjitlink_input_types = None
+_driver_input_types = None
 
-        Returns
-        -------
-        str
-            The info log.
-        """
-        if _nvjitlink:
-            log_size = _nvjitlink.get_info_log_size(self._mnff.handle)
-            log = bytearray(log_size)
-            _nvjitlink.get_info_log(self._mnff.handle, log)
-        else:
-            log = self._mnff.formatted_options[0]
-        return log.decode("utf-8", errors="backslashreplace")
 
-    def _input_type_from_code_type(self, code_type: str):
-        # this list is based on the supported values for code_type in the ObjectCode class definition.
-        # nvJitLink/driver support other options for input type
-        input_type = _nvjitlink_input_types.get(code_type) if _nvjitlink else _driver_input_types.get(code_type)
+def _nvjitlink_has_version_symbol(nvjitlink) -> bool:
+    version_symbol = nvjitlink._inspect_function_pointer("__nvJitLinkVersion")
+    return bool(version_symbol)  # this condition is equivalent to testing for version >= 12.3
 
-        if input_type is None:
-            raise ValueError(f"Unknown code_type associated with ObjectCode: {code_type}")
-        return input_type
 
-    @property
-    def handle(self) -> LinkerHandleT:
-        """Return the underlying handle object.
+# Note: this function is reused in the tests
+def _decide_nvjitlink_or_driver() -> bool:
+    """Pick the linker backend once; return True if falling back to the cuLink* driver APIs."""
+    global _driver_ver, _driver, _use_nvjitlink_backend
+    if _driver_ver is not None:  # decision already made by an earlier call; reuse it
+        return not _use_nvjitlink_backend
 
-        .. note::
+    _driver_ver = handle_return(driver.cuDriverGetVersion())
+    _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)  # e.g. 12030 -> (12, 3)
 
-           The type of the returned object depends on the backend.
+    warn_txt_common = (
+        "the driver APIs will be used instead, which do not support"
+        " minor version compatibility or linking LTO IRs."
+        " For best results, consider upgrading to a recent version of"
+    )
 
-        .. caution::
+    try:
+        __import__("cuda.bindings.nvjitlink")  # availability check
+    except ModuleNotFoundError:
+        warn_txt = f"cuda.bindings.nvjitlink is not available, therefore {warn_txt_common} cuda-bindings."
+    else:
+        from cuda.bindings._internal import nvjitlink
 
-            This handle is a Python object. To get the memory address of the underlying C
-            handle, call ``int(Linker.handle)``.
-        """
-        return self._mnff.handle
+        try:
+            if _nvjitlink_has_version_symbol(nvjitlink):
+                _use_nvjitlink_backend = True
+                return False  # Use nvjitlink
+        except RuntimeError:
+            warn_detail = "not available"
+        else:
+            warn_detail = "too old (<12.3)"  # lookup did not raise but the symbol check was falsy
+        warn_txt = (
+            f"{'nvJitLink*.dll' if sys.platform == 'win32' else 'libnvJitLink.so*'} is {warn_detail}."
+            f" Therefore cuda.bindings.nvjitlink is not usable and {warn_txt_common} nvJitLink."
+        )
 
-    @property
-    def backend(self) -> str:
-        """Return this Linker instance's underlying backend."""
-        return "nvJitLink" if self._mnff.use_nvjitlink else "driver"
+    warn(warn_txt, stacklevel=2, category=RuntimeWarning)
+    _driver = driver  # module-level reference assigned only on this fallback path
+    return True
 
-    def close(self):
-        """Destroy this linker."""
-        self._mnff.close()
+
+def _lazy_init():
+    global _inited, _nvjitlink_input_types, _driver_input_types
+    if _inited:  # tables were already built by an earlier call
+        return
+
+    _decide_nvjitlink_or_driver()  # sets _use_nvjitlink_backend
+    if _use_nvjitlink_backend:  # only the selected backend's table is built; the other stays None
+        _nvjitlink_input_types = {
+            "ptx": <int>cynvjitlink.NVJITLINK_INPUT_PTX,
+            "cubin": <int>cynvjitlink.NVJITLINK_INPUT_CUBIN,
+            "fatbin": <int>cynvjitlink.NVJITLINK_INPUT_FATBIN,
+            "ltoir": <int>cynvjitlink.NVJITLINK_INPUT_LTOIR,
+            "object": <int>cynvjitlink.NVJITLINK_INPUT_OBJECT,
+            "library": <int>cynvjitlink.NVJITLINK_INPUT_LIBRARY,
+        }
+    else:
+        _driver_input_types = {  # no "ltoir" entry, unlike the nvJitLink table above
+            "ptx": <int>cydriver.CU_JIT_INPUT_PTX,
+            "cubin": <int>cydriver.CU_JIT_INPUT_CUBIN,
+            "fatbin": <int>cydriver.CU_JIT_INPUT_FATBINARY,
+            "object": <int>cydriver.CU_JIT_INPUT_OBJECT,
+            "library": <int>cydriver.CU_JIT_INPUT_LIBRARY,
+        }
+    _inited = True
diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index 56618ffe42..d766ebe84d 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from ._linker cimport Linker
 from ._resource_handles cimport NvrtcProgramHandle, NvvmProgramHandle
 
 
@@ -10,6 +11,6 @@ cdef class Program:
         NvrtcProgramHandle _h_nvrtc
         NvvmProgramHandle _h_nvvm
         str _backend
-        object _linker  # Linker
+        Linker _linker
         object _options  # ProgramOptions
         object __weakref__
diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd
index d573862d16..f897a688b3 100644
--- a/cuda_core/cuda/core/_resource_handles.pxd
+++ b/cuda_core/cuda/core/_resource_handles.pxd
@@ -10,6 +10,7 @@ from libcpp.memory cimport shared_ptr
 from cuda.bindings cimport cydriver
 from cuda.bindings cimport cynvrtc
 from cuda.bindings cimport cynvvm
+from cuda.bindings cimport cynvjitlink
 
 
 # =============================================================================
@@ -26,7 +27,17 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle
     ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle
     ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle
-    ctypedef shared_ptr[const cynvvm.nvvmProgram] NvvmProgramHandle
+
+    # NvvmProgramValue and NvJitLinkValue are TaggedHandle<void*, Tag>
+    # instantiations that make each shared_ptr type distinct for overloading.
+    cppclass NvvmProgramValue "cuda_core::NvvmProgramValue":
+        pass
+    cppclass NvJitLinkValue "cuda_core::NvJitLinkValue":
+        pass
+    ctypedef shared_ptr[const NvvmProgramValue] NvvmProgramHandle
+    ctypedef shared_ptr[const NvJitLinkValue] NvJitLinkHandle
+
+    ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle
 
     # as_cu() - extract the raw CUDA handle (inline C++)
     cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil
@@ -38,6 +49,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil
     cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) noexcept nogil
     cynvvm.nvvmProgram as_cu(NvvmProgramHandle h) noexcept nogil
+    cynvjitlink.nvJitLinkHandle as_cu(NvJitLinkHandle h) noexcept nogil
+    cydriver.CUlinkState as_cu(CuLinkHandle h) noexcept nogil
 
     # as_intptr() - extract handle as intptr_t for Python interop (inline C++)
     intptr_t as_intptr(ContextHandle h) noexcept nogil
@@ -49,6 +62,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     intptr_t as_intptr(KernelHandle h) noexcept nogil
     intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil
     intptr_t as_intptr(NvvmProgramHandle h) noexcept nogil
+    intptr_t as_intptr(NvJitLinkHandle h) noexcept nogil
+    intptr_t as_intptr(CuLinkHandle h) noexcept nogil
 
     # as_py() - convert handle to Python wrapper object (inline C++; requires GIL)
     object as_py(ContextHandle h)
@@ -60,6 +75,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     object as_py(KernelHandle h)
     object as_py(NvrtcProgramHandle h)
     object as_py(NvvmProgramHandle h)
+    object as_py(NvJitLinkHandle h)
+    object as_py(CuLinkHandle h)
 
 
 # =============================================================================
@@ -130,3 +147,11 @@ cdef NvrtcProgramHandle create_nvrtc_program_handle_ref(cynvrtc.nvrtcProgram pro
 # NVVM Program handles
 cdef NvvmProgramHandle create_nvvm_program_handle(cynvvm.nvvmProgram prog) except+ nogil
 cdef NvvmProgramHandle create_nvvm_program_handle_ref(cynvvm.nvvmProgram prog) except+ nogil
+
+# nvJitLink handles
+cdef NvJitLinkHandle create_nvjitlink_handle(cynvjitlink.nvJitLinkHandle handle) except+ nogil
+cdef NvJitLinkHandle create_nvjitlink_handle_ref(cynvjitlink.nvJitLinkHandle handle) except+ nogil
+
+# cuLink handles
+cdef CuLinkHandle create_culink_handle(cydriver.CUlinkState state) except+ nogil
+cdef CuLinkHandle create_culink_handle_ref(cydriver.CUlinkState state) except+ nogil
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx
index 2652d4448e..ff2c568d03 100644
--- a/cuda_core/cuda/core/_resource_handles.pyx
+++ b/cuda_core/cuda/core/_resource_handles.pyx
@@ -16,6 +16,7 @@ from libc.stddef cimport size_t
 from cuda.bindings cimport cydriver
 from cuda.bindings cimport cynvrtc
 from cuda.bindings cimport cynvvm
+from cuda.bindings cimport cynvjitlink
 
 from ._resource_handles cimport (
     ContextHandle,
@@ -27,11 +28,14 @@ from ._resource_handles cimport (
     KernelHandle,
     NvrtcProgramHandle,
     NvvmProgramHandle,
+    NvJitLinkHandle,
+    CuLinkHandle,
 )
 
 import cuda.bindings.cydriver as cydriver
 import cuda.bindings.cynvrtc as cynvrtc
 import cuda.bindings.cynvvm as cynvvm
+import cuda.bindings.cynvjitlink as cynvjitlink
 
 # =============================================================================
 # C++ function declarations (non-inline, implemented in resource_handles.cpp)
@@ -125,6 +129,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     NvvmProgramHandle create_nvvm_program_handle_ref "cuda_core::create_nvvm_program_handle_ref" (
         cynvvm.nvvmProgram prog) except+ nogil
 
+    # nvJitLink handles
+    NvJitLinkHandle create_nvjitlink_handle "cuda_core::create_nvjitlink_handle" (
+        cynvjitlink.nvJitLinkHandle handle) except+ nogil
+    NvJitLinkHandle create_nvjitlink_handle_ref "cuda_core::create_nvjitlink_handle_ref" (
+        cynvjitlink.nvJitLinkHandle handle) except+ nogil
+
+    # cuLink handles
+    CuLinkHandle create_culink_handle "cuda_core::create_culink_handle" (
+        cydriver.CUlinkState state) except+ nogil
+    CuLinkHandle create_culink_handle_ref "cuda_core::create_culink_handle_ref" (
+        cydriver.CUlinkState state) except+ nogil
+
 
 # =============================================================================
 # CUDA Driver API capsule
@@ -192,12 +208,18 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
     void* p_cuLibraryUnload "reinterpret_cast<void*&>(cuda_core::p_cuLibraryUnload)"
     void* p_cuLibraryGetKernel "reinterpret_cast<void*&>(cuda_core::p_cuLibraryGetKernel)"
 
+    # Linker
+    void* p_cuLinkDestroy "reinterpret_cast<void*&>(cuda_core::p_cuLinkDestroy)"
+
     # NVRTC
     void* p_nvrtcDestroyProgram "reinterpret_cast<void*&>(cuda_core::p_nvrtcDestroyProgram)"
 
     # NVVM
     void* p_nvvmDestroyProgram "reinterpret_cast<void*&>(cuda_core::p_nvvmDestroyProgram)"
 
+    # nvJitLink
+    void* p_nvJitLinkDestroy "reinterpret_cast<void*&>(cuda_core::p_nvJitLinkDestroy)"
+
 
 # Initialize driver function pointers from cydriver.__pyx_capi__ at module load
 cdef void* _get_driver_fn(str name):
@@ -248,6 +270,9 @@ p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData")
 p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload")
 p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel")
 
+# Linker
+p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy")
+
 # =============================================================================
 # NVRTC function pointer initialization
 # =============================================================================
@@ -270,3 +295,16 @@ cdef void* _get_nvvm_fn(str name):
     return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule))
 
 p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram")
+
+# =============================================================================
+# nvJitLink function pointer initialization
+#
+# nvJitLink may not be available at runtime, so we handle missing function
+# pointers gracefully. The C++ deleter checks for null before calling.
+# =============================================================================
+
+cdef void* _get_nvjitlink_fn(str name):
+    capsule = cynvjitlink.__pyx_capi__[name]  # PyCapsule that cynvjitlink exports under `name`
+    return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule))  # raw C pointer held by the capsule
+
+p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy")
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd
index a42bbf2dd0..478ce705af 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd
@@ -6,7 +6,7 @@ cimport cpython
 from cpython.object cimport PyObject
 from libc.stdint cimport int64_t, int32_t
 
-from cuda.bindings cimport cydriver, cynvrtc, cynvvm
+from cuda.bindings cimport cydriver, cynvrtc, cynvvm, cynvjitlink
 
 
 ctypedef fused integer_t:
@@ -21,6 +21,8 @@ cdef const cydriver.CUcontext CU_CONTEXT_INVALID = <cydriver.CUcontext>(-2)
 cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil
 cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil
 cdef int HANDLE_RETURN_NVVM(cynvvm.nvvmProgram prog, cynvvm.nvvmResult err) except?-1 nogil
+cdef int HANDLE_RETURN_NVJITLINK(
+    cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil
 
 
 # TODO: stop exposing these within the codebase?
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx
index 734ae32f79..3134308b55 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx
@@ -21,8 +21,9 @@ except ImportError:
     from cuda import nvrtc
 
 from cuda.bindings.nvvm import nvvmError
+from cuda.bindings.nvjitlink import nvJitLinkError
 
-from cuda.bindings cimport cynvrtc, cynvvm
+from cuda.bindings cimport cynvrtc, cynvvm, cynvjitlink
 
 from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS
 from cuda.core._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS
@@ -119,6 +120,34 @@ cdef int _raise_nvvm_error(cynvvm.nvvmProgram prog, cynvvm.nvvmResult err) excep
     raise exc
 
 
+cdef int HANDLE_RETURN_NVJITLINK(
+        cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil:
+    """Return 0 when ``err`` is NVJITLINK_SUCCESS; otherwise take the GIL and raise nvJitLinkError."""
+    if err != cynvjitlink.nvJitLinkResult.NVJITLINK_SUCCESS:
+        with gil:
+            _raise_nvjitlink_error(handle, err)
+    return 0
+
+
+cdef int _raise_nvjitlink_error(
+        cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except -1:
+    """Raise nvJitLinkError for ``err``, appending the handle's error log when one can be read."""
+    cdef size_t logsize = 0
+    if handle != NULL:
+        cynvjitlink.nvJitLinkGetErrorLogSize(handle, &logsize)  # best effort: return code is ignored
+    cdef bytearray log_buf  # mutable: bytes objects are immutable and must not be written through char*
+    cdef str log_str = ""
+    if logsize > 1 and handle != NULL:
+        log_buf = bytearray(logsize)
+        if cynvjitlink.nvJitLinkGetErrorLog(handle, <char*>log_buf) == \
+                cynvjitlink.nvJitLinkResult.NVJITLINK_SUCCESS:
+            log_str = log_buf.decode("utf-8", errors="backslashreplace")
+    cdef object exc = nvJitLinkError(err)
+    if log_str:
+        exc.args = (exc.args[0] + f"\nnvJitLink error log: {log_str}", *exc.args[1:])
+    raise exc
+
+
 cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess
 cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS
 
diff --git a/cuda_core/docs/source/release/0.6.x-notes.rst b/cuda_core/docs/source/release/0.6.x-notes.rst
index 49ea8b3be9..69b62a700b 100644
--- a/cuda_core/docs/source/release/0.6.x-notes.rst
+++ b/cuda_core/docs/source/release/0.6.x-notes.rst
@@ -17,8 +17,8 @@ Breaking Changes
 ----------------
 
 - Building ``cuda.core`` from source now requires ``cuda-bindings`` >= 12.9.0, due to Cython-level
-  dependencies on the NVVM bindings (``cynvvm``). Pre-built wheels are unaffected. The previous
-  minimum was 12.8.0.
+  dependencies on the NVVM and nvJitLink bindings (``cynvvm``, ``cynvjitlink``). Pre-built wheels
+  are unaffected. The previous minimum was 12.8.0.
 
 
 New features
@@ -36,6 +36,10 @@ None.
 Fixes and enhancements
 ----------------------
 
+- Reduced Python overhead in :class:`Program` and :class:`Linker` by moving compilation and
+  linking operations to the C level and releasing the GIL during backend calls. This benefits
+  workloads that create many programs or linkers, and enables concurrent compilation in
+  multithreaded applications.
 - Wheel and installed package sizes significantly reduced (e.g., on a typical Linux x86_64
   build, wheel from ~4.6 MB to ~1.6 MB and installed from ~26 MB to ~4.4 MB) by excluding
   Cython source files, generated C++ files, and other build artifacts from distribution