diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
new file mode 100644
index 0000000000..4435b7e433
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import sys
+
+from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+
+if IS_WINDOWS:
+    from cuda.pathfinder._dynamic_libs.load_dl_windows import load_with_system_search
+else:
+    from cuda.pathfinder._dynamic_libs.load_dl_linux import load_with_system_search
+
+
+def _probe_canary_abs_path(libname: str) -> str | None:
+    loaded: LoadedDL | None = load_with_system_search(libname)
+    if loaded is None:
+        return None
+    abs_path = loaded.abs_path
+    if not isinstance(abs_path, str):
+        return None
+    return abs_path
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = sys.argv[1:] if argv is None else argv
+    if len(args) != 1:
+        return 2
+    print(json.dumps(_probe_canary_abs_path(args[0])))  # noqa: T201
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
index 65c9f4bf3c..6265992e4a 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
@@ -101,7 +101,7 @@ def _find_lib_dir_using_anchor_point(libname: str, anchor_point: str, linux_lib_
     for rel_path in rel_paths:
         for dirname in sorted(glob.glob(os.path.join(anchor_point, rel_path))):
             if os.path.isdir(dirname):
-                return dirname
+                return os.path.normpath(dirname)
     return None
 
 
@@ -152,6 +152,57 @@ def _find_dll_using_lib_dir(
     return None
 
 
+def _derive_ctk_root_linux(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path on Linux.
+
+    Standard system CTK layout: ``$CTK_ROOT/lib64/libfoo.so.XX``
+    (some installs use ``lib`` instead of ``lib64``).
+
+    Returns None if the path doesn't match a recognized layout.
+    """
+    lib_dir = os.path.dirname(resolved_lib_path)
+    basename = os.path.basename(lib_dir)
+    if basename in ("lib64", "lib"):
+        return os.path.dirname(lib_dir)
+    return None
+
+
+def _derive_ctk_root_windows(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path on Windows.
+
+    Handles two CTK layouts:
+    - CTK 13: ``$CTK_ROOT/bin/x64/foo.dll``
+    - CTK 12: ``$CTK_ROOT/bin/foo.dll``
+
+    Returns None if the path doesn't match a recognized layout.
+
+    Uses ``ntpath`` explicitly so the function is testable on any platform.
+    """
+    import ntpath
+
+    lib_dir = ntpath.dirname(resolved_lib_path)
+    basename = ntpath.basename(lib_dir).lower()
+    if basename == "x64":
+        parent = ntpath.dirname(lib_dir)
+        if ntpath.basename(parent).lower() == "bin":
+            return ntpath.dirname(parent)
+    elif basename == "bin":
+        return ntpath.dirname(lib_dir)
+    return None
+
+
+def derive_ctk_root(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path.
+
+    Given the absolute path of a loaded CTK shared library, walk up the
+    directory tree to find the CTK root. Returns None if the path doesn't
+    match any recognized CTK directory layout.
+    """
+    if IS_WINDOWS:
+        return _derive_ctk_root_windows(resolved_lib_path)
+    return _derive_ctk_root_linux(resolved_lib_path)
+
+
 class _FindNvidiaDynamicLib:
     def __init__(self, libname: str):
         self.libname = libname
@@ -185,6 +236,16 @@ def try_with_conda_prefix(self) -> str | None:
     def try_with_cuda_home(self) -> str | None:
         return self._find_using_lib_dir(_find_lib_dir_using_cuda_home(self.libname))
 
+    def try_via_ctk_root(self, ctk_root: str) -> str | None:
+        """Find the library under a derived CTK root directory.
+
+        Uses :func:`_find_lib_dir_using_anchor_point` which already knows
+        about non-standard sub-paths (e.g. ``nvvm/lib64`` for nvvm).
+        """
+        return self._find_using_lib_dir(
+            _find_lib_dir_using_anchor_point(self.libname, anchor_point=ctk_root, linux_lib_dir="lib64")
+        )
+
     def _find_using_lib_dir(self, lib_dir: str | None) -> str | None:
         if lib_dir is None:
             return None
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
index 3431c2f86b..c50e253810 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
@@ -2,10 +2,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import functools
+import json
 import struct
+import subprocess
 import sys
 
-from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
 from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies
 from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
 
@@ -22,6 +27,73 @@
     load_with_system_search,
 )
 
+# Libs that reside on the standard linker path in system CTK installs.
+# Used to discover the CTK root when a lib with a non-standard path
+# (e.g. nvvm under $CTK_ROOT/nvvm/lib64) can't be found directly.
+_CTK_ROOT_CANARY_LIBNAMES = ("cudart",)
+
+
+def _resolve_system_loaded_abs_path_in_subprocess(libname: str) -> str | None:
+    """Resolve a library's system-search absolute path in a child process.
+
+    This keeps any side effects of loading the canary library scoped to the
+    child process instead of polluting the current process.
+    """
+    cmd = [
+        sys.executable,
+        "-m",
+        "cuda.pathfinder._dynamic_libs.canary_probe_subprocess",
+        libname,
+    ]
+    try:
+        result = subprocess.run(  # noqa: S603
+            cmd,
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=10.0,
+        )
+    except (OSError, subprocess.SubprocessError):
+        return None
+    if result.returncode != 0:
+        return None
+
+    # Read the final non-empty stdout line in case earlier lines are emitted.
+    lines = [line for line in result.stdout.splitlines() if line.strip()]
+    if not lines:
+        return None
+    try:
+        payload = json.loads(lines[-1])
+    except json.JSONDecodeError:
+        return None
+    if isinstance(payload, str):
+        return payload
+    return None
+
+
+def _try_ctk_root_canary(finder: _FindNvidiaDynamicLib) -> str | None:
+    """Derive the CTK root from a system-installed canary lib.
+
+    For libs like nvvm whose shared object doesn't reside on the standard
+    linker path, we locate a well-known CTK lib that IS on the linker path
+    via system search, derive the CTK installation root from its resolved
+    path, and then look for the target lib relative to that root.
+
+    The canary load is performed in a subprocess to avoid introducing loader
+    state into the current process.
+    """
+    for canary_libname in _CTK_ROOT_CANARY_LIBNAMES:
+        canary_abs_path = _resolve_system_loaded_abs_path_in_subprocess(canary_libname)
+        if canary_abs_path is None:
+            continue
+        ctk_root = derive_ctk_root(canary_abs_path)
+        if ctk_root is None:
+            continue
+        abs_path: str | None = finder.try_via_ctk_root(ctk_root)
+        if abs_path is not None:
+            return abs_path
+    return None
+
 
 def _load_lib_no_cache(libname: str) -> LoadedDL:
     finder = _FindNvidiaDynamicLib(libname)
@@ -50,11 +122,21 @@ def _load_lib_no_cache(libname: str) -> LoadedDL:
     loaded = load_with_system_search(libname)
     if loaded is not None:
         return loaded
+
     abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        finder.raise_not_found_error()
-    else:
+    if abs_path is not None:
         found_via = "CUDA_HOME"
+    else:
+        # Canary probe: if the direct system search and CUDA_HOME both
+        # failed (e.g. nvvm isn't on the linker path and CUDA_HOME is
+        # unset), try to discover the CTK root by loading a well-known CTK
+        # lib in a subprocess, then look for the target lib relative to
+        # that root.
+        abs_path = _try_ctk_root_canary(finder)
+        if abs_path is not None:
+            found_via = "system-ctk-root"
+        else:
+            finder.raise_not_found_error()
 
     return load_with_abs_path(libname, abs_path, found_via)
 
@@ -123,6 +205,14 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
 
        - If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
 
+    5. **CTK root canary probe**
+
+       - For libraries whose shared object doesn't reside on the standard
+         linker path (e.g. ``libnvvm.so`` lives under ``$CTK_ROOT/nvvm/lib64``),
+         attempt to discover the CTK installation root by system-loading a
+         well-known CTK library (``cudart``) in a subprocess, then derive
+         the root from its resolved absolute path.
+
     Notes:
         The search is performed **per library**. There is currently no mechanism
         to guarantee that multiple libraries are all resolved from the same location.
diff --git a/cuda_pathfinder/tests/test_ctk_root_discovery.py b/cuda_pathfinder/tests/test_ctk_root_discovery.py
new file mode 100644
index 0000000000..71a61c86c0
--- /dev/null
+++ b/cuda_pathfinder/tests/test_ctk_root_discovery.py
@@ -0,0 +1,276 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _derive_ctk_root_linux,
+    _derive_ctk_root_windows,
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
+from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
+from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
+    _load_lib_no_cache,
+    _try_ctk_root_canary,
+)
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+
+_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"
+_FIND_MODULE = "cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib"
+
+
+# ---------------------------------------------------------------------------
+# Platform-aware test helpers
+# ---------------------------------------------------------------------------
+
+
+def _create_nvvm_in_ctk(ctk_root):
+    """Create a fake nvvm lib in the platform-appropriate CTK subdirectory."""
+    if IS_WINDOWS:
+        nvvm_dir = ctk_root / "nvvm" / "bin"
+        nvvm_dir.mkdir(parents=True)
+        nvvm_lib = nvvm_dir / "nvvm64.dll"
+    else:
+        nvvm_dir = ctk_root / "nvvm" / "lib64"
+        nvvm_dir.mkdir(parents=True)
+        nvvm_lib = nvvm_dir / "libnvvm.so"
+    nvvm_lib.write_bytes(b"fake")
+    return nvvm_lib
+
+
+def _create_cudart_in_ctk(ctk_root):
+    """Create a fake cudart lib in the platform-appropriate CTK subdirectory."""
+    if IS_WINDOWS:
+        lib_dir = ctk_root / "bin"
+        lib_dir.mkdir(parents=True)
+        lib_file = lib_dir / "cudart64_12.dll"
+    else:
+        lib_dir = ctk_root / "lib64"
+        lib_dir.mkdir(parents=True)
+        lib_file = lib_dir / "libcudart.so"
+    lib_file.write_bytes(b"fake")
+    return lib_file
+
+
+def _fake_canary_path(ctk_root):
+    """Return the path a system-loaded canary lib would resolve to."""
+    if IS_WINDOWS:
+        return str(ctk_root / "bin" / "cudart64_13.dll")
+    return str(ctk_root / "lib64" / "libcudart.so.13")
+
+
+# ---------------------------------------------------------------------------
+# derive_ctk_root
+# ---------------------------------------------------------------------------
+
+
+def test_derive_ctk_root_linux_lib64():
+    assert _derive_ctk_root_linux("/usr/local/cuda-13/lib64/libcudart.so.13") == "/usr/local/cuda-13"
+
+
+def test_derive_ctk_root_linux_lib():
+    assert _derive_ctk_root_linux("/opt/cuda/lib/libcudart.so.12") == "/opt/cuda"
+
+
+def test_derive_ctk_root_linux_unrecognized():
+    assert _derive_ctk_root_linux("/some/weird/path/libcudart.so.13") is None
+
+
+def test_derive_ctk_root_linux_root_level():
+    assert _derive_ctk_root_linux("/lib64/libcudart.so.13") == "/"
+
+
+def test_derive_ctk_root_windows_ctk13():
+    path = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64\cudart64_13.dll"
+    assert _derive_ctk_root_windows(path) == r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0"
+
+
+def test_derive_ctk_root_windows_ctk12():
+    path = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\cudart64_12.dll"
+    assert _derive_ctk_root_windows(path) == r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
+
+
+def test_derive_ctk_root_windows_unrecognized():
+    assert _derive_ctk_root_windows(r"C:\weird\cudart64_13.dll") is None
+
+
+def test_derive_ctk_root_windows_case_insensitive_bin():
+    assert _derive_ctk_root_windows(r"C:\CUDA\Bin\cudart64_12.dll") == r"C:\CUDA"
+
+
+def test_derive_ctk_root_windows_case_insensitive_x64():
+    assert _derive_ctk_root_windows(r"C:\CUDA\BIN\X64\cudart64_13.dll") == r"C:\CUDA"
+
+
+def test_derive_ctk_root_dispatches_to_linux(mocker):
+    mocker.patch(f"{_FIND_MODULE}.IS_WINDOWS", False)
derive_ctk_root("/usr/local/cuda/lib64/libcudart.so.13") == "/usr/local/cuda" + + +def test_derive_ctk_root_dispatches_to_windows(mocker): + mocker.patch(f"{_FIND_MODULE}.IS_WINDOWS", True) + assert derive_ctk_root(r"C:\CUDA\v13\bin\cudart64_13.dll") == r"C:\CUDA\v13" + + +# --------------------------------------------------------------------------- +# _FindNvidiaDynamicLib.try_via_ctk_root +# --------------------------------------------------------------------------- + + +def test_try_via_ctk_root_finds_nvvm(tmp_path): + ctk_root = tmp_path / "cuda-13" + nvvm_lib = _create_nvvm_in_ctk(ctk_root) + + assert _FindNvidiaDynamicLib("nvvm").try_via_ctk_root(str(ctk_root)) == str(nvvm_lib) + + +def test_try_via_ctk_root_returns_none_when_dir_missing(tmp_path): + ctk_root = tmp_path / "cuda-13" + ctk_root.mkdir() + + assert _FindNvidiaDynamicLib("nvvm").try_via_ctk_root(str(ctk_root)) is None + + +def test_try_via_ctk_root_regular_lib(tmp_path): + ctk_root = tmp_path / "cuda-13" + cudart_lib = _create_cudart_in_ctk(ctk_root) + + assert _FindNvidiaDynamicLib("cudart").try_via_ctk_root(str(ctk_root)) == str(cudart_lib) + + +# --------------------------------------------------------------------------- +# _try_ctk_root_canary +# --------------------------------------------------------------------------- + + +def _make_loaded_dl(path, found_via): + return LoadedDL(path, False, 0xDEAD, found_via) + + +def test_canary_finds_nvvm(tmp_path, mocker): + ctk_root = tmp_path / "cuda-13" + _create_cudart_in_ctk(ctk_root) + nvvm_lib = _create_nvvm_in_ctk(ctk_root) + + probe = mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(ctk_root), + ) + parent_system_loader = mocker.patch(f"{_MODULE}.load_with_system_search") + + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) == str(nvvm_lib) + probe.assert_called_once_with("cudart") + parent_system_loader.assert_not_called() + + +def test_canary_returns_none_when_subprocess_probe_fails(mocker): + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_returns_none_when_ctk_root_unrecognized(mocker): + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value="/weird/path/libcudart.so.13", + ) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_returns_none_when_nvvm_not_in_ctk_root(tmp_path, mocker): + ctk_root = tmp_path / "cuda-13" + # Create only the canary lib dir, not nvvm + _create_cudart_in_ctk(ctk_root) + + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(ctk_root), + ) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_skips_when_abs_path_none(mocker): + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +# --------------------------------------------------------------------------- +# _load_lib_no_cache search-order +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _isolate_load_cascade(mocker): + """Disable the search steps that run before system-search in _load_lib_no_cache. + + This lets the ordering tests focus on system-search, CUDA_HOME, and the + canary probe without needing a real site-packages or conda environment. 
+ """ + # No wheels installed + mocker.patch.object(_FindNvidiaDynamicLib, "try_site_packages", return_value=None) + # No conda env + mocker.patch.object(_FindNvidiaDynamicLib, "try_with_conda_prefix", return_value=None) + # Lib not already loaded by another component + mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None) + # Skip transitive dependency loading + mocker.patch(f"{_MODULE}.load_dependencies") + + +@pytest.mark.usefixtures("_isolate_load_cascade") +def test_cuda_home_takes_priority_over_canary(tmp_path, mocker): + # Two competing CTK roots: one from CUDA_HOME, one the canary would find. + cuda_home_root = tmp_path / "cuda-home" + nvvm_home_lib = _create_nvvm_in_ctk(cuda_home_root) + + canary_root = tmp_path / "cuda-system" + _create_cudart_in_ctk(canary_root) + _create_nvvm_in_ctk(canary_root) + + canary_mock = mocker.MagicMock(return_value=_fake_canary_path(canary_root)) + + # System search finds nothing for nvvm. + mocker.patch(f"{_MODULE}.load_with_system_search", return_value=None) + # Canary subprocess probe would find cudart if consulted. + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", side_effect=canary_mock) + # CUDA_HOME points to a separate root that also has nvvm + mocker.patch(f"{_FIND_MODULE}.get_cuda_home_or_path", return_value=str(cuda_home_root)) + # Capture the final load call + mocker.patch( + f"{_MODULE}.load_with_abs_path", + side_effect=lambda _libname, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("nvvm") + + # CUDA_HOME must win; the canary should never have been consulted + assert result.found_via == "CUDA_HOME" + assert result.abs_path == str(nvvm_home_lib) + canary_mock.assert_not_called() + + +@pytest.mark.usefixtures("_isolate_load_cascade") +def test_canary_fires_only_after_all_earlier_steps_fail(tmp_path, mocker): + canary_root = tmp_path / "cuda-system" + _create_cudart_in_ctk(canary_root) + nvvm_lib = _create_nvvm_in_ctk(canary_root) + + # System search: nvvm not on linker path. + mocker.patch(f"{_MODULE}.load_with_system_search", return_value=None) + # Canary subprocess probe finds cudart under a system CTK root. + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(canary_root), + ) + # No CUDA_HOME set + mocker.patch(f"{_FIND_MODULE}.get_cuda_home_or_path", return_value=None) + # Capture the final load call + mocker.patch( + f"{_MODULE}.load_with_abs_path", + side_effect=lambda _libname, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("nvvm") + + assert result.found_via == "system-ctk-root" + assert result.abs_path == str(nvvm_lib)