| + +### Pythonic Tracing DSL +```python +from neutrino import probe, Map +import neutrino.language as nl +CALLBACK = "block_sched.py" # for trace analysis +# declare maps for persistence +@Map(level="warp", type="array", size=16, cap=1) +class block_sched: + start: nl.u64 + elapsed: nl.u32 + cuid: nl.u32 +# declare probe registers shared across probes +start: nl.u64 = 0 # starting clock +elapsed: nl.u64 = 0 # elapsed time, initialized to 0 +# define probes with decorator +@probe(pos="kernel", level="warp", before=True) +def thread_start(): + start = nl.clock() +@probe(pos="kernel", level="warp") +def thread_end(): + elapsed = nl.clock() - start + block_sched.save(start, elapsed, nl.cuid()) +``` + + | ++ +### Direct Assembly wrapped in TOML +```toml +# CUDA PTX Assembly Example +callback="block_sched.py" +[ map.block_sched ] +type = "array" +level = "warp" +size = "16" +cap = "1" +[ probe.thread_start_thread_end ] +position = "kernel" +level = "warp" +register = {"u32": 2, "u64": 3} +before = """.reg .b64 %PD<3>; +.reg .b32 %P<2>; +mov.u64 %PD0, %clock64;""" +after = """mov.u64 %PD1, %clock64; +sub.u64 %PD1, %PD1, %PD0; +cvt.u32.u64 %P1, %PD1; +mov.u32 %P2, %smid; +SAVE [ block_sched ] {%PD0, %P1, %P2};""" +``` + + | +
| + +### Hardware + + +| Hardware Platform | Support Status | +| --- | --- | +| NVIDIA/CUDA/PTX | ✅ Supported | +| AMD/ROCm/GCNAsm | 🛠️ Testing | +| General/OpenCL/SPIR-V | 🚀 Planning | + + | ++ +### Software + +| Software Framework | Status | +| --- | --- | +| cuBLAS/cuFFT/cuSparse... | ❌ (no plan for supporting) | +| CUTLASS | ✅ (with macro in building) | +| PyTorch family (torchvision...) | ✅ (with custom build) | +| JAX | ✅ (with envariable in runtime) | +| Triton | ✅ | + + | +
+
+The source code is placed in the following structure:
+
+```
+neutrino
+├── language # DSL and Compiler, Still in Testing
+│ ├── __init__.py # DSL Primitive
+│ ├── compiler.py # Exported Compiler API
+│ ├── frontend.py # Parser and AST Transformer
+│ ├── gcn.py # AMD ROCm Codegen Backend
+│ └── ptx.py # CUDA PTX Codegen Backend
+├── probe # Probe Engine
+│ ├── __init__.py # Common Definition and Utilities
+│ ├── cuda.py # CUDA PTX Impl
+│ └── hip.py # AMD ROCm Impl
+├── src # Hook Driver
+│ ├── common.h # Platform-agnostic Definition (GNU-only)
+│ ├── cuda.c # CUDA Impl (NVIDIA-related)
+│ ├── hip.c # ROCm Impl (AMD-related)
+│ ├── preload.c # Injector via LD_PRELOAD
+│ ├── parse.py # Generate Unhook API (NVIDIA/AMD)
+│ ├── sha1.h # third-parties header-only library
+│ └── uthash.h # third-parties header-only library
+├── build.py # Builder for driver in src/
+├── cli.py # Command Line Interface Entry
+├── common.py # Common Internal API not for User import
+└── __init__.py # Common Defn for user import like probe, Map
+```
+
+The overall structure is clean and approachable; we welcome developers to hack the system for their needs. Raise issues if you need help.
+
+## More
+
+* How to write my probe? Check the [Probe Writing Guide](https://open-neutrino.github.io/docs/write-probes).
+* How are probes executed? Check the [Probe Execution Model](https://open-neutrino.github.io/docs/execute-model).
+* How to read the neutrino trace? Check the [Trace File Structure](https://open-neutrino.github.io/docs/read-trace).
+* How does Neutrino work and how to extend it? Check the [Reference and Internals](https://open-neutrino.github.io/docs/system-workflow).
+* How good is Neutrino? Check the [Utilities and Extensions](https://open-neutrino.github.io/docs/analysis-code)
+
+## Citation
+If you used Neutrino in your research, please cite the paper below. And we welcome you to send us a link to your paper.
+```
+@inproceedings{huang2025neutrino,
+ author = {Songlin Huang and Chenshu Wu},
+ title = {Neutrino: Fine-grained GPU Kernel Profiling via Programmable Probing},
+ booktitle = {19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)},
+ year = {2025},
+ url = {https://www.usenix.org/conference/osdi25/presentation/huang-songlin},
+ publisher = {USENIX Association},
+}
+```
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py
new file mode 100644
index 0000000..2ff4f9b
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py
@@ -0,0 +1,54 @@
+# Neutrino Generated Code for Reading Trace
+import struct
+from typing import NamedTuple, List, Tuple
+from neutrino import TraceHeader, TraceSection
class block_sched(NamedTuple):
    """One trace record emitted by the block_sched probe."""
    start: int    # starting clock (stored as u64 in the trace file)
    elapsed: int  # elapsed cycles (stored as u32 in the trace file)
    cuid: int     # compute-unit / SM id (stored as u32 in the trace file)
def parse(path: str):
    """Parse a neutrino trace file into (header, sections, records).

    Layout: a 32-byte header (8 x i32), one (size, offset) entry (2 x u64)
    per probe, then gridSize x blockSize fixed 16-byte records.
    """
    with open(path, "rb") as f:
        # splat the unpacked tuple -- TraceHeader declares 8 scalar fields,
        # passing the tuple itself would raise a TypeError
        header: TraceHeader = TraceHeader(
            *struct.unpack("iiiiiiii", f.read(32)))
        sections: List[TraceSection] = []
        for _ in range(header.numProbes):
            size, offset = struct.unpack("QQ", f.read(16))
            # NOTE(review): TraceSection declares (size, warpDiv, offset) but
            # only two values are stored on disk here -- confirm the format
            sections.append(TraceSection(size, offset))
        # parenthesized continuations (the original had orphan lines)
        gridSize = (header.gridDimX * header.gridDimY
                    * header.gridDimZ)
        blockSize = (header.blockDimX * header.blockDimY
                     * header.blockDimZ)
        records: List[List[block_sched]] = []
        for i in range(gridSize):
            records.append([])
            for j in range(blockSize):
                start, elapsed, cuid = struct.unpack(
                    "QII", f.read(16))
                records[i].append(
                    block_sched(start, elapsed, cuid))
    return header, sections, records
+# END OF GENERATED CODE
import sys

import numpy as np

header, sections, records = parse(sys.argv[1])
unique_cus = set()
for block in records:
    unique_cus.add(block[0].cuid)
# NOTE(review): the per-CU lists below are indexed by raw cuid values --
# this assumes cuids are contiguous in [0, len(unique_cus)); confirm
# against the device's SM numbering.
# Use a comprehension: `[[]] * n` would alias ONE list n times.
cu_timelines = [[] for _ in range(len(unique_cus))]
sched_times = [0.0] * len(unique_cus)
work_times = [0.0] * len(unique_cus)
for cur in records:
    sched_out = False
    for block in cu_timelines[cur[0].cuid]:
        if block.start + block.elapsed <= cur[0].start:
            # gap between the previous block retiring and this one starting
            # (the original's broken line continuation dropped the subtraction)
            sched_times[cur[0].cuid] += (cur[0].start
                                         - (block.start + block.elapsed))
            cu_timelines[cur[0].cuid].remove(block)
            cu_timelines[cur[0].cuid].append(cur[0])
            work_times[cur[0].cuid] += cur[0].elapsed
            sched_out = True
            break
    if not sched_out:
        cu_timelines[cur[0].cuid].append(cur[0])
        work_times[cur[0].cuid] += cur[0].elapsed
print(np.array(sched_times).mean(),
      np.array(work_times).mean())
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm
new file mode 100644
index 0000000..256a044
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm
@@ -0,0 +1 @@
+CALLBACK = "block_sched.py"
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py
new file mode 100644
index 0000000..a829336
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py
@@ -0,0 +1,29 @@
import torch
import triton
import triton.language as tl


@triton.jit
def zero_persistent_kernel(output_ptr, numel,
        BLOCK_SIZE: tl.constexpr, NUM_SMS: tl.constexpr):
    """Persistent kernel: each program zeroes every NUM_SMS-th block."""
    start_pid = tl.program_id(axis=0)
    num_blocks = tl.cdiv(numel, BLOCK_SIZE)
    blocks_per_sm = num_blocks // NUM_SMS
    # the first (num_blocks % NUM_SMS) programs take one extra block
    if start_pid < num_blocks % NUM_SMS:
        blocks_per_sm += 1
    block_id = start_pid - NUM_SMS  # first += below brings it to start_pid
    for _ in range(blocks_per_sm):
        block_id += NUM_SMS
        offsets = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < numel
        tl.store(output_ptr + offsets,
                 tl.zeros([BLOCK_SIZE], dtype=tl.float16), mask)


def zero_persistent(x: torch.Tensor):
    """Zero-fill `x` with one persistent program per SM."""
    numel = x.numel()
    NUM_SMS = torch.cuda.get_device_properties("cuda")\
        .multi_processor_count
    BLOCK_SIZE = 128
    grid = lambda META: (min(NUM_SMS,
        triton.cdiv(numel, META['BLOCK_SIZE'])),)
    zero_persistent_kernel[grid](
        x, numel, BLOCK_SIZE, NUM_SMS)


# FIX: dtype must be passed as a keyword -- torch.empty(*size, ...) treats
# extra positional args as dimensions, so the original raised a TypeError
t = torch.empty((4096, 4096), dtype=torch.float16, device="cuda")
zero_persistent(t)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py
new file mode 100644
index 0000000..a6f2479
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py
@@ -0,0 +1,39 @@
+from typing import NamedTuple, Union, Literal
+
class TraceHeader(NamedTuple):
    """Fixed 32-byte trace-file header: launch geometry plus probe count."""
    gridDimX: int
    gridDimY: int
    gridDimZ: int
    blockDimX: int
    blockDimY: int
    blockDimZ: int
    sharedMemBytes: int  # dynamic shared memory bytes of the launch
    numProbes: int       # number of trace sections that follow the header
+
class TraceSection(NamedTuple):
    """Per-probe section entry of a trace file."""
    size: int     # total bytes of this section
    # NOTE(review): generated readers construct TraceSection with only
    # (size, offset) -- confirm whether warpDiv really belongs here
    warpDiv: int
    offset: int   # byte offset of the section within the file
+
def probe(pos: str, after: bool = False, level: str = "thread",
          size: int = 0, before: bool = False):
    """Neutrino probe decorator -- a compile-time marker, never executed.

    The decorated function is consumed as AST by the DSL compiler; the
    decorator arguments (tracepoint position, insertion side, level, size)
    are read from the decorator call itself.  `before` is appended (after
    the original parameters, so positional callers are unaffected) because
    the documented DSL examples use `@probe(..., before=True)`.
    """
    from functools import wraps
    # Prevent execution: the function only exists to be parsed as AST.
    def inner(func: callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            raise RuntimeError(f"{func.__name__} shall be jit other than run")
        return wrapper
    return inner
+
def Map(level: Literal["warp", "thread"], type: str, size: int, cap: Union[int, Literal["dynamic"]]):
    """Neutrino map decorator -- declares trace persistence, never executed.

    Like `probe`, the decorated class only exists to be read as AST by the
    compiler; instantiating it at runtime is always an error.
    """
    from functools import wraps

    def decorate(cls):
        @wraps(cls)
        def blocked(*args, **kwargs):
            raise RuntimeError(f"{cls.__name__} shall be jit other than run")
        return blocked

    return decorate
+
+# Following are internal definition
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py
new file mode 100644
index 0000000..8685e9f
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py
@@ -0,0 +1,129 @@
+import os
+import sys
+import subprocess
+from pprint import pprint
+
# Bootstrap the toml dependency on first run.
try:
    import toml
except ImportError:  # narrow: a bare except would also hide SystemExit etc.
    # pip.main() was removed in pip 10 -- invoke pip through its CLI,
    # which is the supported way to install from a program.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "toml"])
    import toml
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+
def check_command(cmd: str) -> bool:
    """Return True iff `cmd` exists on PATH and exits successfully.

    The original let subprocess.CalledProcessError escape when the binary
    exists but exits non-zero (e.g. nvidia-smi on a GPU-less machine);
    treat that case as "stack not usable" and return False.
    """
    try:
        subprocess.run([cmd], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, text=True, check=True)
        return True
    except FileNotFoundError:
        return False
    except subprocess.CalledProcessError:
        return False
+
# Use rocm-smi or nvidia-smi to detect if installed
if check_command("rocm-smi"):
    NEUTRINO_MODE = "HIP"
    NEUTRINO_DRIVER_HEADER_NAME = "hip/hip_runtime_api.h"
    NEUTRINO_IMPL_SRC = "hip.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libamdhip64.so.6"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/opt/rocm/include/",  # AFAIK, add if new path is met
    ]
    # HIP headers require the platform macro when compiled with plain cc
    extra_flags = ["-D__HIP_PLATFORM_AMD__"]
elif check_command("nvidia-smi"):
    NEUTRINO_MODE = "CUDA"
    NEUTRINO_DRIVER_HEADER_NAME = "cuda.h"
    NEUTRINO_IMPL_SRC = "cuda.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libcuda.so.1"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/usr/local/cuda/targets/x86_64-linux/include/",   # for x86
        "/usr/local/cuda/targets/aarch64-linux/include/",  # for ARM
        # add if missed
    ]
    extra_flags = []
else:
    raise RuntimeError("ONLY SUPPORT CUDA and HIP(AMD-ONLY)")
+
# Internal Configurations
SRC_DIR = os.path.join(CURDIR, "src")
BUILD_DIR = os.path.join(CURDIR, "build")
CC = "cc"  # NOTE don't use nvcc or hipcc, need gcc or clang
PY = sys.executable

# Locate the driver header. os.path.exists handles nested names such as
# "hip/hip_runtime_api.h", which the original os.listdir membership test
# could never match; it also removes the need for a bare `except`.
for dir_ in NEUTRINO_DRIVER_HEADER_SEARCH_PATH:
    if os.path.exists(os.path.join(dir_, NEUTRINO_DRIVER_HEADER_NAME)):
        break
else:
    # keep the last candidate (original fallback behavior), but warn
    print(f"[warn] {NEUTRINO_DRIVER_HEADER_NAME} not found in "
          f"{NEUTRINO_DRIVER_HEADER_SEARCH_PATH}", file=sys.stderr)
# NOTE this will be written in config.toml
NEUTRINO_DRIVER_HEADER_DIR = dir_
+
# NOTE Locate Driver Shared Library
# inspired by: https://github.com/triton-lang/triton/commit/58c54455ffa691be64f90f4e856501162373572c#diff-3d1f29795218f61553ab953426c15fa1e4162b224405b85529022293054da57aR25
# but we need to further locate the real driver library
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
locs = [line.split()[-1] for line in libs.splitlines() if NEUTRINO_HOOK_DRIVER_LIB_NAME in line]
env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
if env_ld_library_path and not locs:
    # fall back to LD_LIBRARY_PATH when ldconfig knows nothing
    locs = [os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME) for dir_ in env_ld_library_path.split(":")
            if os.path.exists(os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME))]

# try to locate the pointed path
NEUTRINO_REAL_DRIVER_LIB_NAME = ""
NEUTRINO_REAL_DRIVER_LIB_DIR = ""
real_libs = []
for loc in locs:
    # tolerate non-symlink entries: os.readlink raises OSError on real files
    real_lib = os.readlink(loc) if os.path.islink(loc) else os.path.basename(loc)
    # NOTE fix lib32 and i386 bug
    if "lib32" not in loc and "lib32" not in real_lib and "i386" not in loc and "i386" not in real_lib:
        if not real_lib.startswith("/"):
            # relative symlink target: lives next to the link itself
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = real_lib
        else:
            # absolute target: the original keeps the link's own dir/name
            # NOTE(review): should this use the target's dir/name instead?
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = os.path.basename(loc)

print(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME, file=sys.stderr)
+
# NOTE call parse.py to generate the unhook API from the driver header
cmd = [PY, os.path.join(SRC_DIR, "parse.py"),
       os.path.join(NEUTRINO_DRIVER_HEADER_DIR, NEUTRINO_DRIVER_HEADER_NAME),
       os.path.join(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# make sure the output directory exists before compiling into it
os.makedirs(BUILD_DIR, exist_ok=True)

# NOTE compile cuda.c/hip.c with common.h
cmd = [CC, os.path.join(SRC_DIR, NEUTRINO_IMPL_SRC), "-fPIC", "-shared", "-ldl", "-lpthread", "-O3", *extra_flags,
       "-I", NEUTRINO_DRIVER_HEADER_DIR, "-o", os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE compile preload.c (the LD_PRELOAD injector)
cmd = [CC, os.path.join(SRC_DIR, "preload.c"), "-fPIC", "-shared", "-O3",
       "-o", os.path.join(BUILD_DIR, "preload.so")]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE create a symbolic link like libcuda.so -> libcuda.so.1
# TODO verify if this is need
cmd = ["ln", "-sf", NEUTRINO_HOOK_DRIVER_LIB_NAME,
       os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME[:NEUTRINO_HOOK_DRIVER_LIB_NAME.index("so") + 2])]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE dump system configuration for CLI usage
config = {}
config["system"] = {
    "NEUTRINO_MODE" : NEUTRINO_MODE,
    "NEUTRINO_DRIVER_HEADER_NAME" : NEUTRINO_DRIVER_HEADER_NAME,
    "NEUTRINO_DRIVER_HEADER_DIR" : NEUTRINO_DRIVER_HEADER_DIR,
    "NEUTRINO_HOOK_DRIVER_LIB_NAME" : NEUTRINO_HOOK_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_NAME" : NEUTRINO_REAL_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_DIR" : NEUTRINO_REAL_DRIVER_LIB_DIR,
}
# close the config file deterministically (the original leaked the handle)
with open(os.path.join(BUILD_DIR, "config.toml"), "w") as f:
    toml.dump(config, f)

print("Build Success, Configuration")
print("============================")
pprint(config)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py
new file mode 100644
index 0000000..6049a1d
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py
@@ -0,0 +1,165 @@
+"""CLI Entry for NEUTRINO: Fine-grained GPU Kernel Profiling via Programmable Probing"""
+
+import subprocess
+import os
+import sys
+import toml
+import argparse
+
+
+# Main Function, need a func to be referred by setup.py build system
def main():
    """CLI entry: compile/load the probe, assemble the hooked environment
    (LD_PRELOAD + LD_LIBRARY_PATH + NEUTRINO_* variables), and run the
    user command under it."""
    # NOTE READ CONFIG.TOML FOR DEFAULT SYSTEM CONFIGURATION
    # directory of this python file and other toolkits
    CURDIR = os.path.dirname(os.path.realpath(__file__))
    # directory of the neutrino internals
    NEUTRINO_BUILD_DIR: str = os.path.join(CURDIR, "build")
    NEUTRINO_PROBE_DIR: str = os.path.join(CURDIR, "probe")
    NEUTRINO_TOOLS_DIR: str = os.path.join(CURDIR, "tools")
    # load system configuration, generated in building
    config = toml.load(os.path.join(NEUTRINO_BUILD_DIR, "config.toml"))["system"]
    # default configurations, can be overwritten by CLI parameters
    NEUTRINO_HOOK_DRIVER_NAME: str = config["NEUTRINO_HOOK_DRIVER_LIB_NAME"]
    NEUTRINO_REAL_DRIVER_DIR: str = config["NEUTRINO_REAL_DRIVER_LIB_DIR"]
    NEUTRINO_REAL_DRIVER_NAME: str = config["NEUTRINO_REAL_DRIVER_LIB_NAME"]
    NEUTRINO_MODE: str = config["NEUTRINO_MODE"]
    # available built-in tools (tool name := file name without ".py")
    NEUTRINO_TOOLS = {tool[:-3]: tool for tool in os.listdir(NEUTRINO_TOOLS_DIR) if tool.endswith(".py")}

    parser = argparse.ArgumentParser(
        prog='neutrino', usage='%(prog)s [options] command',
        description=f"""NOTE: Probes must be given via -p (--probe) option. Built-in tools: {tuple(NEUTRINO_TOOLS.keys())}""",
        epilog="Examples: `neutrino -t gmem_bytes python test/zero_.py`. Open issue(s) in https://github.com/neutrino-gpu/neutrino if encountered problems",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-p', '--probe', required=True,
                        help="probe in form of path to the file")
    parser.add_argument('--tracedir', default="./trace",
                        help="parent folder of traces")
    parser.add_argument('--driver', default=os.path.join(NEUTRINO_REAL_DRIVER_DIR, NEUTRINO_REAL_DRIVER_NAME),
                        help='path to the real cuda/hip driver shared library')
    parser.add_argument("--python", default=sys.executable,
                        help='path to python executable used')
    parser.add_argument('--filter',
                        help='filter OUT buggy kernels by (part of) name, split by :')
    parser.add_argument('-k', '--kernel',
                        help='filter the kernel by (part of) name, split by :')
    parser.add_argument('--callback',
                        help='attach callback for trace analysis')
    parser.add_argument('--benchmark', action='store_true',
                        help="enable benchmark mode to evaluate overhead w.r.t. the original kernel")
    parser.add_argument('--memusage', action='store_true',
                        help="prevent the profiling and only measure the memory usage")
    # put command at the end of command
    parser.add_argument("command", nargs=argparse.REMAINDER)
    # parse the arguments
    args = parser.parse_args()

    # same as this executable
    NEUTRINO_PYTHON: str = args.python  # default to be this executable
    # directory to put the trace
    NEUTRINO_TRACEDIR: str = args.tracedir
    # filter of kernel
    NEUTRINO_FILTER: str = args.filter if args.filter is not None else ""
    NEUTRINO_KERNEL: str = args.kernel if args.kernel is not None else ""
    # Benchmark mode, will include an additional launch after the trace kernel
    # Used to measure the kernel-level slowdown of Neutrino, disabled by default
    NEUTRINO_BENCHMARK: str = str(int(args.benchmark))
    NEUTRINO_MEMUSAGE: str = str(int(args.memusage))
    # Path to the real driver
    NEUTRINO_REAL_DRIVER: str = args.driver
    # command to be executed
    command = args.command
    assert len(command) > 0, "Command must be specified"

    # Parse the PROBE
    NEUTRINO_PROBE_PATH: str = args.probe
    NEUTRINO_READING = None  # trace-reading code, generated below
    # NOTE endswith .py triggers the Tracing DSL
    if NEUTRINO_PROBE_PATH.endswith(".py"):
        from neutrino.language.compiler import compile
        with open(NEUTRINO_PROBE_PATH, "r") as f:
            source = f.read()
        NEUTRINO_PROBE = compile(NEUTRINO_MODE, source)
    elif NEUTRINO_PROBE_PATH.endswith(".toml"):
        NEUTRINO_PROBE = toml.load(NEUTRINO_PROBE_PATH)
    else:
        # No suffix := use built-in tools
        if NEUTRINO_PROBE_PATH not in NEUTRINO_TOOLS:
            print(f"[error] {NEUTRINO_PROBE_PATH} not in tools: {NEUTRINO_TOOLS}", file=sys.stderr)
            sys.exit(1)
        else:
            from neutrino.language.compiler import compile
            with open(os.path.join(NEUTRINO_TOOLS_DIR, NEUTRINO_TOOLS[NEUTRINO_PROBE_PATH]), "r") as f:
                source = f.read()
            NEUTRINO_PROBE = compile(NEUTRINO_MODE, source)

    # NOTE generate the trace reading code
    from neutrino.utils.trace_reading import gen_reading_code
    NEUTRINO_READING = gen_reading_code(NEUTRINO_PROBE)

    # NOTE check if dynamic is True, shall have a specific keyword in top-level of probe
    NEUTRINO_DYNAMIC = "dynamic" in NEUTRINO_PROBE and NEUTRINO_PROBE["dynamic"] is True

    # TODO change the callback to other places
    NEUTRINO_CALLBACK = NEUTRINO_PROBE["CALLBACK"] if "CALLBACK" in NEUTRINO_PROBE else None
    NEUTRINO_CALLBACK = args.callback if args.callback is not None else NEUTRINO_CALLBACK
    if NEUTRINO_CALLBACK:
        # search next to the probe file when the path is not directly valid
        if not os.path.exists(NEUTRINO_CALLBACK):
            searched = os.path.join(os.path.dirname(NEUTRINO_PROBE_PATH), NEUTRINO_CALLBACK)
            if os.path.exists(searched):
                NEUTRINO_CALLBACK = searched
            else:
                print(f"[warn] callback {NEUTRINO_CALLBACK} not found")
                NEUTRINO_CALLBACK = None

    # a copied environment variables
    env = os.environ.copy()
    # configure Neutrino related environment variables
    env["NEUTRINO_REAL_DRIVER"] = NEUTRINO_REAL_DRIVER
    env["NEUTRINO_DRIVER_NAME"] = NEUTRINO_HOOK_DRIVER_NAME
    env["NEUTRINO_HOOK_DRIVER"] = os.path.join(NEUTRINO_BUILD_DIR, NEUTRINO_HOOK_DRIVER_NAME)
    env["NEUTRINO_PYTHON"] = NEUTRINO_PYTHON
    env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_BUILD_DIR, "process.py")
    env["NEUTRINO_FILTER"] = NEUTRINO_FILTER
    env["NEUTRINO_KERNEL"] = NEUTRINO_KERNEL
    env["NEUTRINO_TRACEDIR"] = NEUTRINO_TRACEDIR
    env["NEUTRINO_PROBES"] = toml.dumps(NEUTRINO_PROBE)  # dump it to string
    # GNU LD_PRELOAD to overwrite dlopen, https://man7.org/linux/man-pages/man8/ld.so.8.html
    env["LD_PRELOAD"] = os.path.join(NEUTRINO_BUILD_DIR, "preload.so")
    # Add to the LD_LIBRARY_PATH, this would overwrite ldconfig
    if "LD_LIBRARY_PATH" in env:
        env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR + ":" + env["LD_LIBRARY_PATH"]
    else:
        env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR
    # An Environmental Variable to enable the trace
    # NOTE some bugs here -> still working on
    env["NEUTRINO_ENABLE"] = "1"
    # An Environmental Variable to enable the benchmark mode
    env["NEUTRINO_BENCHMARK"] = NEUTRINO_BENCHMARK
    env["NEUTRINO_MEMUSAGE"] = NEUTRINO_MEMUSAGE
    # An Environmental Variables to enable the debug mode -> more messages
    # env["NEUTRINO_VERBOSE"] = "1"
    if NEUTRINO_DYNAMIC:
        env["NEUTRINO_DYNAMIC"] = "1"
    if NEUTRINO_READING:
        env["NEUTRINO_READING"] = NEUTRINO_READING
    if NEUTRINO_CALLBACK:
        env["NEUTRINO_CALLBACK"] = NEUTRINO_CALLBACK

    # FIX for Triton
    if NEUTRINO_MODE == "CUDA":
        env["TRITON_LIBCUDA_PATH"] = NEUTRINO_BUILD_DIR
        env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "cuda.py")
    elif NEUTRINO_MODE == "HIP":
        # NOTE There's a bug in Triton's impl here, for path we refer to the
        # directory for ld.so to search, instead of specific file name ...
        env["TRITON_LIBHIP_PATH"] = os.path.join(NEUTRINO_BUILD_DIR, "libamdhip64.so")
        env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "hip.py")

    # start the program with new environment (non-empty: asserted above)
    proc = subprocess.Popen(command, env=env)
    proc.wait()


if __name__ == "__main__":
    main()
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py
new file mode 100644
index 0000000..873c5fc
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py
@@ -0,0 +1,68 @@
+"""Neutrino Internal APIs, not for user import"""
+
+from typing import Optional, Literal, Union
+from dataclasses import dataclass
+
@dataclass
class Register:
    """A probe register: name, machine dtype, and optional initial value."""
    name: str
    dtype: Literal['u32', 'u64']
    init: Optional[int] = None  # initial value, None := uninitialized
+
+
@dataclass
class Probe:
    """A probe definition; `name` is the key in TOML."""
    name: str
    level: Literal["thread", "warp"]  # level of the probe
    pos: list[str]  # := tracepoint in the paper
    # At least one of `before` / `after` shall be given; Optional added
    # because the defaults are None (the original annotations omitted it).
    before: Optional[Union[list, str]] = None  # snippet inserted before
    after: Optional[Union[list, str]] = None   # snippet inserted after
+
+
@dataclass
class Map:
    """A trace map (persistence buffer) declaration."""
    name: str
    level: Literal["thread", "warp"]     # record granularity
    type: Literal["array"]               # only array maps exist so far
    size: int                            # record size -- TODO confirm units
    cap: Union[int, Literal["dynamic"]]  # capacity, or "dynamic"
    regs: list[Register]                 # registers saved into each record
+
+
def load(raw: dict) -> tuple[list[Probe], list[Map], int]:
    """Unserialize Neutrino probes in Python dict to (probes, maps, regs).

    Accepts both the short "pos" and the long "position" key for probes
    (the validation below allows either, but the original only read "pos"),
    and tolerates maps without a "regs" table as hand-written TOML has none.
    """
    assert "probe" in raw.keys() and "map" in raw.keys(), "At least a probe and a map"
    probes: list[Probe] = []
    maps: list[Map] = []
    for name, probe in raw["probe"].items():
        # first validate the required keys
        keys = probe.keys()
        assert "position" in keys or "pos" in keys, f"[error] {name} has no position (required)"
        assert "before" in keys or "after" in keys, f"[error] {name} is empty, one of before or after shall be given"
        assert "level" in keys and probe["level"] in ("warp", "thread"), f"[error] level must be given and one of 'warp', 'thread'"
        pos = probe["pos"] if "pos" in keys else probe["position"]
        probes.append(Probe(name=name,
                            level=probe["level"],
                            pos=pos.split(":"),
                            before=probe.get("before"),
                            after=probe.get("after")))
    for name, map_ in raw["map"].items():
        maps.append(Map(name=name,
                        level=map_["level"],
                        type=map_["type"],
                        size=map_["size"],
                        cap=map_["cap"],
                        regs=[Register(rname, val[0], init=val[1])
                              for rname, val in map_.get("regs", {}).items()]))
    # "regs" (probe register count) may be absent in hand-written probes
    return probes, maps, raw.get("regs", 0)
+
+
def dump(probes, maps, regs, callback = "") -> dict:
    """Serialize Neutrino probes to Python dict"""
    serialized = {"regs": regs, "probe": {}, "map": {}}
    for p in probes:
        serialized["probe"][p.name] = {"level": p.level, "pos": p.pos,
                                       "before": p.before, "after": p.after}
    for m in maps:
        serialized["map"][m.name] = {"level": m.level, "type": m.type,
                                     "size": m.size, "cap": m.cap,
                                     "regs": {r.name: [r.dtype, r.init] for r in m.regs}}
    # only attach the callback entry when one was actually given
    if len(callback) > 0:
        serialized["CALLBACK"] = callback
    return serialized
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py
new file mode 100644
index 0000000..342b036
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py
@@ -0,0 +1,35 @@
+"""Neutrino Trace Language Primitive"""
+from functools import wraps
+from typing import TypeAlias
+
+# NOTE Neutrino Language's type system is incomplete and tiny because they're
+# placeholders for compilers instead of functional code for Python.
+u32: TypeAlias = int
+u64: TypeAlias = int
+reg: TypeAlias = int
+
+TYPES = ["u32", "u64"]
+FUNCS = ["smid", "time", "clock", "save"]
+
+def smid() -> u32: ...
+
+def time() -> u64: ...
+
+def clock() -> u64: ...
+
+def save(regs: list[reg], dtype) -> None: ...
+
+# @_disable_execution_
+# def tid() -> None: ...
+
+# @_disable_execution_
+# def pid() -> None: ...
+
+# Following are helpers for parsing register operands
+out: reg = ...
+in1: reg = ...
+in2: reg = ...
+in3: reg = ...
+in4: reg = ...
+addr: reg = ...
+bytes: reg = ...
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py
new file mode 100644
index 0000000..516e802
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py
@@ -0,0 +1,49 @@
+"""CLI Entry of Language Submodule"""
+from neutrino.common import Probe, dump
+from neutrino.language.frontend import parse
+
def compile(mode: str, source: str) -> dict:
    """Compile the Tracing DSL into Assembly Probes.

    Returns the serialized probe dict (TOML-ready); the original annotation
    claimed tuple[str, str], which was wrong.
    """
    regs, probes, maps, callback = parse(source)
    if mode == "CUDA":
        from neutrino.language.ptx import gencode
        probes = gencode(probes)
    elif mode == "HIP":
        from neutrino.language.gcn import gencode
        probes = gencode(probes)

    # NOTE Merge probes of the same level and pos
    merged_probes: dict[tuple, Probe] = {}
    for probe in probes:
        # probe.pos may be a list (see common.Probe) -- make it hashable
        pos_key = tuple(probe.pos) if isinstance(probe.pos, list) else probe.pos
        key = (probe.level, pos_key)
        if key not in merged_probes:
            merged_probes[key] = probe
        else:  # merge: concatenate names and snippets, keep None when both empty
            merged_probes[key].name += "_" + probe.name
            merged_probes[key].before = (
                (merged_probes[key].before or "") + (probe.before or "")
                if merged_probes[key].before or probe.before
                else None
            )
            merged_probes[key].after = (
                (merged_probes[key].after or "") + (probe.after or "")
                if merged_probes[key].after or probe.after
                else None
            )
    probes = list(merged_probes.values())

    dumped = dump(probes, maps, regs, callback)

    # a single dynamic-capacity map makes the whole probe set dynamic
    for map_ in maps:
        if map_.cap == "dynamic":
            dumped["dynamic"] = True

    return dumped
+
+if __name__ == "__main__": # A small test case
+ import sys
+ import toml
+ mode, source = sys.argv[1], sys.argv[2]
+ source = open(source, "r").read()
+ asm_probes = compile(mode, source)
+ print(toml.dumps(asm_probes))
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py
new file mode 100644
index 0000000..2aa3736
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py
@@ -0,0 +1,251 @@
+"""Parse and flatten Python Tracing DSL"""
+from neutrino.language import TYPES # neutrino/language/__init__.py
+import ast
+from typing import Optional
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map
+
# Whitelist of AST node types the DSL accepts; anything else raises
# NotImplementedError in NeutrinoVisitor.generic_visit
allowed_nodes = {
    ast.Import, # Imported Stuff
    ast.Module, # the greatest start
    ast.Name, # Name of Variable
    ast.Assign, # Assign Value
    ast.AugAssign, # +=
    ast.UnaryOp, # Unary Op, only negative
    ast.BinOp, # Binary Op, +-*/
    ast.Call, # Call function
    ast.Attribute, # Access Attribute of Namespace
    ast.Constant, # Constant Value
    ast.Expr, # Single Expression
}

# mapping of Python binary operators to IR opcode mnemonics
# NOTE(review): not referenced by NeutrinoVisitor in this file (it branches
# with isinstance instead) — confirm whether these tables are still needed
binary_ops = {
    ast.Add: "add",
    ast.Sub: "sub",
    ast.Mult: "mul",
    ast.Div: "div"
}

# mapping of Python unary operators to IR opcode mnemonics (see note above on binary_ops)
unary_ops = {
    ast.USub: "neg"
}
+
class NeutrinoVisitor(ast.NodeVisitor):
    """Flattens a probe body (Python AST) into a linear register-based IR.

    Each IR instruction is a list ``[opcode, dst, src...]``.  Virtual
    registers are named ``NR0, NR1, ...``; registers declared at module
    level are mapped onto the lowest numbers at construction time.
    """

    def __init__(self, nl_name: str, regs: list[str], maps: list[str]):
        """
        Args:
            nl_name: alias under which ``neutrino.language`` was imported (e.g. "nl")
            regs: names of probe registers declared at DSL module level
            maps: names of declared maps, used to validate ``save`` calls
        """
        super().__init__()
        self.nl_name = nl_name
        self.reg_counter = -1 # make it R0
        self.ir: list[tuple] = []
        # pre-assign NR names to the declared registers, in declaration order
        self.reg_map: dict[str, str] = {reg: self.fresh_name() for reg in regs}
        self.maps = maps
        # initialize and visit tree

    def fresh_name(self):
        """Allocate the next virtual register name (NR0, NR1, ...)."""
        self.reg_counter += 1
        return f"NR{self.reg_counter}"

    def visit_Assign(self, node): # Lowered to mov
        """Lower an assignment; KeyError if the target register is undeclared."""
        # we shall check if the target has a known name
        name = self.reg_map[node.targets[0].id]
        if isinstance(node.value, ast.Attribute):
            self.ir.append(["mov", name, self.visit(node.value)])
        else:
            # the RHS produced a temporary: rewrite the IR in place so the
            # temporary is renamed to the assignment target, then reclaim
            # the register number it occupied
            new_name = self.visit(node.value) # this is the temporary name
            for inst in self.ir:
                for idx in range(len(inst)):
                    if inst[idx] == new_name:
                        inst[idx] = name
            self.reg_counter -= 1

    def process_operand(self, operand) -> str:
        """Resolve an operand AST node to a register name, constant value,
        or attribute keyword; ValueError for unsupported operand kinds."""
        if isinstance(operand, ast.Name):
            return self.reg_map[operand.id]
        elif isinstance(operand, ast.Constant):
            return operand.value
        elif isinstance(operand, (ast.Attribute, ast.Call, ast.BinOp, ast.UnaryOp)):
            return self.visit(operand)
        else:
            raise ValueError

    def visit_BinOp(self, node): # Lowered to add/sub
        """Lower a binary op into a fresh destination register; returns its name."""
        lhs = self.process_operand(node.left)
        rhs = self.process_operand(node.right)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", new_name, lhs, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", new_name, lhs, rhs])
        else:
            raise NotImplementedError()
        # register the temporary under its own name so visit_Assign can rename it
        self.reg_map[new_name] = new_name
        return new_name

    def visit_AugAssign(self, node):
        """Lower ``x op= y`` into ``op x, x, y`` on the existing register."""
        rhs = self.process_operand(node.value)
        name = self.reg_map[node.target.id]
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", name, name, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", name, name, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", name, name, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", name, name, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", name, name, rhs])
        else:
            raise NotImplementedError()
        return name

    def visit_UnaryOp(self, node):
        """Lower unary negation into a fresh register; returns its name."""
        value = self.process_operand(node.operand)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.USub):
            self.ir.append(["neg", new_name, value])
        else:
            raise NotImplementedError()
        # NOTE(review): unlike visit_BinOp, the temp is not added to reg_map
        # here — confirm whether that is intentional
        return new_name

    def visit_Call(self, node):
        """Lower intrinsic calls: cuid()/time()/clock() produce a fresh
        register and return its name; Map.save(...) emits a SAVE instruction
        (and returns None)."""
        func_name = self.visit(node.func)
        if func_name == "cuid":
            new_name = self.fresh_name()
            self.ir.append(["cuid", new_name])
            self.reg_map[new_name] = new_name
            return new_name
        elif func_name == "time":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["time", new_name])
            return new_name
        elif func_name == "clock":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["clock", new_name])
            return new_name
        elif func_name == "save":
            # save is a method on a declared map: block_sched.save(...)
            map_name = node.func.value.id
            if map_name not in self.maps:
                raise ValueError(f"Map {map_name} not found, known maps: {self.maps}")
            regs = []
            for arg in node.args:
                if isinstance(arg, ast.Name):
                    regs.append(self.reg_map[arg.id])
                elif isinstance(arg, ast.Attribute):
                    regs.append(self.visit_Attribute(arg))
                else:
                    regs.append(self.reg_map[self.visit(arg)])
            self.ir.append(["SAVE", map_name] + regs)
        else:
            raise NotImplementedError()

    def visit_Name(self, node):
        """Return the raw identifier; callers map it through reg_map as needed."""
        return node.id

    def visit_Attribute(self, node):
        """Resolve ``nl.<attr>`` / ``<map>.<attr>``; operand keywords
        (bytes/addr/out/in1..in3) are upper-cased for the codegen backends."""
        if node.value.id == self.nl_name or node.value.id in self.maps:
            if node.attr in ("bytes", "addr", "out", "in1", "in2", "in3"):
                return node.attr.upper()
            return node.attr
        else:
            raise ValueError(f"can only refer to neutrino.language semantic but got {node.value.id}")

    def visit_Constant(self, node):
        # NOTE(review): returns the AST node itself rather than node.value —
        # confirm downstream consumers expect the node here
        return node

    def generic_visit(self, node):
        """Reject any construct outside the DSL whitelist (allowed_nodes)."""
        if type(node) not in allowed_nodes:
            raise NotImplementedError(f"{type(node).__name__} (lineno: {node.lineno})")
        super().generic_visit(node)
+
+
def parse(code: str) -> tuple[int, list[Probe], list[Map], str]:
    """Parse Tracing-DSL source into its compiled components.

    Args:
        code: DSL source text (a restricted subset of Python).

    Returns:
        A 4-tuple of:
        * total number of probe registers (declared + max temporaries used) —
          the previous annotation claimed ``list[Register]``, which was wrong;
        * the parsed ``Probe`` objects (IR attached on ``before``/``after``);
        * the parsed ``Map`` objects;
        * the CALLBACK script name ("" when absent).

    Raises:
        ValueError: on malformed CALLBACK / Map / probe declarations.
    """
    tree = ast.parse(code)
    nl_name: str = None  # alias under which neutrino.language was imported
    regs: list[Register] = []
    num_regs: int = 0
    probes: list[Probe] = []
    callback: str = ""  # analysis callback script, "" if not declared
    maps: list[Map] = []

    for node in tree.body:
        if type(node) is ast.Import and node.names[0].name == "neutrino.language":
            nl_name = node.names[0].asname
        elif type(node) is ast.Assign and node.targets[0].id == "CALLBACK":
            if isinstance(node.value, ast.Constant):
                callback = node.value.value
            else:
                raise ValueError("CALLBACK must be a string constant")
        elif type(node) is ast.AnnAssign and node.annotation:
            # top-level `name: nl.u64 = init` declares a probe register
            if node.annotation.value.id == nl_name and node.annotation.attr in TYPES:
                regs.append(Register(node.target.id, node.annotation.attr, node.value.value))
        elif type(node) is ast.ClassDef and node.decorator_list:
            name = node.name  # class name doubles as the map name
            decorator = node.decorator_list[0]
            if decorator.func.id == "Map":
                level, type_, size, cap, contents = None, None, 0, 1, []
                for keyword in decorator.keywords:
                    if keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "type": type_ = keyword.value.value
                    elif keyword.arg == "size": size = keyword.value.value
                    elif keyword.arg == "cap": cap = keyword.value.value
                if size % 8 != 0:
                    raise ValueError("size must be multiple of 8 to avoid misaligned address")
                if not level or not type_:
                    raise ValueError("level and type must be specified")
                if not isinstance(cap, int) and cap != "dynamic":
                    raise ValueError("cap must be an integer or 'dynamic'")
                # collect the map's fields (renamed from `node`, which used to
                # shadow the outer loop variable)
                for member in node.body:
                    if type(member) is ast.AnnAssign and member.annotation:
                        if member.annotation.value.id == nl_name and member.annotation.attr in TYPES:
                            contents.append(Register(member.target.id, member.annotation.attr, None))
                    else:
                        raise ValueError(f"Map {name} must only contain AnnAssign nodes")
                # sort wider dtypes first (e.g. u64 before u32) for alignment
                ordered = sorted(contents, key=lambda reg: reg.dtype, reverse=True)
                if ordered != contents:
                    print("[warn] map contents reordered")
                maps.append(Map(name=name, level=level, type=type_, size=size, cap=cap, regs=ordered))
        elif type(node) is ast.FunctionDef and node.decorator_list:
            name = node.name  # function name doubles as the probe name
            decorator = node.decorator_list[0]
            if decorator.func.id == "probe":
                pos, level, before = None, None, False
                for keyword in decorator.keywords:
                    if keyword.arg == "pos": pos = keyword.value.value
                    elif keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "before": before = keyword.value.value
                if not pos or not level: raise ValueError("position must be specified")
                # flatten the probe body into IR, treating it as standalone code
                visitor = NeutrinoVisitor(nl_name=nl_name, regs=[reg.name for reg in regs],
                                          maps=[m.name for m in maps])
                visitor.visit(ast.Module(body=node.body))
                probe = Probe(name=name, pos=pos, level=level)
                if before:
                    probe.before = visitor.ir
                else:
                    probe.after = visitor.ir
                probes.append(probe)
                num_regs = max(num_regs, visitor.reg_counter)

    return num_regs + len(regs), probes, maps, callback
+
+# A Simple Test Case, not really used in production
if __name__ == "__main__":
    import sys

    # use a context manager so the source file handle is closed (was leaked)
    with open(sys.argv[1], "r") as fp:
        code = fp.read()

    regs, probes, maps, callback = parse(code)

    print(regs)
    print(probes)
    print(maps)
    print(callback)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py
new file mode 100644
index 0000000..21ef908
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py
@@ -0,0 +1,12 @@
+"""Generate the AMD GCN Assembly, a x86-like asm
+
+NOTE Currently only targets CDNA branch of GCN, covering MI100/200/300/325
+This is because AMD's Assembly diverge into CDNA/RDNA in 2020, before that
+there's only one architecture named GCN (so as the name of GCNAsm).
+
+CDNA and RDNA shares the same syntax inherited from GCNAsm, but has slight
+difference in instruction set, for example, CDNA use `S_MEMTIME S[0:1]` to
+read the clock in 64bit but RDNA use `S_GETREG S0, SHADER_CYCLES` in 32bit
+
+We plan to support CDNA arch first and then port to RDNA arch later.
+"""
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py
new file mode 100644
index 0000000..051f194
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py
@@ -0,0 +1,75 @@
+"""Generate the CUDA PTX Assembly, a C-style asm"""
+
+from neutrino.common import Register, Probe, Map
+
def filter_keyword(reg: "str | int") -> "str | int":
    """Format an IR operand for PTX output.

    Integer immediates and reserved operand keywords (ADDR, BYTES, OUT,
    IN1..IN4) pass through untouched; any other name is a virtual register
    and receives the PTX ``%`` prefix.  (The previous ``-> str`` annotation
    was wrong: integers are returned as-is.)
    """
    if isinstance(reg, int):  # integer immediate: emit verbatim
        return reg
    if reg in {"ADDR", "BYTES", "OUT", "IN1", "IN2", "IN3", "IN4"}:
        return reg  # reserved keyword, resolved later by the probing engine
    return "%" + reg
+
def cvt_inst(inst: list[str]) -> str:
    """Translate one IR instruction ``[opcode, dst, src...]`` into PTX text.

    All ALU operations are emitted in their ``.u64`` form.  ``SAVE`` is kept
    as the neutrino pseudo-instruction (resolved later outside this backend).
    Raises NotImplementedError for unknown opcodes.
    """
    match inst[0]:
        # ALU Instructions
        case "add":
            return f"add.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "sub":
            return f"sub.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "mul":
            return f"mul.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "div":
            return f"div.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "mod":
            # modulo lowers to PTX's rem instruction
            return f"rem.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "lsh":
            return f"shl.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "rsh":
            return f"shr.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        # Memory Instructions
        case "SAVE":
            contents = (filter_keyword(reg) for reg in inst[2:])
            contents = ", ".join(contents)
            # NOTE(review): no trailing ';' here, unlike the other opcodes —
            # confirm the downstream SAVE expansion accepts/appends it
            return f"SAVE [ {inst[1]} ] {{ { contents } }}" # just return everything
        # Other Instructions
        case "mov":
            return f"mov.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])};"
        case "clock":
            # read the 64-bit per-SM cycle counter
            return f"mov.u64 {filter_keyword(inst[1])}, %clock64;"
        case "time":
            # read the 64-bit global nanosecond timer
            return f"mov.u64 {filter_keyword(inst[1])}, %globaltimer;"
        case "cuid":
            # %smid is 32-bit: read it into a scratch b32 then widen to u64
            return f"""{{
    .reg .b32 %tmp;
    mov.u32 %tmp, %smid;
    cvt.u64.u32 {filter_keyword(inst[1])}, %tmp;
    }}"""
        case _:
            raise NotImplementedError(f"{inst} not yet supported")
+
def gencode(probes: list[Probe]) -> list[Probe]:
    """Translate each probe's IR (lists of instruction tuples) into PTX text.

    Mutates the probes in place (``before``/``after`` become newline-joined
    strings) and returns the same list.

    Bug fix: a probe may carry BOTH a ``before`` and an ``after`` snippet
    (e.g. after merging in the compiler); the previous ``elif`` left
    ``after`` untranslated whenever ``before`` was present.
    """
    # only the instruction payloads change, i.e. the before/after parts
    for probe in probes:
        if probe.before is not None:
            probe.before = "\n".join(cvt_inst(inst) for inst in probe.before)
        if probe.after is not None:  # independent `if`, not `elif`
            probe.after = "\n".join(cvt_inst(inst) for inst in probe.after)
    return probes
+
if __name__ == "__main__":
    # manual smoke test: lower a pair of kernel-level warp probes to PTX
    sample_probes = [
        Probe(name='thread_start', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R0']]),
        Probe(name='thread_end', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R2'], ['sub', 'R1', 'R2', 'R0'], ['cuid', 'R3'], ['SAVE', 'block_sched', 'R0', 'R1', 'R3']]),
    ]
    print(gencode(sample_probes))
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py
new file mode 100644
index 0000000..4c385f2
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py
@@ -0,0 +1,50 @@
+"""Neutrino Probing Engine Protocol
+
+NOTE just a protocol for developers, don't import / export"""
+
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map
+
+__all__ = ["Ref", "load_probes", "TRACE_READING_CODE_PY"]
+
@dataclass
class Ref:
    """Reference for replacement: marks where a probe snippet must be spliced."""
    line: str # Original line
    probe: Probe # Probe for matching (annotation was `str`, but cuda.py passes Probe objects)
    before_after: bool # True if before and False if after -> to distinguish which snippet is used
+
+
@dataclass
class KernelParam:
    """A kernel parameter: its data type string and its name."""
    dtype: str # parameter data type — NOTE(review): usage not visible in this chunk; presumably a PTX .param type, confirm
    name: str # parameter name
+
+
+# NOTE following is just protocol, please implement yours, developers can
+# extend other functions for their need, just keep following implemented
+
def get_arch() -> str:
    """get architecture for assembler

    Protocol stub: each platform backend (e.g. cuda.py) provides its own.
    """
    ...
+
def dump(workdir: str, name: str, suffix: str) -> str:
    """call objdump to extract assembly from binary

    Protocol stub: returns the extracted assembly text.
    """
    ...
+
def prune(ptx: str, entry_name: str):
    """Prune Assembly to specific entry_name

    Protocol stub: narrows a multi-entry assembly dump to one kernel entry.
    """
    ...
+
def probing(asm: str, probes: list[Probe]):
    """Probe the probes into asm

    Protocol stub: splices the probe snippets into the assembly text.
    """
    ...
+
def assemble(workdir: str, name: str):
    """call assembler to turn assembly to machine code

    Protocol stub.
    """
    ...
+
def write_kernel_info(name: str, params, probe_mem_sizes: list[int],
                      workdir: str, analyze_hook: str = "", file_name: str = "kernel.info"):
    """write kernel info for hook driver to read back

    Protocol stub: persists kernel metadata (params, probe memory sizes,
    optional analysis hook) into *file_name* under *workdir*.
    """
    ...
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py
new file mode 100644
index 0000000..9f4c1c7
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py
@@ -0,0 +1,786 @@
+"""Neutrino Probing Engine, CUDA Implementation"""
+
+import os
+import sys
+import shutil
+import subprocess
+import traceback # usef for print backtrace to log file instead of stdout
+import toml # to load probes from envariables
+from neutrino.common import Register, Probe, Map, load
+from neutrino.probe import Ref, KernelParam
+
# NOTE(review): import-time side effects — this module is invoked as a
# standalone tool with argv[1] pointing at the working directory
workdir = sys.argv[1] # directory contains original.bin
log = open(os.path.join(workdir, "process.log"), 'w') # shared process log, kept open for module lifetime

# a macro like terms
SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 }
+
+# TODO move it to global variable or configurable
def get_arch() -> str:
    """Return the compute arch of the first GPU for the assembler, e.g. "sm_89".

    Queries ``nvidia-smi --query-gpu=compute_cap`` and rewrites the reported
    ``major.minor`` (e.g. ``8.9``) as ``sm_<major><minor>``.  Only the first
    line (first GPU) of the output is used.
    """
    # NOTE auto-detection may fail on some systems (no nvidia-smi in PATH);
    # hard-code the returned arch here if needed
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
        stdout=subprocess.PIPE,
        text=True)
    # sm_version like `8.9`
    sm_version = result.stdout.split("\n")[0].strip()
    major, minor = sm_version.split(".")
    return f"sm_{major}{minor}"
+
def dump(workdir: str, name: str = "original", suffix: str = ".bin") -> str:
    """Extract PTX from cuda binaries (cubin or fatbin) via cuobjdump

    NOTE accept three kind of binary:
    1. fatbin @see https://docs.nvidia.com/cuda/nvfatbin/index.html
    2. cubin @see https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html
    3. PTX text file - no need to further process, just rename it to .ptx

    Side effects: writes the extracted text to <workdir>/<name>.ptx and
    records which extraction path succeeded in the module log.
    Raises ValueError if no PTX can be found by any method.
    """
    bin_path = os.path.join(workdir, name) + suffix
    # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
    result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
    out = result.stdout  # NOTE(review): unused until reassigned below
    if "ASCII text" in result.stdout: # raw PTX file, just read it all
        shutil.copyfile(bin_path, os.path.join(workdir, name) + ".ptx")
        print("[objdump] bin is ptx", file=log)
        with open(os.path.join(workdir, name) + ".ptx", "r") as outf:
            return outf.read()
    # then try cuobjdump -ptx flag
    result = subprocess.run(
        ['cuobjdump', '-ptx', bin_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True)
    out = result.stdout
    if len(result.stderr) > 0:
        print(result.stderr, file=log)
    if out.find(".version") != -1:
        start = out.index(".version") # ptx valid part starts with .version
        with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
            outf.write(out[start:])
        print("[objdump] via cuobjdump -ptx", file=log)
        return out[start:]
    else:
        # finally try cuobjdump -elf to dump elf content and check .nv_debug_ptx_txt
        result = subprocess.run(['cuobjdump', '-elf', bin_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if len(result.stderr) > 0:
            print(result.stderr, file=log)
        sections = result.stdout.split(".section ") # don't remove the space
        for section in sections:
            if section.startswith(".nv_debug_ptx_txt"): # PTX Info
                # write to the original.ptx
                start = section.index(".version")
                with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
                    outf.write(section[start:])
                print("[objdump] via cuobjdump -elf", file=log)
                return section[start:]
        # if still not found
        raise ValueError("PTX Not Found in CUBIN")
+
def _find_matching_close(ptx: str, pos: int) -> int:
    """Return the index just past the '}' closing the '{' opened before *pos*.

    Scans forward balancing braces.  Both find() results are clamped to
    len(ptx) when not found — the previous inline versions left the initial
    next-open-brace at -1, so `-1 < next_ket` wrongly took the open branch
    and could mis-parse or loop.  Unterminated bodies stop at end of text.
    """
    length = len(ptx)
    depth = 1  # one '{' already open
    while depth > 0 and pos < length:
        next_bra = ptx.find("{", pos)
        next_ket = ptx.find("}", pos)
        if next_bra == -1:
            next_bra = length
        if next_ket == -1:
            next_ket = length
        if next_bra < next_ket:      # another nested body opens first
            pos = next_bra + 1
            depth += 1
        elif next_ket < next_bra:    # current body (or a nested one) closes
            pos = next_ket + 1
            depth -= 1
        else:                        # both exhausted: malformed/unterminated
            break
    return pos

def prune(ptx: str, entry_name: str) -> tuple[str, str, str, str]:
    """a minimum parser to truncate the ptx for specific entry

    Use this function to locate a specific entry with entry_name, as a single
    objdumped PTX usually has > 1 entry (try cuobjdump -ptx libcublas.so).

    Returns (global_section, func_section, entry_section, entry_name) where
    entry_name may have been shortened by the fuzzy match below.

    NOTE verified on PTX from NVCC GCC Backend and LLVM PTX Backend

    Raises:
        ValueError: if entry_name (even fuzzily truncated) is not found.
    """
    length = len(ptx)
    # first try to find entry_name; .visible .entry and .entry both
    # correspond to CUDA __global__
    entry_loc = ptx.find(f".visible .entry {entry_name}")
    if entry_loc == -1:
        entry_loc = ptx.find(f".entry {entry_name}")  # try raw .entry instead
    if entry_loc == -1:
        # NOTE fix PyTorch problem: mangled names may differ in the last few
        # characters, so retry with progressively truncated names
        for i in range(1, 30):
            entry_loc = ptx.find(f".entry {entry_name[:-i]}")
            if entry_loc != -1:
                entry_name = entry_name[:-i]
                break
        else:
            raise ValueError(f"{entry_name} not found")

    # parse the first global section - shall be included for global info;
    # global info is complicated and not used later, just keep it verbatim
    start_visible_entry = ptx.find(".visible .entry")
    if start_visible_entry == -1:
        start_visible_entry = length
    start_entry = ptx.find(".entry")
    if start_entry == -1:
        start_entry = length
    start = min(start_visible_entry, start_entry)
    # includes helpers like __assertfail and definitions such as gmems
    global_section = ptx[:start]

    # parse the .func bodies (CUDA __device__), which entries may call;
    # very little code arrives here, so overhead is negligible
    func_sections = []
    start_func = start
    while True:
        start_func = ptx.find(".func", start_func)
        if start_func == -1:
            break
        body_open = ptx.find("{", start_func) + 1
        end = _find_matching_close(ptx, body_open)
        func_sections.append(ptx[start_func:end])
        start_func = end
    func_section = "\n".join(func_sections)

    # given entry_loc, capture the entry's whole brace-balanced body
    body_open = ptx.find("{", entry_loc) + 1
    end = _find_matching_close(ptx, body_open)
    entry_section = ptx[entry_loc:end]

    return global_section, func_section, entry_section, entry_name
+
+"""
+NOTE: templates for thread-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any thread-constant probes
+
+Most registers below is duplicate and will be optimized by PTXAS
+TODO Optimize calculation for 1D/2D Indexing (many kernel don't use 3D Indexing
+"""
+COMMON_BUFFER_CALC = """// begin buffer calculation
+.reg .b32 %buf<20>; // b32 reg to record access, will be optimized by ptxas
+mov.u32 %buf2, %tid.x; // threadIdx.x
+mov.u32 %buf3, %tid.y; // threadIdx.y
+mov.u32 %buf4, %tid.z; // threadIdx.z
+mov.u32 %buf5, %ntid.x; // blockDim.x
+mov.u32 %buf6, %ntid.y; // blockDim.y
+mov.u32 %buf7, %ntid.z; // blockDim.z
+mov.u32 %buf8, %ctaid.x; // blockIdx.x
+mov.u32 %buf9, %ctaid.y; // blockIdx.y
+mov.u32 %buf10, %ctaid.z; // blockIdx.z
+mov.u32 %buf11, %nctaid.x; // gridDim.x
+mov.u32 %buf12, %nctaid.y; // gridDim.y
+mad.lo.s32 %buf13, %buf6, %buf4, %buf3; // blockDim.y * threadIdx.z + threadIdx.y
+mad.lo.s32 %buf15, %buf13, %buf5, %buf2; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x
+mad.lo.s32 %buf16, %buf12, %buf10, %buf9; // gridDim.y * blockIdx.z + blockIdx.y
+mad.lo.s32 %buf17, %buf16, %buf11, %buf8; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x
+mul.lo.s32 %buf18, %buf5, %buf6; // blockDim.x * blockDim.y
+mul.lo.s32 %buf19, %buf18, %buf7; // blockSize = blockDim.x * blockDim.y * blockDim.z
+mad.lo.s32 %buf1, %buf17, %buf19, %buf15; // buf_idx = block_idx * blockSize + thread_idx
+// end buffer calculation"""
+
+"""
+NOTE templates for warp-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any warp-constant probes
+
+Most registers below is duplicate and will be optimized by PTXAS
+"""
+WARP_BUFFER_CALC = """// begin buffer calculation
+.reg .b32 %warpbuf<21>; // b32 reg to record access, will be optimized by ptxas
+.reg .pred %leader; // predicate register
+.reg .pred %joint_pred; // used to store AND result of %leader and instruction operand
+mov.u32 %warpbuf2, %laneid; // read lane id
+setp.eq.u32 %leader, %warpbuf2, 0; // check if thread is warp leader
+@%leader mov.u32 %warpbuf3, %nwarpid; // warpDim := number of warp in current group
+@%leader mov.u32 %warpbuf4, %tid.x; // threadIdx.x
+@%leader mov.u32 %warpbuf5, %tid.y; // threadIdx.y
+@%leader mov.u32 %warpbuf6, %tid.z; // threadIdx.z
+@%leader mov.u32 %warpbuf7, %ntid.x; // blockDim.x
+@%leader mov.u32 %warpbuf8, %ntid.y; // blockDim.y
+@%leader mov.u32 %warpbuf18, %ntid.z; // blockDim.z
+@%leader mov.u32 %warpbuf9, %ctaid.x; // blockIdx.x
+@%leader mov.u32 %warpbuf10, %ctaid.y; // blockIdx.y
+@%leader mov.u32 %warpbuf11, %ctaid.z; // blockIdx.z
+@%leader mov.u32 %warpbuf12, %nctaid.x; // gridDim.x
+@%leader mov.u32 %warpbuf13, %nctaid.y; // gridDim.y
+@%leader mad.lo.s32 %warpbuf14, %warpbuf8, %warpbuf6, %warpbuf5; // blockDim.y * threadIdx.z + threadIdx.y
+@%leader mad.lo.s32 %warpbuf15, %warpbuf14, %warpbuf7, %warpbuf4; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x
+@%leader div.s32 %warpbuf15, %warpbuf15, 32; // get persistent warpid instead of dynamic %warpid
+@%leader mad.lo.s32 %warpbuf16, %warpbuf13, %warpbuf11, %warpbuf10; // gridDim.y * blockIdx.z + blockIdx.y
+@%leader mad.lo.s32 %warpbuf17, %warpbuf16, %warpbuf12, %warpbuf9; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x
+@%leader mul.lo.s32 %warpbuf19, %warpbuf7, %warpbuf8;
+@%leader mul.lo.s32 %warpbuf20, %warpbuf19, %warpbuf18;
+@%leader div.s32 %warpbuf20, %warpbuf20, 32;
+@%leader mad.lo.s32 %warpbuf1, %warpbuf17, %warpbuf20, %warpbuf15; // buf_idx = block_idx * warpSize + warpIdx
+// end buffer calculation"""
+
# NOTE buffer location for thread-local buffers, every probe has independent this part
THREAD_PROBE_BUFFER = """// begin {name} map
.reg .b64 %map_{name}<5>; // register group defn
mul.wide.s32 %map_{name}4, %buf1, {no_bytes}; // get buffer location, no_bytes is per thread
ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} map"""

# NOTE buffer of the dynamic stuffs
# same as THREAD_PROBE_BUFFER, but the per-thread byte count is read from a
# kernel parameter (bytes_{name}) instead of being baked in at probe time
THREAD_PROBE_DYNAMIC_BUFFER = """// begin {name} dynamic map
.reg .b64 %map_{name}<5>; // register group defn
.reg .b32 %cnt_{name}; // The dynamic count of buffer size
ld.param.u32 %cnt_{name}, [bytes_{name}]; // load sizes from .param state spaces
mul.wide.s32 %map_{name}4, %buf1, %cnt_{name}; // get buffer location, no_bytes is per thread
ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} dynamic map"""

# NOTE buffer location for warp-local buffers, every probe has independent this part
# (all instructions predicated on %leader declared in WARP_BUFFER_CALC)
WARP_PROBE_BUFFER = """// begin {name} map
.reg .b64 %map_{name}<5>; // register group defn
@%leader mul.wide.s32 %map_{name}4, %warpbuf1, {no_bytes}; // get buffer location, no_bytes is per thread
@%leader ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
@%leader cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
@%leader add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} map"""

# NOTE for every probe with datamodel not none
# only support .u64 and recommend use 16 bytes alignment, minimum is 8 bytes
PROBE_PARAM = ".param .u64 param_{name}"
COUNT_PARAM = ".param .u32 bytes_{name}"
+
# NOTE This is a special probe applied if dynamic = True, to be filled with count_inst and count_size
# (TOML text with {count_inst}/{count_size} placeholders; the doubled {{ }}
# braces are escapes so the literal braces survive str.format)
COUNT_PROBE = """regs = 1
[map.Count]
level = "thread"
type = "array"
size = 8
cap = 1

[map.Count.regs]
count = [ "u64", "None", ]

[probe.Save]
pos = "kernel"
level = "thread"
before = "mov.u64 %NR0, 0;"
after = "SAVE [ Count ] {{ %NR0 }};"

[probe.Count]
pos = "{count_inst}"
level = "thread"
before = "add.u64 %NR0, %NR0, {count_size};"
"""
+
+def probing(asm: str, probes: list[Probe], maps: list[Map], regs: int) -> tuple[str, list[int]]:
+ """Process the probes, the core function of probing engine"""
+
+ # NOTE parse interesting locations
+ # A mapping from location to probes, a probe can hook at multiple location
+ positions: dict[str, list[Probe]] = dict()
+ kernel_start_probes: list[Probe] = []
+ # NOTE turn kernel:end into ret:start for better matching
+ for probe in probes:
+ # different position split by ;, and inside split by : for start/end
+ for position in probe.pos:
+ if position == "kernel": # turn into listening instructions
+ if probe.after is not None:
+ if "ret;" in positions:
+ positions["ret;"].append(probe)
+ else:
+ positions["ret;"] = [probe, ]
+ if probe.before is not None:
+ kernel_start_probes.append(probe)
+ else:
+ if position in positions:
+ positions[position].append(probe)
+ else:
+ positions[position] = [probe, ]
+
+ # NOTE parse PTX Assembly
+ ptx_lines = asm.split("\n") # let's do it line by line
+ # first extract basic kernel signature
+ entry_found: bool = False # line of .entry or .visible .entry
+ entry_last_line : int = 0 # last line of entry, marked by ()
+ param_end_line : int = 0 # last line of param declaration, for probe params
+ body_start_line : int = 0 # first line of body
+ idx = 0
+ while idx < len(ptx_lines):
+ line = ptx_lines[idx]
+ if not entry_found and ".entry" in line: # entry not yet found
+ entry_found = True
+ if entry_found: # now entry is found
+ # first check if the entry has been closed
+ if ")" in line and entry_last_line == 0:
+ entry_last_line = idx
+ # if entry is closed, time for body!, another if as ) { can in one line
+ if body_start_line == 0 and "{" in line and entry_last_line >= 0:
+ body_start_line = idx
+ # if not yet reach the entry, then line with .param is param declaration
+ if ".param" in line and entry_last_line == 0:
+ param_end_line = idx
+ # here pattern matching positions TODO optimize performance here
+ else:
+ for position, probes in positions.items():
+ if position in line: # BUG might mismatch parameter with confused naming
+ # NOTE we got a match, then every probe will insert snippet before or after the line
+ # this might cause idx fluctuatting if we use idx to process it
+ line_idx = idx # a copy to fix the insertion position
+ for probe in probes:
+ # specially handle ret;, we need to place it before ret or it won't be executed
+ if position == "ret;" and probe.after is not None:
+ ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ line_idx += 1
+ else:
+ if probe.before is not None:
+ ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True))
+ idx += 1
+ line_idx += 1
+ if probe.after is not None:
+ ptx_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ idx += 1
+
+ # Now add the probes to PTX Assembly
+ offset: int = 0 # adding every line need to offset 1 to make it correct
+ # First let's add parameters
+ ptx_lines[param_end_line] = ptx_lines[param_end_line] + "," # add , to indicate more param
+ # NOTE parameter layouts: Parameters are pointers to buffer, or buffer size
+ # We arange buffer pointers linearly in advance (u64), and later size (u32)
+ params_added: list[str] = []
+ count_params: list[str] = [] # NOTE used for dynamic counts only
+
+ # NOTE save the map_sizes so Hook Driver has a way to allocate the map memory
+ # we must make sure this is aligned with the order of parameter or will be illegal access
+ map_sizes: list[tuple[str, int]] = [] #
+
+ levels: set[str] = set()
+ for map_ in maps:
+ if map_.cap != "dynamic":
+ map_sizes.append((map_.level, map_.cap * map_.size))
+ params_added.append(PROBE_PARAM.format(name=map_.name))
+ levels.add(map_.level)
+ else:
+ map_sizes.append((map_.level, -1))
+ params_added.append(PROBE_PARAM.format(name=map_.name))
+ count_params.append(COUNT_PARAM.format(name=map_.name))
+ levels.add(map_.level)
+
+ # else just ignore
+ params_added = params_added + count_params # formulate the layout
+ ptx_lines.insert(param_end_line + 1, ",\n".join(params_added))
+ offset += 1 # in total one line is added
+ # Now add the probe with kernel:start -> this shall not dump anything I think
+ ptx_lines.insert(body_start_line + offset + 1, f".reg .u64 %NR<{regs + 1}>;")
+ offset += 1
+ for probe in kernel_start_probes:
+ # NOTE kernel:start probe has no helpers and have no predicate
+ ptx_lines.insert(body_start_line + offset + 1, probe.before) # None is checked before
+ offset += 1
+ # Now add the common buffer calculation
+ if "thread" in levels:
+ ptx_lines.insert(body_start_line + offset + 1, COMMON_BUFFER_CALC)
+ offset += 1
+ if "warp" in levels:
+ ptx_lines.insert(body_start_line + offset + 1, WARP_BUFFER_CALC)
+ offset += 1
+ # Now add the individual buffer calculation
+ # NOTE add the handle of u32 registers
+ for map_ in maps:
+ num_u32 = 0
+ for reg in map_.regs:
+ if reg.dtype == "u32":
+ num_u32 += 1
+ if map_.level == "thread":
+ if map_.cap != "dynamic":
+ buffer_asm = THREAD_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size))
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ else:
+ buffer_asm = THREAD_PROBE_DYNAMIC_BUFFER.format(name=map_.name)
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ elif map_.level == "warp":
+ if map_.cap != "dynamic":
+ buffer_asm = WARP_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size))
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ else:
+ raise NotImplementedError()
+ else:
+ raise NotImplementedError()
+
+ name_to_map: dict[str, Map] = {m.name: m for m in maps}
+ # Now add the instruction listenings
+ for idx in range(len(ptx_lines)):
+ # ignore most of line that is a string!
+ if type(ptx_lines[idx]) == Ref: # NOTE isinstance is slow?
+ line: str = ptx_lines[idx].line
+ probe: Probe = ptx_lines[idx].probe
+ before_after: str = ptx_lines[idx].before_after
+ # parse instruction operands, operands are separated by space fundamentally
+ tmp = line[:line.index(";")].split(",")
+ operands: list[str] = []
+ # NOTE handling vectorized operands with { and }
+ merges = []
+ merging: bool = False
+ for operand in tmp:
+ if "{" in operand and not "}" in operand:
+ merging = True
+ merges.append(operand)
+ elif "}" in operand and not "{" in operand:
+ merges.append(operand) # FIX, now operand is the last one and shall be included
+ operands.append(",".join(merges).strip("{} ")) # we don't want {} remains
+ merges = [] # flush merges
+ merging = False # reset status
+ else:
+ operands.append(operand) if not merging else merges.append(operand)
+ # first operand also have pred, inst and the real first operand
+ remaining = operands[0].strip() if len(operands) > 0 else print(line, tmp, operands, merges)
+ # handle predicate -> used in final insertion
+ if "@" in remaining:
+ pred = remaining[:remaining.index(" ") + 1] # include the space!
+ remaining = remaining[remaining.index(" ") + 1:].strip()
+ else:
+ pred = ""
+ # TODO assert matching instruction
+ mem_bytes: str = None
+ out: str = None
+ if remaining.find(" ") != -1:
+ inst = remaining[:remaining.index(" ")]
+ # NOTE a helper to calculate bytes, ld and st's bytes are inferred not from operand
+ # but the instruction body (likewise ld.global.v2.u64)
+ if "ld" in inst or "st" in inst:
+ vec = 1
+ if "v2" in inst or "x2" in inst:
+ vec = 2
+ elif "v4" in inst or "x4" in inst:
+ vec = 4
+ # most dtypes are u32, no worries
+ dtypes = ["u32", "u64", "b16", "u16", "u8", "f32", "f64", "b128", "s32", "s64", "s16", "s8", "b32", "b64", "b8"]
+ for dtype in dtypes:
+ if dtype in inst:
+ mem_bytes = str(vec * int(dtype[1:]) // 8) # 8 "= size"
+ break
+ out = remaining[remaining.index(" "):].strip()
+ if "[" in out:
+ # NOTE handle [ addr ] used to indicate the memory address
+ out = out[out.index("[") + 1 : out.index("]")] if out is not None and "[" in out else out # fix
+ in1: str = operands[1] if len(operands) >= 2 else None
+ # NOTE handle [ addr ] used to indicate the memory address
+ in1 = in1[in1.index("[") + 1 : in1.index("]")] if in1 is not None and "[" in in1 else in1
+ in2 = operands[2] if len(operands) >= 3 else None
+ in3 = operands[3] if len(operands) >= 4 else None
+ # TODO handle some weird syntax like + 0 used meaninglessly to locate correct places
+ # Currently only a minimal solution
+ if out is not None and "+" in out: out = out[:out.find("+")]
+ if in1 is not None and "+" in in1: in1 = in1[:in1.find("+")]
+ # print(line, out, in1, in2, sep=" / ")
+ # now handles operand helpers by directly replacing the value
+ snippet = probe.before if before_after else probe.after
+ snippet = snippet.replace("OUT", out) if "OUT" in snippet else snippet
+ snippet = snippet.replace("IN1", in1) if "IN1" in snippet else snippet
+ snippet = snippet.replace("IN2", in2) if "IN2" in snippet else snippet
+ snippet = snippet.replace("IN3", in3) if "IN3" in snippet else snippet
+ # NOTE add a new helper named ADDR referencing gmem address
+ if "ADDR" in snippet:
+ if "ld" in operands[0] or "cp.async" in operands[0]:
+ snippet = snippet.replace("ADDR", in1)
+ elif "st" in operands[0]: # st has
+ snippet = snippet.replace("ADDR", out)
+ if mem_bytes is not None:
+ snippet = snippet.replace("BYTES", mem_bytes) if "BYTES" in snippet else snippet
+ # now handles STORE helpers
+ snippet_lines = snippet.split("\n")
+ # NOTE special arrangements for warp datamodel
+ org_pred = pred
+ if probe.level == "warp":
+ if pred == "":
+ pred = "@%leader " # apply filter that only leader works
+ else:
+ pred = "@%joint_pred " # will be updated %leader AND pred
+
+ for snippet_line_idx in range(len(snippet_lines)):
+ snippet_line: str = snippet_lines[snippet_line_idx]
+ if "SAVE" in snippet_line: # only one save, at the begin of line
+ # SAVE [ block_sched ] { R0, R1, R3 }
+ map_ = name_to_map[snippet_line[snippet_line.index("[") + 1: snippet_line.index("]")].strip()]
+ # dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8]
+ items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",")
+ assert len(items) == len(map_.regs), f"{map_.name}.save not follow definition"
+ cvt_lines, u64s, u32s = [], [], []
+ u32_idx = 1 # NOTE must be 1 due to some syntax issue I think
+ for item, reg in zip(items, map_.regs):
+ if reg.dtype == "u64":
+ u64s.append(item)
+ elif reg.dtype == "u32": # apply conversion
+ cvt_lines.append(f"{pred}cvt.u32.u64 %{map_.name}_u32_{u32_idx}, {item};\n")
+ u32s.append(f"%{map_.name}_u32_{u32_idx}")
+ u32_idx += 1
+ else:
+ raise NotImplementedError
+ assert len(u32s) % 2 == 0, "Must save 2n u32 registers for memory alignment, please promote to u64"
+ save_lines = []
+ for item_idx in range(len(u64s) // 2):
+ save_lines.append(f"{pred}st.global.v2.u64 [%map_{map_.name}1], {{ {u64s[item_idx * 2]}, {u64s[item_idx * 2 + 1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;")
+ if len(u64s) % 2 != 0: # odd number -> one item left!
+ save_lines.append(f"{pred}st.global.u64 [%map_{map_.name}1], {u64s[-1]};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;")
+ for item_idx in range(len(u32s) // 4):
+ save_lines.append(f"{pred}st.global.v4.u32 [%map_{map_.name}1], {{ {u32s[item_idx * 4]}, {u32s[item_idx * 4 + 1]}, {u32s[item_idx * 4 + 2]}, {u32s[item_idx * 4 + 3]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;")
+ if len(u32s) % 4 != 0: # two items left...
+ save_lines.append(f"{pred}st.global.v2.u32 [%map_{map_.name}1], {{ {u32s[-2]}, {u32s[-1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;")
+ snippet_lines[snippet_line_idx] = "\n".join(cvt_lines) + "\n".join(save_lines)
+ else:
+ # or just add the pred!
+ # NOTE handling warp that having double buffer
+ if ".reg" not in snippet_line and snippet_line.strip() not in ("{", "}"):
+ snippet_lines[snippet_line_idx] = pred + snippet_line
+ else:
+ snippet_lines[snippet_line_idx] = snippet_line
+ if probe.level == "warp" and org_pred != "":
+ snippet_lines.insert(0, f"and.pred %tmp, %leader, {org_pred[1:]}; // joint prediction") # ignore the '@' signal at first
+ snippet = "\n".join(snippet_lines)
+ # finally replace the Ref with snippet to finish the probing!
+ ptx_lines[idx] = snippet
+
+ # Finally finished.1
+ return "\n".join(ptx_lines)
+
def assemble(workdir: str, name: str) -> None:
    """Compile <workdir>/<name>.ptx into <workdir>/<name>.bin via ptxas.

    NOTE: not strictly required for execution, because the CUDA Driver
    (cuModuleLoad) can JIT-load PTX directly — but `ptxas --verbose`
    output (register/smem usage, errors) is valuable for debugging.
    """
    base = os.path.join(workdir, name)
    command = ["ptxas", f"-arch={get_arch()}", "-m64", "--verbose",
               base + ".ptx", "-o", base + ".bin"]
    print(" ".join(command), file=log)
    # check=True raises CalledProcessError on a non-zero exit status
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True
    )
    # forward assembler diagnostics (stderr first, then stdout) to process.log
    for stream in (result.stderr, result.stdout):
        if stream:
            print(stream.decode("utf-8"), file=log)
+
def parse_params(ptx: str) -> tuple[list[KernelParam], str]:
    """Parse kernel function parameters and the kernel name from PTX.

    @see https://docs.nvidia.com/cuda/parallel-thread-execution/#kernel-function-parameters

    NOTE needed because cuLaunchKernel receives void** as kernelParams and the
    number of valid parameters cannot be inferred from void** alone (the
    NVIDIA driver performs similar parsing for the same reason).
    """
    lparen = ptx.find("(")
    name_begin = ptx.rfind(" ", 0, lparen)  # kernel name sits right before '('
    rparen = ptx.find(")", lparen)
    params: list[KernelParam] = []
    for raw_line in ptx[lparen + 1:rparen].split("\n"):
        if ".param" not in raw_line:
            continue
        # tokens look like ['.param', '.u64', ..., '<name>']
        tokens = raw_line.strip(" ,").split(" ")
        dtype = tokens[1][1:]  # drop the leading '.' of .s32 / .u64 / ...
        params.append(KernelParam(dtype, tokens[-1]))
    # +1 skips the space separating the return spec from the kernel name
    return params, ptx[name_begin + 1:lparen]
+
+
def write_kernel_info(name: str, params: list[KernelParam], map_sizes: list[tuple[str, int]],
                      workdir: str, file_name: str = "kernel.info"):
    """Write kernel metadata to <workdir>/<file_name> for the hook driver.

    File layout, one item per line:
      1. kernel (mangled) name
      2. number of kernel parameters — needed to parse the void** kernelParams
      3. number of maps backed by device memory
      4+. one "<datamodel_id>,<size_in_bytes>" line per map, in parameter
          order (must stay aligned with the probed PTX parameter layout;
          size -1 marks a dynamically sized map)

    BUGFIX: the `map_sizes` annotation was `list[int]`, but every caller
    passes `(level, size)` tuples and the loop below unpacks them.
    """
    # TODO add support for vectorized items
    with open(os.path.join(workdir, file_name), "w") as f:
        print(name, file=f)
        print(len(params), file=f)
        print(len(map_sizes), file=f)
        for probe_type, size in map_sizes:
            # probe_type is a datamodel level ("thread"/"warp") mapped to an id
            print(f"{SUPPORTED_DATAMODEL[probe_type]},{size}", file=f)
+
# ENTRY for this tool
if __name__ == "__main__":
    # no argparse as the CLI is straightforward
    workdir = sys.argv[1]  # directory containing original.bin
    # the kernel name may carry non-UTF8 bytes from the driver; drop them
    kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore')

    probes: dict
    if len(sys.argv) > 3:  # NOTE to facilitate debugging, not used in production
        probe_path = sys.argv[3]
        probes = toml.load(probe_path)
        print(probes)
    else:  # the path in production: probes travel via environment variable
        probe_envvar = os.environ.get("NEUTRINO_PROBES")
        if probe_envvar is None:
            raise ValueError("Can not read probes from environment variable 'NEUTRINO_PROBES'")
        probes = toml.loads(probe_envvar)

    # parse the environment variable for filtered-out kernels, used for
    # 1. buggy kernels that crash the system -> many GPU errors are unrecoverable
    # 2. uninteresting kernels such as vectorized_elementwise in PyTorch
    filter_out = os.environ.get("NEUTRINO_FILTER", "")
    filter_out = filter_out.split(":") if len(filter_out) > 0 else None
    print(filter_out, file=log)

    filter_in = os.environ.get("NEUTRINO_KERNEL", "")
    filter_in = filter_in.split(":") if len(filter_in) > 0 else None
    print(filter_in, file=log)

    # NOTE check if some map is declared dynamic; if so we need an extra
    # counting pass with different arrangements.
    # BUGFIX: bool(os.environ.get("NEUTRINO_DYNAMIC", 0)) was truthy for ANY
    # non-empty string, including "0" — parse falsy spellings explicitly.
    dynamic = os.environ.get("NEUTRINO_DYNAMIC", "") not in ("", "0", "false", "False")

    try:
        # first objdump binary to ptx
        ptx = dump(workdir)
        # then truncate ptx for entry_name
        global_section, func_section, entry_section, _ = prune(ptx, kernel_name)
        # split and process ptx lines and write kernel info
        params, kernel_name = parse_params(entry_section)

        # basic logging
        print(kernel_name, file=log)
        if filter_in:
            # keep only kernels whose name contains one of the requested patterns
            if not any(pattern in kernel_name for pattern in filter_in):
                print(f"{kernel_name} is not in {filter_in}", file=log)
                exit(1)
        if filter_out:
            for pattern in filter_out:
                if pattern != "" and pattern in kernel_name:
                    print(f"{kernel_name} filtered out from {filter_out}", file=log)
                    exit(1)

        # write pruned ptx to file
        pruned_ptx = global_section + "\n" + func_section + "\n" + entry_section
        with open(os.path.join(workdir, "pruned.ptx"), "w") as f:
            f.write(pruned_ptx)

        # convert probes from Python dict to data structures
        probes, maps, regs = load(probes)

        # NOTE generate the counting kernel for dynamically sized maps
        if dynamic:
            # locate the (first) map whose cap is "dynamic"
            count_inst = []
            count_size = 0
            count_map = ""
            for map_ in maps:
                if map_.cap == "dynamic":
                    count_size = map_.size
                    count_map = map_.name
                    break
            for probe in probes:
                # NOTE there might be a bug that before and after can only save once
                # NOTE(review): `probe.pos` here but `probe.position` elsewhere — confirm attribute name
                if probe.before is not None and "SAVE" in probe.before and count_map in probe.before:
                    count_inst = count_inst + probe.pos
                if probe.after is not None and "SAVE" in probe.after and count_map in probe.after:
                    count_inst = count_inst + probe.pos
            assert len(count_inst) > 0
            count_inst = ":".join(count_inst)
            count_probe = COUNT_PROBE.format(count_inst=count_inst, count_size=count_size)
            count_probe, count_map, count_reg = load(toml.loads(count_probe))
            count_ptx = probing(entry_section, count_probe, count_map, count_reg)
            count_ptx = global_section + "\n" + func_section + "\n" + count_ptx
            with open(os.path.join(workdir, "countd.ptx"), "w") as f:
                f.write(count_ptx)

        # sizes the hook driver must allocate, aligned with parameter order
        map_sizes: list[tuple[str, int]] = []
        for map_ in maps:
            if isinstance(map_.cap, int):
                map_sizes.append((map_.level, map_.size * map_.cap))
            elif map_.cap == "dynamic":
                map_sizes.append((map_.level, -1))  # -1 := size only known at runtime
            else:
                raise NotImplementedError

        # process ptx lines
        probed_ptx = probing(entry_section, probes, maps, regs)

        # merge global and func sections back
        probed_ptx = global_section + "\n" + func_section + "\n" + probed_ptx

        # write ptx to file
        with open(os.path.join(workdir, "probed.ptx"), "w") as f:
            f.write(probed_ptx)

        write_kernel_info(kernel_name, params, map_sizes, workdir)

        # compile ptx to binary, we want both probed and pruned
        assemble(workdir, "probed")
        assemble(workdir, "pruned")
        if dynamic:
            assemble(workdir, "countd")

    except Exception as e:
        # top-level boundary: log the full traceback and signal failure
        traceback.print_exc(file=log)
        exit(1)
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py
new file mode 100644
index 0000000..39bd02c
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py
@@ -0,0 +1,486 @@
+"""Neutrino Probing Engine, AMD ROCm HIP Implementation
+
+HIP's AMDGCN (.amdgcn) has only one source: LLVM AMDGPU Backend
+CITE https://llvm.org/docs/AMDGPUUsage.html
+
+GCNAsm is similar to x86 assembly, kernel mainly has two parts (two .text):
+1. GCNAsm Code in form of .section .text
+2. AMD HSA Configuration or said `.amdgpu_metadata`
+
+NOTE At the moment, we only support part of syntax because there's no official
+ documentation on syntax / semantics of GCN Assembly (only instructions).
+
+Important GCNAsm syntax for Neutrino developers:
+1. s[0:1] holds pointer to kernargs, use s_load_dword to ld.param
+2. v0 (32bit) holds 00 + threadIdx.z(30-20)+threadIDx.y(20-10)+threadIdx.x(10-0)
+3. blockIdx.xyz is the next 3 registers after first three, gridDim.xyz is ttmp8/9/10
+4. It seems blockDim can not be retrieved easily from special registers, only via
+
+It's worth noticed that v0 and s[0:1] can be changed by developers, i.e., their
+value (threadIdx.xyz) is only available at kernel begins.
+
+NOTE Why not fully support?
+Because we can not find AMD GPUs for testing or debugging (we are not AMD).
+There's nearly no cloud providers for AMD GPUs (only MI300x on runpods.io).
+
+And part of AMD's ISA is ridiculous, for example, until CDNA3, add two u64
+is finally supported on VGPRs, still not supported on SGPRs. Why? Tell me why?
+"""
+
+from typing import List, Tuple, Optional, Dict, Set
+import os
+import sys
+import shutil
+import subprocess
+import traceback # usef for print backtrace to log file instead of stdout
+import toml # to load probes from envariables
+import yaml # AMD GCN ASM use YAML as METADATA Storage
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map, load
+from neutrino.probe import Ref, KernelParam
+
# NOTE(review): this runs at import time — importing the module without CLI
# args raises IndexError; fine for a CLI-only tool, but confirm nothing imports it
workdir = sys.argv[1] # directory contains original.bin
# shared process log; never closed explicitly (short-lived CLI, OS reclaims it)
log = open(os.path.join(workdir, "process.log"), 'w')

# a macro like terms: datamodel level -> numeric id written into kernel.info
SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 }

# NOTE applicable to CDNA GPUs but might not be applicable to GDNA GPUs
# TODO change to amdgpu_metadata['amdhsa.kernels'][0]['.wavefront_size']
WARP_SIZE = 64
+
@dataclass
class KernelParam: # NOTE GCNASM has a different definition than the PTX engine
    # value kind from amdgpu_metadata '.args' (e.g. 'global_buffer', 'by_value')
    value_kind: str
    # argument size in bytes, from '.size' in the metadata
    size: int
+
# NOTE it's risky but safe as this is a CLI tool invoked for specific kernel
# module-level mutable state: populated by prune(), read by parse_params()/probing()
amdgpu_metadata: Optional[Dict] = None
+# TODO finalize rocm-smi toolchain
+def get_arch() -> str:
+ """At the moment, we extract target arch from the assembly, but not sure
+ if this may leads to misleading arch for codegen, will see"""
+ ...
+
+# TODO finalize llvm-objdump toolchain
+def extract(workdir: str, name: str = "original", suffix: str = ".bin") -> str:
+ bin_path = os.path.join(workdir, name) + suffix
+ # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
+ result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
+ out = result.stdout
+ if "ASCII text" in result.stdout: # raw PTX file, just read it all
+ shutil.copyfile(bin_path, os.path.join(workdir, name) + ".asm")
+ print("[decompile] bin is gcnasm", file=log)
+ with open(os.path.join(workdir, name) + ".asm", "r") as outf:
+ return outf.read()
+
+# TODO add prune support
+def prune(asm: str, entry_name: str) -> Tuple[str, str]:
+ """A Minimum parser to truncate the gcn asm for specific entry
+
+ Use this function to locate a specific entry with entry_name as
+ single .asm / .s usually have > 1 entry!
+ """
+ # First find the two single line of .text
+ lines = asm.split("\n")
+ sections = []
+ target = None # assembler target like .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
+ for idx in range(len(lines)):
+ if ".text" in lines[idx]:
+ sections.append(idx) # record the sections
+ elif ".amdgcn_target" in lines[idx]:
+ target = lines[idx]
+ # reorganize sections
+ kernels = []
+ for idx in range(len(sections) - 2): # last section holds gcnasm
+ if "@function" in "\n".join(lines[sections[idx] : sections[idx + 1]]):
+ kernels.append(sections[idx])
+ kernels.append(sections[-1])
+ # Now locate the entry
+ # TODO add rough matching!!!
+ entry_section = None
+ for idx in range(len(kernels) - 2):
+ temp = "\n".join(lines[kernels[idx] : kernels[idx + 1]])
+ if entry_name in temp:
+ entry_section = temp
+ assert entry_section is not None, "Fail to find"
+ # adding target if not found
+ if target not in entry_section:
+ entry_section = entry_section.split("\n")
+ entry_section = entry_section[0] + target + "\n".join(entry_section[1:])
+ # fix the metadata section
+ last_section = "\n".join(lines[sections[-1]:])
+ assert ".amdgpu_metadata" in last_section
+ metadata = last_section[last_section.index(".amdgpu_metadata") + 16: last_section.index(".end_amdgpu_metadata") - 1] # BUG -1 is a fix
+ global amdgpu_metadata
+ amdgpu_metadata = yaml.safe_load(metadata)
+ for kernelmeta in amdgpu_metadata['amdhsa.kernels']:
+ if kernelmeta['.name'] == entry_name:
+ amdgpu_metadata['amdhsa.kernels'] = [kernelmeta, ] # only want this one
+ break
+ return entry_section, last_section
+
def parse_params() -> Tuple[List[KernelParam], str]:
    """Read the kernel name and argument list from the parsed amdgpu_metadata.

    Must be called after prune() has populated the global `amdgpu_metadata`
    (and narrowed 'amdhsa.kernels' to the single target kernel).
    """
    global amdgpu_metadata
    kernel_meta = amdgpu_metadata['amdhsa.kernels'][0]
    params = [KernelParam(arg['.value_kind'], arg['.size'])
              for arg in kernel_meta['.args']]
    return params, kernel_meta['.name']
+
+"""
+NOTE: templates for thread-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any thread-constant probes
+
+Most registers below is duplicate and will be optimized by AMD Assembler
+"""
+
+THREAD_BUFFER_COMMON = """;;# begin buffer calculation
+V_MOV_B32 v{thread_buff}, v0 ;;# v0 holds threadIdx.x, don't know what's threadIdx.y, threadIdx.z
+;;# end buffer calculation"""
+
+WARP_BUFFER_COMMON = """;;# begin buffer calculation
+V_LSHRREV_B32_E32 v{warp_buff}, 6, v0;; # shift 6 bits := // 64
+;;# end buffer calculation"""
+
+# NOTE buffer location for thread-local buffers, every probe has independent this part
+THREAD_BUFFER = """;;# begin {name} buffer
+S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
+V_MAD_I64_I32 v[{param_addr}], v{thread_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
+;;# end {name} buffer"""
+
+# BUG it shall be possible to move everything into sgpr, but I don't know how to locate
+# warpIdx in solely SGPR, please help me
+WARP_BUFFER = """;;# begin {name} buffer
+S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
+V_MAD_I64_I32 v[{param_addr}], v{warp_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
+;;# end {name} buffer"""
+
+def probing(asm: str, probes: List[Probe]) -> Tuple[str, List[int], str]:
+ """Probing the Assembly, the core of probing engine
+
+ NOTE we assume probe is parsed and (security checked)"""
+
+ # NOTE parse interesting locations
+ # A mapping from location to probes, a probe can hook at multiple location
+ positions: Dict[str, List[Probe]] = dict()
+ kernel_start_probes: List[Probe] = []
+ # NOTE turn kernel:end into ret:start for better matching
+ print(probes)
+ for probe in probes:
+ # different position split by ;, and inside split by : for start/end
+ for position in probe.position:
+ if position == "kernel": # turn into listening instructions
+ if probe.after is not None:
+ if "s_endpgm" in positions: # AMD use s_endpgm to terminate
+ positions["s_endpgm"].append(probe)
+ else:
+ positions["s_endpgm"] = [probe, ]
+ if probe.before is not None:
+ kernel_start_probes.append(probe)
+ else:
+ if position in positions:
+ positions[position].append(probe)
+ else:
+ positions[position] = [probe, ]
+
+ # NOTE parse GCN Assembly
+ gcn_lines = asm.split("\n") # let's do it line by line
+ # first extract basic kernel signature
+ body_start_line : int = 0 # first line of body
+ idx = 0
+ # NOTE specially handle kernel start probe
+
+ while idx < len(gcn_lines):
+ line = gcn_lines[idx]
+ # First try to find ; %bb.0: NOTE just the behavior of hipcc not standard syntax
+ # but we don't know what's the standard syntax, there's no documentation about this...
+ # maybe this is the reason why AMD product is hard to use?
+ if "%bb.0" in line:
+ body_start_line = idx
+ # BUG move it to the real beginning before loading and saving ?
+ for probe in kernel_start_probes:
+ gcn_lines.insert(idx + 1, Ref(line=line, probe=probe, before_after=True)) # place after
+ idx += 1
+ # here pattern matching positions TODO optimize performance here
+ else:
+ for position, probes in positions.items():
+ if position in line: # BUG might mismatch parameter with confused naming
+ # NOTE we got a match, then every probe will insert snippet before or after the line
+ # this might cause idx fluctuatting if we use idx to process it
+ line_idx = idx # a copy to fix the insertion position
+ for probe in probes:
+ # specially handle ret;, we need to place it before ret or it won't be executed
+ if position == "s_endpgm" and probe.after is not None:
+ gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ line_idx += 1
+ else:
+ if probe.before is not None:
+ gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True))
+ idx += 1
+ line_idx += 1
+ if probe.after is not None:
+ gcn_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ idx += 1
+
+ # work with register spaces, NOTE AMD GCN Asm don't have declartion syntax
+ # for registers, just a flatten v[0:1], we need to manage them manually
+ # GCN Asm has two register spaces:
+ # 1. VGPR (v0), holding thread-spcific values
+ # 2. SGPR (s0), holding warp-specifc values
+ # TODO we can optimize warp probes to SGPR only, avoiding VGPR usage
+
+ # Now add the probes to PTX Assembly
+ offset: int = 0 # adding every line need to offset 1 to make it correct
+ probe_mem_sizes: List[Tuple[str, int]] = [] #
+ # TODO parse these from meta
+ global amdgpu_metadata
+ # NOTE here the sgpr number is wrong, there'll be 6 more, I don't know why, I can only
+ # record it at the moment.
+ # BUG SGPR number from metadata doesn't match the actual usage. Always
+ # 6 more is used. I don't know why but let's keep it.
+ sgpr_all = amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count']
+ sgpr = 0
+ for idx in range(len(gcn_lines)):
+ if type(gcn_lines[idx]) is str and ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
+ sgpr = int(gcn_lines[idx].strip().split()[1])
+ sgpr_diff = sgpr_all - sgpr
+ vgpr = amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] # used for new stuff
+ param_off = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size']
+ param_align = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_align']
+ param_off = ((param_off + param_align - 1) // param_align ) * param_align # round up
+ params = []
+ thread_buff_vgpr, warp_buff_vgpr = None, None # conform Python scope
+
+ processed: Set[str] = set() # a set to avoid repeated process same probe that leads to error
+ datamodels: Set[str] = set()
+ for probe in probes:
+ if probe.name not in processed and probe.datamodel is not None:
+ probe_mem_sizes.append((probe.datamodel, int(probe.cap) * int(probe.no_bytes)))
+ processed.add(probe.name)
+ datamodels.add(probe.datamodel)
+
+ if "thread" in datamodels:
+ thread_buff_vgpr = f"{vgpr}"
+ gcn_lines.insert(body_start_line + offset + 1, THREAD_BUFFER_COMMON.format(thread_buff=thread_buff_vgpr))
+ offset += 1
+ vgpr += 1
+ if "warp" in datamodels:
+ warp_buff_vgpr = f"{vgpr}"
+ gcn_lines.insert(body_start_line + offset + 1, WARP_BUFFER_COMMON.format(warp_buff=warp_buff_vgpr))
+ offset += 1
+ vgpr += 1
+
+ # Now add the individual buffer calculation
+ processed = set()
+ for probe in probes:
+ if probe.name not in processed:
+ if probe.datamodel == "thread":
+ no_bytes = str(int(probe.cap) * int(probe.no_bytes))
+ gcn_lines.insert(body_start_line + offset + 1,
+ THREAD_BUFFER.format(name=probe.name, no_bytes=no_bytes,
+ param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
+ thread_buff=thread_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
+ probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
+ offset += 1
+ sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
+ vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
+ params.append({'.address_space': 'global', '.size': 8,
+ '.offset': param_off, '.value_kind': 'global_buffer'})
+ param_off += 8 # only pass in pointers so 8bytes := 64bits
+ elif probe.datamodel == "warp":
+ no_bytes = str(int(probe.cap) * int(probe.no_bytes))
+ gcn_lines.insert(body_start_line + offset + 1,
+ WARP_BUFFER.format(name=probe.name, no_bytes=no_bytes,
+ param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
+ warp_buff=warp_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
+ probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
+ offset += 1
+ sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
+ vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
+ params.append({'.address_space': 'global', '.size': 8,
+ '.offset': param_off, '.value_kind': 'global_buffer'})
+ param_off += 8 # only pass in pointers so 8bytes := 64bits
+ for reg in probe.registers:
+ if probe.registers[reg] == "b32":
+ if probe.datamodel == "warp":
+ probe.registers[reg] = f"s{sgpr}"
+ sgpr += 1
+ elif probe.datamodel == "thread":
+ probe.registers[reg] = f"v{vgpr}"
+ vgpr += 1
+ elif probe.registers[reg] == "b64":
+ if probe.datamodel == "warp":
+ probe.registers[reg] = f"s[{sgpr}:{sgpr+1}]"
+ sgpr += 2
+ elif probe.datamodel == "thread":
+ probe.registers[reg] = f"v[{vgpr}:{vgpr+1}]"
+ vgpr += 2
+ processed.add(probe.name)
+ # all rest is treated as no saving
+
+ # Now add the instruction listening
+ for idx in range(len(gcn_lines)):
+ # ignore most of line that is a string!
+ if type(gcn_lines[idx]) == Ref: # NOTE isinstance is slow?
+ line: str = gcn_lines[idx].line.strip()
+ probe: Probe = gcn_lines[idx].probe
+ before_after: str = gcn_lines[idx].before_after
+ # parse instruction operands, operands are separated by comma
+ if ";" in line: line = line[:line.find(";")]
+ tmp = line.split(",")
+ operands: List[str] = []
+ inst, op1 = tmp[0].split(" ")[0], tmp[0].split(" ")[-1] #
+ operands.append(op1)
+ for t in tmp[1:]:
+ operands.append(t.strip().split(" ")[0])
+ snippet = probe.before if before_after else probe.after
+ if "OUT" in snippet: snippet = snippet.replace("OUT", operands[0])
+ if "IN1" in snippet: snippet = snippet.replace("IN1", operands[1])
+ if "IN2" in snippet: snippet = snippet.replace("IN2", operands[2])
+ if "IN3" in snippet: snippet = snippet.replace("IN3", operands[3])
+
+ # Adding support for SAVE.u64 statement
+ # NOTE for reading the probe afterwards
+ snippet_lines = snippet.split("\n")
+ for snippet_line_idx in range(len(snippet_lines)):
+ snippet_line: str = snippet_lines[snippet_line_idx]
+ if "SAVE" in snippet_line: # only one save, at the begin of line
+ save_lines = [] # start a new string
+ items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",")
+ dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8]
+ if dtype == "u64":
+ for item_idx in range(len(items)):
+ item_val = probe.registers[items[item_idx].strip()]
+ save_lines.append(f"\tGLOBAL_STORE_DWORDX2 v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 8")
+ elif dtype == "u32":
+ for item_idx in range(len(items)):
+ item_val = probe.registers[items[item_idx].strip()]
+ save_lines.append(f"\tGLOBAL_STORE_DWORD v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 4")
+ else:
+ raise ValueError("Only Support Saving u32 / u64")
+ snippet_lines[snippet_line_idx] = "\n".join(save_lines)
+ snippet = "\n".join(snippet_lines)
+ for reg in probe.registers:
+ if reg in snippet:
+ snippet = snippet.replace(reg, probe.registers[reg])
+ # Finally replace the line
+ gcn_lines[idx] = snippet
+
+ # NOTE we need to modify the number of registers used in metasection
+    # 1. Modify the kernarg_size .amdhsa_kernarg_size 28
+ # 2. Modify the SGPRs used .amdhsa_next_free_sgpr 12
+ # 3. Modify the VGPRs used .amdhsa_next_free_vgpr 9
+ # Something might need .amdhsa_user_sgpr_count 2
+ for idx in range(len(gcn_lines)):
+ if ".amdhsa_kernarg_size" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_kernarg_size {param_off}"
+ elif ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_next_free_sgpr {sgpr}"
+ elif ".amdhsa_next_free_vgpr" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_next_free_vgpr {vgpr}"
+
+ # NOTE also modify the amdgpu_metadata, after all, becomes
+ amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count'] = sgpr + sgpr_diff
+ amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] = vgpr
+ amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size'] = param_off
+ amdgpu_metadata["amdhsa.kernels"][0]['.args'] += params
+ # Finally finished, we might need to finalize the metadata
+ return "\n".join(gcn_lines), probe_mem_sizes
+
+ # NOTE also add new parameters
+
+
+def assemble(workdir: str, name: str) -> None:
+ """Assemble the GCN Asm (probed.asm) into Machine Code (probed.bin)
+ NOTE AMD assembler command is part of Clang LLVM like
+ clang -cc1as -triple amdgcn-amd-amdhsa -filetype obj -target-cpu gfx942
+ -mrelocation-model pic -v -mllvm -amdgpu-early-inline-all=true -mllvm
+ -amdgpu-function-calls=false -o probed.bin probed.asm
+ """
+ # TODO need to locate the clang of ROCm, unlike like ptxas of unique name
+ asm_path = os.path.join(workdir, name) + ".asm"
+ bin_path = os.path.join(workdir, name) + ".bin" # target binary
+ command = ["clang", '-cc1as', '-triple', 'amdgcn-amd-amdhsa', '-filetype=obj',
+ f"-target-cpu={get_arch()}", '-mrelocation-model=pic', '--verbose',
+ '-mllvm', '-amdgpu-early-inline-all=true',
+ '-mllvm', '-amdgpu-function-calls=falsep',
+ asm_path, '-o', bin_path]
+ print(" ".join(command), file=log)
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=True
+ )
+ # print debug and verbose information to the process.log
+ if len(result.stderr) > 0:
+ print(result.stderr.decode("utf-8"), file=log)
+ if len(result.stdout) > 0:
+ print(result.stdout.decode("utf-8"), file=log)
+
+# ENTRY for this tool
+if __name__ == "__main__":
+ # no argparse as the CLI is straightforward
+ workdir = sys.argv[1] # directory contains original.bin
+ kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore') # for possible case with multiple entry in one binary
+
+ if len(sys.argv) > 3: # NOTE to facilitate debugging, not used in production
+ probe_path = sys.argv[3]
+ probe_toml = toml.load(probe_path)
+ else: # the pass in production
+ # parse the environment variable to read the probes
+ probe_envvar = os.environ.get("NEUTRINO_PROBES")
+ if probe_envvar is None:
+ raise ValueError("Can not read probes from envaraible 'NEUTRINO_PROBES'")
+ # load it via toml
+ probe_toml = toml.loads(probe_envvar)
+
+ # filter out, probes are nested dict in TOML via [name]
+ probes: Dict[str, dict] = dict()
+ analyze_hook = probe_toml["analyze_hook"] if "analyze_hook" in probe_toml else ""
+ for key, value in probe_toml.items():
+ if isinstance(value, dict):
+ probes[key] =value
+
+ probes = safe_load_probes(probes)
+ # apply a
+
+ try:
+ # first decompile binary to ptx
+ asm = extract(workdir)
+ # then truncate ptx for entry_name
+ entry_section, meta_section = prune(asm, kernel_name)
+
+ # split and process ptx lines and write kernel info
+ params, kernel_name = parse_params()
+
+ # basic logging
+ print(kernel_name, file=log)
+
+ # write pruned gcnasm to file
+ meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]
+ pruned_ptx = entry_section + "\n" + meta_section
+ with open(os.path.join(workdir, "pruned.asm"), "w") as f:
+ f.write(pruned_ptx)
+
+ probed_asm, probe_mem_sizes = probing(entry_section, probes)
+
+ # NOTE we need to update the meta_section we updated
+ # TODO split into multiple lines
+ meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]
+
+ # merge global and func back
+ probed_asm = probed_asm + "\n" + meta_section
+
+ # write probed gcnasm to file
+ with open(os.path.join(workdir, "probed.asm"), "w") as f:
+ f.write(probed_asm)
+
+ except Exception as e:
+ traceback.print_exc(file=log)
+ exit(1)
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
new file mode 100644
index 0000000..cac4710
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
@@ -0,0 +1,611 @@
+/**
+ * Common Definition of Neutrino Hooked Driver
+ *
+ * @note Keep common.h only Linux/GNU dependencies, no other platform-specifics
+ */
+#include