diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/main.py new file mode 100644 index 0000000..bc13223 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/main.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Runs environment setup, build, benchmark prep, and experiment runs checks for Neutrino (OSDI'25).""" + +from __future__ import annotations + +from pathlib import Path +from typing import Dict, Tuple +import os +import sys + + +from evaluator.utils import ( # noqa: E402 + EntryConfig, + LoggerConfig, + get_logger, + record_result, +) + +from oracle_artifact_build import OracleArtifactBuild # noqa: E402 +from oracle_benchmark_prep import OracleBenchmarkPrep # noqa: E402 +from oracle_env_setup import OracleEnvSetup # noqa: E402 +from oracle_experiment_runs import OracleExperimentRuns # noqa: E402 + + +def _resolve_workspace_paths() -> Tuple[Path, Path, Path]: + """Resolve and validate _agent_eval/ and neutrino/ locations. + + Expects either: + (1) _agent_eval/ and the Neutrino repo are located in the same workspace root; or + (2) _AGENT_EVAL_DIR and _NEUTRINO_HOME are set by the user. + """ + try: + env_agent_eval = os.environ.get("_AGENT_EVAL_DIR") + env_neutrino_home = os.environ.get("_NEUTRINO_HOME") + + agent_eval_dir = ( + Path(env_agent_eval).expanduser().resolve() + if env_agent_eval + else Path(__file__).resolve().parent + ) + + workspace_root = ( + Path(env_neutrino_home).expanduser().resolve() + if env_neutrino_home + else agent_eval_dir.parent.resolve() + ) + + if not agent_eval_dir.is_dir(): + raise RuntimeError( + f"Invalid _agent_eval dir: {agent_eval_dir}\n" + "Set _AGENT_EVAL_DIR to the directory containing main.py if needed." 
+ ) + + neutrino_repo_root = workspace_root / "neutrino" + if not neutrino_repo_root.is_dir(): + raise RuntimeError( + f"Invalid Neutrino workspace: {workspace_root}\n" + f"Expected to find a Neutrino repository directory at: {neutrino_repo_root}\n" + "This runner expects _agent_eval/ and the Neutrino repo to be located in the same workspace root.\n" + "Set _NEUTRINO_HOME to the workspace root if needed." + ) + + return agent_eval_dir, workspace_root, neutrino_repo_root + + except OSError as exc: + raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc + + +def _build_neutrino_config( + *, agent_eval_dir: Path, workspace_root: Path, neutrino_repo_root: Path +) -> EntryConfig: + """Constructs EntryConfig for the Neutrino evaluation bundle from resolved paths.""" + + return EntryConfig( + name="osdi25-neutrino", + home_dir=workspace_root, + repository_paths={ + "osdi25-neutrino": neutrino_repo_root, + }, + results_paths={ + # Need to add results dir + }, + ground_truth_paths={ + # Need _agent_eval/refs. 
+ }, + similarity_ratio=0.75, + ) + + +def main(argv: list[str]) -> int: + verbose = "--verbose" in argv + + results: Dict[str, int] = {} + score = 0 + + logger_name = os.environ.get("EVAL_LOGGER_NAME", "NEUTRINO-AGENT-EVALUATOR") + logger = get_logger(LoggerConfig(root_name=logger_name)) + + try: + agent_eval_dir, workspace_root, neutrino_repo_root = _resolve_workspace_paths() + NEUTRINO_CONFIG = _build_neutrino_config( + agent_eval_dir=agent_eval_dir, + workspace_root=workspace_root, + neutrino_repo_root=neutrino_repo_root, + ) + except RuntimeError as exc: + raise SystemExit(str(exc)) from exc + + env_checker = OracleEnvSetup(config=NEUTRINO_CONFIG, logger=logger) + score += record_result(results, type(env_checker).__name__, env_checker.run(verbose=verbose)) + + build_checker = OracleArtifactBuild(config=NEUTRINO_CONFIG, logger=logger) + score += record_result(results, type(build_checker).__name__, build_checker.run(verbose=verbose)) + + prep_checker = OracleBenchmarkPrep(config=NEUTRINO_CONFIG, logger=logger) + score += record_result(results, type(prep_checker).__name__, prep_checker.run(verbose=verbose)) + + runs_checker = OracleExperimentRuns(config=NEUTRINO_CONFIG, logger=logger) + score += record_result(results, type(runs_checker).__name__, runs_checker.run(verbose=verbose)) + + logger.info("Agent scores: %s", results) + return score + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/oracle_artifact_build.py new file mode 100644 index 0000000..d64f0be --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/_agent_eval/oracle_artifact_build.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""Artifact build oracle for Neutrino (OSDI'25). + +Validates: + - Repository working directory exists. 
#!/usr/bin/env python3
"""Artifact build oracle for Neutrino (OSDI'25).

Checks that the built artifact is usable:
  - the repository working directory exists,
  - the Neutrino module imports after installation,
  - the Neutrino CLI (optionally) answers `--help`.
"""

from collections.abc import Mapping, Sequence
from dataclasses import dataclass, field
import logging
from pathlib import Path
import sys

from evaluator.oracle_artifact_build_primitives import (
    BuildCommandRequirement,
    OracleArtifactBuildBase,
)
from evaluator.utils import BaseRequirement, EntryConfig


@dataclass(frozen=True, slots=True, kw_only=True)
class BuildTarget:
    """Declarative description of one build command to run.

    Attributes:
        name: Unique human-readable label for the command.
        cmd: argv sequence (never a shell string).
        relative_workdir: Optional working directory relative to the repo root.
        optional: Whether a failure is tolerated.
        timeout_seconds: Wall-clock limit for the command.
        env_overrides: Extra environment variables for the command.
    """

    name: str
    cmd: Sequence[str]
    relative_workdir: Path | None = None
    optional: bool = False
    timeout_seconds: float = 60.0
    env_overrides: Mapping[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Validate, then normalize: cmd becomes a tuple and the workdir a Path
        # (the dataclass is frozen, hence object.__setattr__).
        if not self.name:
            raise ValueError("BuildTarget.name must be non-empty")

        if isinstance(self.cmd, (str, bytes)) or not self.cmd:
            raise ValueError("BuildTarget.cmd must be a non-empty argv sequence")

        object.__setattr__(self, "cmd", tuple(self.cmd))

        if self.relative_workdir is not None and not isinstance(self.relative_workdir, Path):
            object.__setattr__(self, "relative_workdir", Path(self.relative_workdir))


class OracleArtifactBuild(OracleArtifactBuildBase):
    """The artifact build oracle for Neutrino."""

    def __init__(
        self,
        *,
        config: EntryConfig,
        logger: logging.Logger,
        targets: Sequence[BuildTarget] | None = None,
    ) -> None:
        super().__init__(logger=logger)
        self._config = config

        self._targets = tuple(
            targets if targets is not None else self._make_default_targets()
        )

        # Reject ambiguous target lists up front.
        names = [target.name for target in self._targets]
        if len(set(names)) != len(names):
            raise ValueError(f"Duplicate build target names: {names!r}")

    def _make_default_targets(self) -> tuple[BuildTarget, ...]:
        # Prefer the running interpreter so the import probe exercises the
        # same environment this oracle executes in.
        interpreter = sys.executable or "python"

        import_check = BuildTarget(
            name="neutrino: import test",
            cmd=(interpreter, "-c", "import neutrino; print(neutrino.__file__)"),
            timeout_seconds=30.0,
        )
        cli_check = BuildTarget(
            name="neutrino: CLI help (optional)",
            cmd=("neutrino", "--help"),
            optional=True,
            timeout_seconds=30.0,
        )
        return (import_check, cli_check)

    def requirements(self) -> Sequence[BaseRequirement]:
        """Returns an ordered list of build requirements to validate."""
        repo_root = self._config.repository_paths.get(self._config.name)

        if repo_root is None:
            # Surface a configuration problem as one failing requirement
            # instead of crashing the evaluator.
            missing = BuildCommandRequirement(
                name=f"config: missing repository_paths entry for {self._config.name!r}",
                optional=False,
                cwd=Path(self._config.home_dir) / "__MISSING_REPOSITORY_PATH__",
                cmd=("true",),
                timeout_seconds=30.0,
            )
            return (missing,)

        reqs: list[BaseRequirement] = []
        for target in self._targets:
            reqs.append(
                BuildCommandRequirement(
                    name=target.name,
                    optional=target.optional,
                    cwd=repo_root,
                    cmd=target.cmd,
                    relative_workdir=target.relative_workdir,
                    timeout_seconds=target.timeout_seconds,
                    env_overrides=target.env_overrides,
                )
            )
        return tuple(reqs)
+""" + +import logging +from pathlib import Path +from typing import Mapping, Sequence + +from evaluator import utils +from evaluator.utils import EntryConfig +from evaluator.oracle_env_setup_primitives import ( + DependencyVersionRequirement, + FilesystemPathRequirement, + OracleEnvSetupBase, + PathType, + VersionCompare, +) + + +def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: + """Returns a required path from a mapping with a clear error.""" + try: + return paths[key] + except KeyError as e: + raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e + + +class OracleEnvSetup(OracleEnvSetupBase): + """Validates environment prerequisites for Neutrino (OSDI'25).""" + + def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: + super().__init__(logger) + self._config = config + + def requirements(self) -> Sequence[utils.BaseRequirement]: + repo_root = _required_path( + self._config.repository_paths, self._config.name, label="repository_paths" + ) + + artifact_dir = repo_root / "artifact" + pkg_dir = repo_root / "neutrino" + + # Static evaluation requirements + reqs: list[utils.BaseRequirement] = [ + DependencyVersionRequirement( + name="python", + cmd=("python", "--version"), + required_version=(3, 11, 0), + compare=VersionCompare.GEQ, + ), + DependencyVersionRequirement( + name="pip", + cmd=("python", "-m", "pip", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + ), + DependencyVersionRequirement( + name="wget", + cmd=("wget", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + ), + DependencyVersionRequirement( + name="unzip", + cmd=("unzip", "-v"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + ), + FilesystemPathRequirement( + name="repo_root_exists", + path=repo_root, + path_type=PathType.DIRECTORY, + ), + FilesystemPathRequirement( + name="artifact_dir_exists", + path=artifact_dir, + 
path_type=PathType.DIRECTORY, + ), + FilesystemPathRequirement( + name="static_notebook_exists", + path=artifact_dir / "static.ipynb", + path_type=PathType.FILE, + ), + ] + + # Dynamic evaluation requirements + reqs.extend( + [ + DependencyVersionRequirement( + name="gcc", + cmd=("gcc", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=False, + ), + DependencyVersionRequirement( + name="nm", + cmd=("nm", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=False, + ), + DependencyVersionRequirement( + name="cmake", + cmd=("cmake", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=False, + ), + DependencyVersionRequirement( + name="make", + cmd=("make", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=False, + ), + DependencyVersionRequirement( + name="nvidia-smi", + cmd=("nvidia-smi",), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + ), + DependencyVersionRequirement( + name="ptxas", + cmd=("ptxas", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + ), + DependencyVersionRequirement( + name="cuobjdump", + cmd=("cuobjdump", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + ), + FilesystemPathRequirement( + name="dynamic_notebook_exists", + path=artifact_dir / "dynamic.ipynb", + path_type=PathType.FILE, + ), + ] + ) + + return reqs diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/MANIFEST.in b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/MANIFEST.in new file mode 100644 index 0000000..14eaab6 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/MANIFEST.in @@ -0,0 +1 @@ +recursive-include neutrino * \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/README.md 
b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/README.md
new file mode 100644
index 0000000..602edc1
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/README.md
@@ -0,0 +1,210 @@
+# Neutrino
+
+by [Huang Songlin](https://huangs0.github.io) and [Wu Chenshu](https://cswu.me) from the University of Hong Kong.
+
+Neutrino is a Probing-based GPU Kernel Profiler providing eBPF-like user experience for GPU Kernel Profiling, targeting:
+
+1. **Fine-granularity**: Directly works on instructions to offer the finest granularity that can be mapped to particular hardware units.
+2. **Programmability**: Extends the programmability of previous tools to support cooperation between probes.
+3. **Versatility**: Supports both value profiling (register values like memory addresses) and time profiling (timestamps from the device-side clock).
+4. **Hardware-Independence**: Supports both NVIDIA/CUDA and AMD/ROCm, more platforms to come!
+5. **Ecosystem-Compatibility**: Built-in compatible with PyTorch (and everything on top like Huggingface), JAX, Triton, CUTLASS...
+
+The foundations of this project are described in our OSDI '25 publication: [Neutrino: Fine-grained GPU Kernel Profiling via Programmable Probing](https://www.usenix.org/conference/osdi25/presentation/huang-songlin). Please consider citing this work if you use Neutrino!
+The [official documentation](https://open-neutrino.github.io) contains more installation instructions, tutorials, internals and the DMAT gallery!
+
+## Latest News
+* May 31, 2025: [Neutrino's artifact](https://github.com/open-neutrino/neutrino/tree/artifact) received all [badges](https://sysartifacts.github.io/osdi2024/badges) (Available, Functional, Reproduced) from OSDI 25 Artifact Evaluation!
+
+## Quick Start
+
+### Demos
+
+The following demos are hosted on Colab with a simple click `Runtime -> Run All`:
+
+| Demo | Colab Link|
+|---|---|
+| Unrevealing block scheduling cost of `torch.zeros` | ![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg) |
+| Visualizing FlashAttn-v2 Memory Access | ![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg) |
+| Warp Scheduling and Tailing Effect | ![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg) |
+
+### Installation
+
+Neutrino can be installed as a Python package from source. Building is fast (<30 seconds)!
+
+```bash
+# Virtual Environment is highly recommended!
+conda create -y -n neutrino python=3.11 && conda activate neutrino
+git clone https://github.com/open-neutrino/neutrino
+cd neutrino && python setup.py install && cd ..
+neutrino --help # test installation
+```
+
+Neutrino does not have pre-built wheels, please **DO NOT** `pip install neutrino`!
+
+## Using Neutrino
+
+Inspired by [eBPF](https://ebpf.io/what-is-ebpf/), a `probe` in Neutrino refers to a tiny sandboxed code snippet that can be attached to the GPU kernel at the assembly level (PTX, GCNAsm, SPIR-V) at runtime.
+`probe` offers a new programmable interface beyond traditional programming and provides a convenient way to gain observability into the black-boxed GPU runtime.
+Currently Neutrino probes can be written in two ways:
+1. Pythonic Tracing DSL, suitable for beginners.
+2. Direct Assembly probes wrapped in [TOML](https://toml.io/en/), suitable for advanced usage, but it is platform-dependent.
+
+
+
+
+
+
+
+ +### Pythonic Tracing DSL +```python +from neutrino import probe, Map +import neutrino.language as nl +CALLBACK = "block_sched.py" # for trace analysis +# declare maps for persistence +@Map(level="warp", type="array", size=16, cap=1) +class block_sched: + start: nl.u64 + elapsed: nl.u32 + cuid: nl.u32 +# declare probe registers shared across probes +start: nl.u64 = 0 # starting clock +elapsed: nl.u64 = 0 # elapsed time, initialized to 0 +# define probes with decorator +@probe(pos="kernel", level="warp", before=True) +def thread_start(): + start = nl.clock() +@probe(pos="kernel", level="warp") +def thread_end(): + elapsed = nl.clock() - start + block_sched.save(start, elapsed, nl.cuid()) +``` + + + +### Direct Assembly wrapped in TOML +```toml +# CUDA PTX Assembly Example +callback="block_sched.py" +[ map.block_sched ] +type = "array" +level = "warp" +size = "16" +cap = "1" +[ probe.thread_start_thread_end ] +position = "kernel" +level = "warp" +register = {"u32": 2, "u64": 3} +before = """.reg .b64 %PD<3>; +.reg .b32 %P<2>; +mov.u64 %PD0, %clock64;""" +after = """mov.u64 %PD1, %clock64; +sub.u64 %PD1, %PD1, %PD0; +cvt.u32.u64 %P1, %PD1; +mov.u32 %P2, %smid; +SAVE [ block_sched ] {%PD0, %P1, %P2};""" +``` + +
+
+The interface of `@neutrino.Probe` is inspired by [Triton](https://triton-lang.org/main/index.html) whose contents (left) will be compiled, rather than executed, into platform-specific assemblies (right).
+Probes of the same `level` and `pos` will be merged.
+
+The formulation (and the name) of `@neutrino.Map` is inspired by [eBPF Map](https://docs.ebpf.io/linux/concepts/maps/). With structured definition, Neutrino can have safe (no illegal memory access) and efficient (race-free, no atomics) persistence.
+
+To simplify the development, Neutrino also provides some helper functions / operands:
+* `nl.clock() / nl.time()`: for reading the device-side clock and timer.
+* `nl.addr/out/in1/in2/in3`: for reading register values
+* `Map.save()`: for persisting values for posterior analysis.
+
+## Compatibility
+
+More information can be found in our documentation. If you have more platforms or workloads that need support, please raise an issue to let us know!
+
+
+
+
+
+
+
+ +### Hardware + + +| Hardware Platform | Support Status | +| --- | --- | +| NVIDIA/CUDA/PTX | ✅ Supported | +| AMD/ROCm/GCNAsm | 🛠️ Testing | +| General/OpenCL/SPIR-V | 🚀 Planning | + + + +### Software + +| Software Framework | Status | +| --- | --- | +| cuBLAS/cuFFT/cuSparse... | ❌ (no plan for supporting) | +| CUTLASS | ✅ (with macro in building) | +| PyTorch family (torchvision...) | ✅ (with custom build) | +| JAX | ✅ (with envariable in runtime) | +| Triton | ✅ | + +
+
+## Internals
+
+`neutrino` is designed to operate in the following workflow:
+
+workflow
+
+The source code is organized in the following structure:
+
+```
+neutrino
+├── language            # DSL and Compiler, Still in Testing
+│   ├── __init__.py     # DSL Primitive
+│   ├── compiler.py     # Exported Compiler API
+│   ├── frontend.py     # Parser and AST Transformer
+│   ├── gcn.py          # AMD ROCm Codegen Backend
+│   └── ptx.py          # CUDA PTX Codegen Backend
+├── probe               # Probe Engine
+│   ├── __init__.py     # Common Definition and Utilities
+│   ├── cuda.py         # CUDA PTX Impl
+│   └── hip.py          # AMD ROCm Impl
+├── src                 # Hook Driver
+│   ├── common.h        # Platform-agnostic Definition (GNU-only)
+│   ├── cuda.c          # CUDA Impl (NVIDIA-related)
+│   ├── hip.c           # ROCm Impl (AMD-related)
+│   ├── preload.c       # Injector via LD_PRELOAD
+│   ├── parse.py        # Generate Unhook API (NVIDIA/AMD)
+│   ├── sha1.h          # third-party header-only library
+│   └── uthash.h        # third-party header-only library
+├── build.py            # Builder for driver in src/
+├── cli.py              # Command Line Interface Entry
+├── common.py           # Common Internal API not for User import
+└── __init__.py         # Common Defn for user import like probe, Map
+```
+
+The overall structure is clean and approachable; we welcome developers to hack the system for their needs. Raise issues if you need help.
+
+## More
+
+* How to write my probe? Check the [Probe Writing Guide](https://open-neutrino.github.io/docs/write-probes).
+* How are probes executed? Check the [Probe Execution Model](https://open-neutrino.github.io/docs/execute-model).
+* How to read the neutrino trace? Check the [Trace File Structure](https://open-neutrino.github.io/docs/read-trace).
+* How Neutrino works and how to extend it? [Check the Reference and Internals](https://open-neutrino.github.io/docs/system-workflow).
+* How good is Neutrino? Check the [Utilities and Extensions](https://open-neutrino.github.io/docs/analysis-code)
+
+## Citation
+If you used Neutrino in your research, please cite the paper below. 
# Neutrino Generated Code for Reading Trace
import struct
import sys  # was missing: the script tail reads sys.argv
from typing import NamedTuple, List, Tuple
from neutrino import TraceHeader, TraceSection


class block_sched(NamedTuple):
    """One per-warp scheduling record: start clock, elapsed cycles, CU id."""
    start: int
    elapsed: int
    cuid: int


def parse(path: str) -> Tuple[TraceHeader, List[TraceSection], List[List[block_sched]]]:
    """Parses a Neutrino trace file into (header, sections, per-block records)."""
    with open(path, "rb") as f:
        # Header is 8 int32 fields (grid dims, block dims, shared mem, probe
        # count). The unpacked tuple must be splatted: TraceHeader is a
        # NamedTuple with 8 fields, not one taking a single tuple argument.
        header: TraceHeader = TraceHeader(*struct.unpack("iiiiiiii", f.read(32)))
        sections: List[TraceSection] = []
        for _ in range(header.numProbes):
            size, offset = struct.unpack("QQ", f.read(16))
            # NOTE(review): the on-disk section record carries only
            # (size, offset) while TraceSection also declares warpDiv;
            # it is filled with 0 here -- confirm against the trace writer.
            sections.append(TraceSection(size, 0, offset))
        gridSize = header.gridDimX * header.gridDimY * header.gridDimZ
        blockSize = header.blockDimX * header.blockDimY * header.blockDimZ
        records: List[List[block_sched]] = []
        for i in range(gridSize):
            records.append([])
            for j in range(blockSize):
                start, elapsed, cuid = struct.unpack("QII", f.read(16))
                records[i].append(block_sched(start, elapsed, cuid))
    return header, sections, records
# END OF GENERATED CODE
import numpy as np


def _main(trace_path: str) -> None:
    """Prints the mean scheduling gap and mean work time across CUs."""
    header, sections, records = parse(trace_path)

    # cuid values are hardware CU/SM ids: they need not be contiguous nor
    # smaller than the number of distinct CUs, so map them to dense indices
    # instead of indexing lists with the raw id.
    unique_cus = sorted({block[0].cuid for block in records})
    cu_index = {cuid: i for i, cuid in enumerate(unique_cus)}

    # One independent timeline per CU. The original `[[]] * n` aliased a
    # single shared list across every CU, corrupting the bookkeeping.
    cu_timelines = [[] for _ in unique_cus]
    sched_times = [0.0] * len(unique_cus)
    work_times = [0.0] * len(unique_cus)

    for cur in records:
        cu = cu_index[cur[0].cuid]
        sched_out = False
        for block in cu_timelines[cu]:
            if block.start + block.elapsed <= cur[0].start:
                # This block started after `block` finished on the same CU:
                # account the idle gap and replace the finished block.
                sched_times[cu] += cur[0].start - (block.start + block.elapsed)
                cu_timelines[cu].remove(block)
                cu_timelines[cu].append(cur[0])
                work_times[cu] += cur[0].elapsed
                sched_out = True
                break
        if not sched_out:
            cu_timelines[cu].append(cur[0])
            work_times[cu] += cur[0].elapsed

    print(np.array(sched_times).mean(),
          np.array(work_times).mean())


if __name__ == "__main__":
    _main(sys.argv[1])
from typing import NamedTuple, Union, Literal


class TraceHeader(NamedTuple):
    # Launch geometry of the traced kernel plus the number of probe sections.
    gridDimX: int
    gridDimY: int
    gridDimZ: int
    blockDimX: int
    blockDimY: int
    blockDimZ: int
    sharedMemBytes: int
    numProbes: int


class TraceSection(NamedTuple):
    # Location of one probe's records inside the trace file.
    # NOTE(review): generated readers construct TraceSection from two
    # on-disk values -- confirm whether warpDiv belongs in the layout.
    size: int
    warpDiv: int
    offset: int


def probe(pos: str, after: bool = False, level: str = "thread", size: int = 0):
    """Neutrino Probe Entry"""
    from functools import wraps

    # Probes are consumed by the JIT as AST only; the returned wrapper
    # exists solely to block accidental direct calls.
    def _decorate(func: callable):
        @wraps(func)
        def _refuse_call(*args, **kwargs):
            raise RuntimeError(f"{func.__name__} shall be jit other than run")
        return _refuse_call
    return _decorate


def Map(level: Literal["warp", "thread"], type: str, size: int, cap: Union[int, Literal["dynamic"]]):
    """Neutrino Map Definition"""
    from functools import wraps

    # Same trick as `probe`: the decorated class is compile-time metadata
    # and must never be instantiated at runtime.
    def _decorate(cls):
        @wraps(cls)
        def _refuse_call(*args, **kwargs):
            raise RuntimeError(f"{cls.__name__} shall be jit other than run")
        return _refuse_call
    return _decorate

# Following are internal definition
import os
import sys
import subprocess
from pprint import pprint

# toml is needed to persist the build configuration; bootstrap it when the
# environment lacks it. Only ImportError means "not installed" -- the
# original bare `except:` also masked unrelated failures.
try:
    import toml
except ImportError:
    import pip
    pip.main(["install", "toml"])
    import toml

CURDIR = os.path.dirname(os.path.realpath(__file__))


def check_command(cmd: str) -> bool:
    """Returns True when `cmd` is executable on PATH.

    A nonzero exit status (e.g. nvidia-smi with no visible device) still
    proves the binary exists; with `check=True` that case used to raise an
    uncaught CalledProcessError and crash the build.
    """
    try:
        subprocess.run([cmd], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, text=True, check=True)
        return True
    except FileNotFoundError:
        return False
    except subprocess.CalledProcessError:
        return True


# Use rocm-smi or nvidia-smi to detect which GPU stack is installed
if check_command("rocm-smi"):
    NEUTRINO_MODE = "HIP"
    NEUTRINO_DRIVER_HEADER_NAME = "hip/hip_runtime_api.h"
    NEUTRINO_IMPL_SRC = "hip.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libamdhip64.so.6"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/opt/rocm/include/",  # AFAIK, add if new path is met
    ]
    extra_flags = ["-D__HIP_PLATFORM_AMD__"]
elif check_command("nvidia-smi"):
    NEUTRINO_MODE = "CUDA"
    NEUTRINO_DRIVER_HEADER_NAME = "cuda.h"
    NEUTRINO_IMPL_SRC = "cuda.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libcuda.so.1"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/usr/local/cuda/targets/x86_64-linux/include/",   # for x86
        "/usr/local/cuda/targets/aarch64-linux/include/",  # for ARM
        # add if missed
    ]
    extra_flags = []
else:
    raise RuntimeError("ONLY SUPPORT CUDA and HIP(AMD-ONLY)")

# Internal Configurations
SRC_DIR = os.path.join(CURDIR, "src")
BUILD_DIR = os.path.join(CURDIR, "build")
CC = "cc"  # NOTE don't use nvcc or hipcc, need gcc or clang
PY = sys.executable

# The compile/link steps below write into BUILD_DIR; make sure it exists
# (a fresh checkout may not ship an empty build/ directory).
os.makedirs(BUILD_DIR, exist_ok=True)

# Locate the driver header: the first search-path entry containing it wins;
# if none matches, the last entry is kept (original behavior).
for dir_ in NEUTRINO_DRIVER_HEADER_SEARCH_PATH:
    try:
        if NEUTRINO_DRIVER_HEADER_NAME in os.listdir(dir_):
            break
    except OSError:
        pass  # path absent/unreadable: keep scanning
# NOTE this will be written in config.toml
NEUTRINO_DRIVER_HEADER_DIR = dir_

# NOTE Locate Driver Shared Library
# inspired by: https://github.com/triton-lang/triton/commit/58c54455ffa691be64f90f4e856501162373572c#diff-3d1f29795218f61553ab953426c15fa1e4162b224405b85529022293054da57aR25
# but we need to further locate the real driver library
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
locs = [line.split()[-1] for line in libs.splitlines() if NEUTRINO_HOOK_DRIVER_LIB_NAME in line]
env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
if env_ld_library_path and not locs:
    locs = [os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME) for dir_ in env_ld_library_path.split(":")
            if os.path.exists(os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME))]

# try to locate the pointed path
NEUTRINO_REAL_DRIVER_LIB_NAME = ""
NEUTRINO_REAL_DRIVER_LIB_DIR = ""
for loc in locs:
    # os.readlink raises on a regular file; ldconfig can list the real
    # library directly, so only dereference actual symlinks.
    real_lib = os.readlink(loc) if os.path.islink(loc) else os.path.basename(loc)
    # NOTE fix lib32 and i386 bug
    if "lib32" not in loc and "lib32" not in real_lib and "i386" not in loc and "i386" not in real_lib:
        if not real_lib.startswith("/"):
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = real_lib
        else:
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = os.path.basename(loc)

print(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME, file=sys.stderr)

# NOTE call parse.py to generate the unhook API from the driver header
cmd = [PY, os.path.join(SRC_DIR, "parse.py"),
       os.path.join(NEUTRINO_DRIVER_HEADER_DIR, NEUTRINO_DRIVER_HEADER_NAME),
       os.path.join(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE compile cuda.c/hip.c with common.h
cmd = [CC, os.path.join(SRC_DIR, NEUTRINO_IMPL_SRC), "-fPIC", "-shared", "-ldl", "-lpthread", "-O3", *extra_flags,
       "-I", NEUTRINO_DRIVER_HEADER_DIR, "-o", os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE compile preload.c
cmd = [CC, os.path.join(SRC_DIR, "preload.c"), "-fPIC", "-shared", "-O3",
       "-o", os.path.join(BUILD_DIR, "preload.so")]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE create a symbolic link like libcuda.so -> libcuda.so.1
# TODO verify if this is need
cmd = ["ln", "-sf", NEUTRINO_HOOK_DRIVER_LIB_NAME,
       os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME[:NEUTRINO_HOOK_DRIVER_LIB_NAME.index("so") + 2])]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE dump system configuration for CLI usage
config = {}
config["system"] = {
    "NEUTRINO_MODE": NEUTRINO_MODE,
    "NEUTRINO_DRIVER_HEADER_NAME": NEUTRINO_DRIVER_HEADER_NAME,
    "NEUTRINO_DRIVER_HEADER_DIR": NEUTRINO_DRIVER_HEADER_DIR,
    "NEUTRINO_HOOK_DRIVER_LIB_NAME": NEUTRINO_HOOK_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_NAME": NEUTRINO_REAL_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_DIR": NEUTRINO_REAL_DRIVER_LIB_DIR,
}
# Use a context manager so the config file is flushed/closed deterministically
# (the original left the handle open).
with open(os.path.join(BUILD_DIR, "config.toml"), "w") as config_file:
    toml.dump(config, config_file)

print("Build Success, Configuration")
print("============================")
pprint(config)
"config.toml"))["system"] + # default configurations, can be overwritten by CLI parameters + NEUTRINO_HOOK_DRIVER_NAME: str = config["NEUTRINO_HOOK_DRIVER_LIB_NAME"] + NEUTRINO_REAL_DRIVER_DIR : str = config["NEUTRINO_REAL_DRIVER_LIB_DIR"] + NEUTRINO_REAL_DRIVER_NAME: str = config["NEUTRINO_REAL_DRIVER_LIB_NAME"] + NEUTRINO_MODE : str = config["NEUTRINO_MODE"] + # available built-in tools + NEUTRINO_TOOLS = {tool[:-3] : tool for tool in os.listdir(NEUTRINO_TOOLS_DIR) if tool.endswith(".py")} + + parser = argparse.ArgumentParser( + prog='neutrino', usage='%(prog)s [options] command', + description=f"""NOTE: Probes must be given via -p (--probe) option. Buit-in tools: {tuple(NEUTRINO_TOOLS.keys())}""", + epilog="Examples: `neutrino -t gmem_bytes python test/zero_.py`. Open issue(s) in https://github.com/neutrino-gpu/neutrino if encountered problems", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('-p', '--probe', required=True, + help="probe in form of path to the file") + parser.add_argument('--tracedir', default="./trace", + help="parent folder of traces") + parser.add_argument('--driver', default=os.path.join(NEUTRINO_REAL_DRIVER_DIR, NEUTRINO_REAL_DRIVER_NAME), + help='path to the real cuda/hip driver shared library') + parser.add_argument("--python", default=sys.executable, + help='path to python executable used') + parser.add_argument('--filter', + help='filter OUT buggy kernels by (part of) name, split by :') + parser.add_argument('-k', '--kernel', + help='filter the kernel by (part of) name, split by :') + parser.add_argument('--callback', + help='attach callback for trace analysis') + parser.add_argument('--benchmark', action='store_true', + help="enable benchmark mode to evaluate overhead w.r.t. 
the original kernel") + parser.add_argument('--memusage', action='store_true', + help="prevent the profiling and only measure the memory usage") + # put command at the end of command + parser.add_argument("command", nargs=argparse.REMAINDER) + # parse the arguments + args = parser.parse_args() + + # same as this executable + NEUTRINO_PYTHON: str = args.python # default to be this executable + # directory to put the trace + NEUTRINO_TRACEDIR: str = args.tracedir + # filter of kernel + NEUTRINO_FILTER: str = args.filter if args.filter is not None else "" + NEUTRINO_KERNEL: str = args.kernel if args.kernel is not None else "" + # Benchmark mode, will include an additional launch after the trace kernel + # Used to measure the kernel-level slowdown of Neutrino, disabled by default + NEUTRINO_BENCHMARK: str = str(int(args.benchmark)) + NEUTRINO_MEMUSAGE: str = str(int(args.memusage)) + # Path to the real driver + NEUTRINO_REAL_DRIVER: str = args.driver + # command to be executed + command: str = args.command + assert len(command) > 0, "Command must be specified" + + # Parse the PROBE + NEUTRINO_PROBE_PATH: str = args.probe + NEUTRINO_READING: str = None + # NOTE endswith .py triggers the Tracing DSL + if NEUTRINO_PROBE_PATH.endswith(".py"): + from neutrino.language.compiler import compile + source = open(NEUTRINO_PROBE_PATH, "r").read() + NEUTRINO_PROBE = compile(NEUTRINO_MODE, source) + elif NEUTRINO_PROBE_PATH.endswith(".toml"): + NEUTRINO_PROBE = toml.load(NEUTRINO_PROBE_PATH) + else: + # No suffix := use built-in tools + if NEUTRINO_PROBE_PATH not in NEUTRINO_TOOLS: + print(f"[error] {NEUTRINO_PROBE_PATH} not in tools: {NEUTRINO_TOOLS}", file=sys.stderr) + exit(-1) + else: + from neutrino.language.compiler import compile + source = open(os.path.join(NEUTRINO_TOOLS_DIR, NEUTRINO_TOOLS[NEUTRINO_PROBE_PATH]), "r").read() + NEUTRINO_PROBE = compile(NEUTRINO_MODE, source) + + # NOTE generate the trace reading code + from neutrino.utils.trace_reading import 
gen_reading_code + NEUTRINO_READING = gen_reading_code(NEUTRINO_PROBE) + + # NOTE check if dynamic is True, shall have a specific keyword in top-level of probe + NEUTRINO_DYNAMIC = "dynamic" in NEUTRINO_PROBE and NEUTRINO_PROBE["dynamic"] is True + + # TODO change the callback to other places + NEUTRINO_CALLBACK = NEUTRINO_PROBE["CALLBACK"] if "CALLBACK" in NEUTRINO_PROBE else None + NEUTRINO_CALLBACK = args.callback if args.callback is not None else NEUTRINO_CALLBACK + if NEUTRINO_CALLBACK: + # search the path + if not os.path.exists(NEUTRINO_CALLBACK): + searched = os.path.join(os.path.dirname(NEUTRINO_PROBE_PATH), NEUTRINO_CALLBACK) + if os.path.exists(searched): + NEUTRINO_CALLBACK = searched + else: + print(f"[warn] callback {NEUTRINO_CALLBACK} not found") + NEUTRINO_CALLBACK = None + + # a copied environment variables + env = os.environ.copy() + # configure Neutrino related environment variables + env["NEUTRINO_REAL_DRIVER"] = NEUTRINO_REAL_DRIVER + env["NEUTRINO_DRIVER_NAME"] = NEUTRINO_HOOK_DRIVER_NAME + env["NEUTRINO_HOOK_DRIVER"] = os.path.join(NEUTRINO_BUILD_DIR, NEUTRINO_HOOK_DRIVER_NAME) + env["NEUTRINO_PYTHON"] = NEUTRINO_PYTHON + env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_BUILD_DIR, "process.py") + env["NEUTRINO_FILTER"] = NEUTRINO_FILTER + env["NEUTRINO_KERNEL"] = NEUTRINO_KERNEL + env["NEUTRINO_TRACEDIR"] = NEUTRINO_TRACEDIR + env["NEUTRINO_PROBES"] = toml.dumps(NEUTRINO_PROBE) # dump it to string + # GNU LD_PRELOAD to overwrite dlopen, https://man7.org/linux/man-pages/man8/ld.so.8.html + env["LD_PRELOAD"] = os.path.join(NEUTRINO_BUILD_DIR, "preload.so") + # Add to the LD_LIBRARY_PATH, this would overwrite ldconfig + if "LD_LIBRARY_PATH" in env: + env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR + ":" + env["LD_LIBRARY_PATH"] + else: + env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR + # An Environmental Variable to enable the trace + # NOTE some bugs here -> still working on + env["NEUTRINO_ENABLE"] = "1" + # An Environmental Variable to enable the 
benchmark mode + env["NEUTRINO_BENCHMARK"] = NEUTRINO_BENCHMARK + env["NEUTRINO_MEMUSAGE"] = NEUTRINO_MEMUSAGE + # An Environmental Variables to enable the debug mode -> more messages + # env["NEUTRINO_VERBOSE"] = "1" + if NEUTRINO_DYNAMIC: + env["NEUTRINO_DYNAMIC"] = "1" + if NEUTRINO_READING: + env["NEUTRINO_READING"] = NEUTRINO_READING + if NEUTRINO_CALLBACK: + env["NEUTRINO_CALLBACK"] = NEUTRINO_CALLBACK + + # FIX for Triton + if NEUTRINO_MODE == "CUDA": + env["TRITON_LIBCUDA_PATH"] = NEUTRINO_BUILD_DIR + env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "cuda.py") + elif NEUTRINO_MODE == "HIP": + # NOTE There's a bug in Triton's impl here, for path we refer to the + # directory for ld.so to search, instead of spcific file name ... + env["TRITON_LIBHIP_PATH"] = os.path.join(NEUTRINO_BUILD_DIR, "libamdhip64.so") + env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "hip.py") + + # start the program with new environment + if len(command) > 0: + proc = subprocess.Popen(command, env=env) + proc.wait() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py new file mode 100644 index 0000000..873c5fc --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py @@ -0,0 +1,68 @@ +"""Neutrino Internal APIs, not for user import""" + +from typing import Optional, Literal, Union +from dataclasses import dataclass + +@dataclass +class Register: + name: str + dtype: Literal['u32', 'u64'] + init: Optional[int] = None + + +@dataclass +class Probe: + name: str # name is the key in TOML + level: Literal["thread", "warp"] # level of the probe + pos: list[str] # := tracepoint in the paper + before: Union[list, str] = None # snippet inserted before, one of before and after shall be given + after: Union[list, str] = None # 
@dataclass
class Map:
    """Declaration of an on-GPU buffer that probe snippets save records into."""
    name: str                            # map name, used as the TOML key
    level: Literal["thread", "warp"]     # one buffer slot per thread or per warp
    type: Literal["array"]               # buffer layout, only "array" is declared
    size: int                            # bytes per record
    cap: Union[int, Literal["dynamic"]]  # records per slot, or "dynamic"
    regs: list[Register]                 # registers saved per record


def load(raw: dict) -> tuple[list[Probe], list[Map], int]:
    """Unserialize Neutrino probes in Python dict to probes, maps, regs.

    Args:
        raw: dict with "probe", "map" and "regs" top-level keys, as produced
             by dump() (or hand-written in TOML and parsed).

    Returns:
        (probes, maps, num_regs) tuple.

    Raises:
        AssertionError: on a malformed probe entry.
        KeyError: on a malformed map entry or missing "regs".
    """
    assert "probe" in raw.keys() and "map" in raw.keys(), "At least a probe and a map"
    probes: list[Probe] = []
    maps: list[Map] = []
    for name, probe in raw["probe"].items():
        # first validate the probe entry
        keys = probe.keys()
        assert "position" in keys or "pos" in keys, f"[error] {name} has no position (required)"
        assert "before" in keys or "after" in keys, f"[error] {name} is empty, one of before or after shall be given"
        assert "level" in keys and probe["level"] in ("warp", "thread"), f"[error] {name}: level must be given and one of 'warp', 'thread'"
        # BUG FIX: the validation above accepts "position" as an alias for
        # "pos", but the original unconditionally read probe["pos"], raising
        # KeyError for probes that only provided "position".
        position = probe["pos"] if "pos" in keys else probe["position"]
        probes.append(Probe(name=name,
                            level=probe["level"],
                            pos=position.split(":"),
                            before=probe["before"] if "before" in keys else None,
                            after=probe["after"] if "after" in keys else None))
    for name, map_ in raw["map"].items():
        maps.append(Map(name=name,
                        level=map_["level"],
                        type=map_["type"],
                        size=map_["size"],
                        cap=map_["cap"],
                        regs=[Register(name, val[0], init=val[1]) for name, val in map_["regs"].items()]))
    return probes, maps, raw["regs"]


def dump(probes, maps, regs, callback = "") -> dict:
    """Serialize Neutrino probes to Python dict (inverse of load).

    The optional callback path is stored under the "CALLBACK" key only when
    non-empty, matching what cli.py looks for.
    """
    dict_probe = {
        "regs": regs,
        "probe" : {p.name: {"level": p.level, "pos": p.pos, "before": p.before, "after": p.after} for p in probes},
        "map": {m.name: {"level": m.level, "type": m.type, "size": m.size, "cap": m.cap, "regs": {r.name: [r.dtype, r.init] for r in m.regs}} for m in maps}
    }
    if len(callback) > 0:
        dict_probe["CALLBACK"] = callback
    return dict_probe
"""Neutrino Trace Language Primitive"""
from functools import wraps
from typing import TypeAlias

# NOTE Neutrino Language's type system is incomplete and tiny because they're
# placeholders for compilers instead of functional code for Python.
u32: TypeAlias = int
u64: TypeAlias = int
reg: TypeAlias = int

# dtype names the frontend accepts in annotations (frontend.py checks
# `annotation.attr in TYPES`)
TYPES = ["u32", "u64"]
# NOTE(review): frontend.visit_Call lowers "cuid" (not "smid") plus time/
# clock/save — confirm whether FUNCS listing "smid" is intentional.
FUNCS = ["smid", "time", "clock", "save"]

# The function stubs below are compiler placeholders: calling them in a probe
# is lowered to IR by the DSL frontend, they never execute in Python.
def smid() -> u32: ...

def time() -> u64: ...

def clock() -> u64: ...

def save(regs: list[reg], dtype) -> None: ...

# @_disable_execution_
# def tid() -> None: ...

# @_disable_execution_
# def pid() -> None: ...

# Following are helpers for parsing register operands; the frontend maps
# these attribute names to their upper-case IR spellings (OUT, IN1, ...).
out: reg = ...
in1: reg = ...
in2: reg = ...
in3: reg = ...
in4: reg = ...
addr: reg = ...
bytes: reg = ...  # NOTE(review): shadows the builtin `bytes` inside this module
"""CLI Entry of Language Submodule"""
from neutrino.common import Probe, dump
from neutrino.language.frontend import parse

def compile(mode: str, source: str) -> dict:
    """Compile the Tracing DSL into Assembly Probes.

    Args:
        mode: target backend, "CUDA" (PTX) or "HIP" (GCN).
        source: Tracing DSL source text.

    Returns:
        The serialized probe dict (see neutrino.common.dump), with a
        top-level "dynamic": True added when any map has dynamic capacity.
        (BUG FIX: the former `-> tuple[str, str]` annotation never matched
        the actual return value.)

    Raises:
        ValueError: when mode is not a supported backend.
    """
    regs, probes, maps, callback = parse(source)
    if mode == "CUDA":
        from neutrino.language.ptx import gencode
        probes = gencode(probes)
    elif mode == "HIP":
        # NOTE(review): gcn.py currently contains only a docstring — confirm
        # gencode exists there before relying on HIP mode.
        from neutrino.language.gcn import gencode
        probes = gencode(probes)
    else:
        # ROBUSTNESS: previously an unknown mode silently passed un-lowered
        # IR lists downstream; fail loudly instead.
        raise ValueError(f"unsupported mode {mode!r}, expected 'CUDA' or 'HIP'")

    # NOTE Merge probes of the same level and pos so that each tracepoint is
    # instrumented exactly once; snippets are concatenated in probe order.
    merged_probes: dict[tuple[str, str], Probe] = {}
    for probe in probes:
        key = (probe.level, probe.pos)
        if key not in merged_probes:
            merged_probes[key] = probe
        else: # merge
            merged_probes[key].name += "_" + probe.name
            merged_probes[key].before = (
                (merged_probes[key].before or "") + (probe.before or "")
                if merged_probes[key].before or probe.before
                else None
            )
            merged_probes[key].after = (
                (merged_probes[key].after or "") + (probe.after or "")
                if merged_probes[key].after or probe.after
                else None
            )
    probes = list(merged_probes.values())

    dumped = dump(probes, maps, regs, callback)

    # propagate dynamic-capacity maps to a top-level flag read by cli.py
    for map_ in maps:
        if map_.cap == "dynamic":
            dumped["dynamic"] = True

    return dumped

if __name__ == "__main__": # A small test case
    import sys
    import toml
    mode, source = sys.argv[1], sys.argv[2]
    source = open(source, "r").read()
    asm_probes = compile(mode, source)
    print(toml.dumps(asm_probes))
"""Parse and flatten Python Tracing DSL"""
from neutrino.language import TYPES # neutrino/language/__init__.py
import ast
from typing import Optional
from dataclasses import dataclass
from neutrino.common import Register, Probe, Map

# AST node types a probe body may contain; generic_visit rejects the rest
allowed_nodes = {
    ast.Import,     # Imported Stuff
    ast.Module,     # the greatest start
    ast.Name,       # Name of Variable
    ast.Assign,     # Assign Value
    ast.AugAssign,  # +=
    ast.UnaryOp,    # Unary Op, only negative
    ast.BinOp,      # Binary Op, +-*/
    ast.Call,       # Call function
    ast.Attribute,  # Access Attribute of Namespace
    ast.Constant,   # Constant Value
    ast.Expr,       # Single Expression
}

binary_ops = {
    ast.Add: "add",
    ast.Sub: "sub",
    ast.Mult: "mul",
    ast.Div: "div"
}

unary_ops = {
    ast.USub: "neg"
}


class NeutrinoVisitor(ast.NodeVisitor):
    """Lowers a probe function body (Python AST) into flat list-based IR.

    Each IR instruction is a list like ["add", dst, lhs, rhs]; virtual
    registers are named NR0, NR1, ... in allocation order.
    """

    def __init__(self, nl_name: str, regs: list[str], maps: list[str]):
        super().__init__()
        self.nl_name = nl_name  # local alias of neutrino.language in user code
        self.reg_counter = -1 # make it R0
        self.ir: list[tuple] = []
        # pre-allocate a virtual register for every declared DSL register
        self.reg_map: dict[str, str] = {reg: self.fresh_name() for reg in regs}
        self.maps = maps
        # initialize and visit tree

    def fresh_name(self):
        """Allocate the next virtual register name (NR0, NR1, ...)."""
        self.reg_counter += 1
        return f"NR{self.reg_counter}"

    def visit_Assign(self, node): # Lowered to mov
        # we shall check if the target has a known name
        name = self.reg_map[node.targets[0].id]
        if isinstance(node.value, ast.Attribute):
            self.ir.append(["mov", name, self.visit(node.value)])
        else:
            new_name = self.visit(node.value) # this is the temporary name
            # rewrite the freshly-allocated temporary to the assignment target
            # everywhere in the emitted IR, then release the temporary index
            for inst in self.ir:
                for idx in range(len(inst)):
                    if inst[idx] == new_name:
                        inst[idx] = name
            self.reg_counter -= 1

    def process_operand(self, operand) -> str:
        """Resolve an operand AST node to a register name / constant value."""
        if isinstance(operand, ast.Name):
            return self.reg_map[operand.id]
        elif isinstance(operand, ast.Constant):
            return operand.value
        elif isinstance(operand, (ast.Attribute, ast.Call, ast.BinOp, ast.UnaryOp)):
            return self.visit(operand)
        else:
            raise ValueError

    def visit_BinOp(self, node): # Lowered to add/sub/mul/div/lsh
        lhs = self.process_operand(node.left)
        rhs = self.process_operand(node.right)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", new_name, lhs, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", new_name, lhs, rhs])
        else:
            raise NotImplementedError()
        self.reg_map[new_name] = new_name
        return new_name

    def visit_AugAssign(self, node):
        # in-place ops reuse the target register as both dst and lhs
        rhs = self.process_operand(node.value)
        name = self.reg_map[node.target.id]
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", name, name, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", name, name, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", name, name, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", name, name, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", name, name, rhs])
        else:
            raise NotImplementedError()
        return name

    def visit_UnaryOp(self, node):
        value = self.process_operand(node.operand)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.USub):
            self.ir.append(["neg", new_name, value])
        else:
            raise NotImplementedError()
        return new_name

    def visit_Call(self, node):
        """Lower DSL intrinsic calls (cuid/time/clock/save) to IR."""
        func_name = self.visit(node.func)
        if func_name == "cuid":
            new_name = self.fresh_name()
            self.ir.append(["cuid", new_name])
            self.reg_map[new_name] = new_name
            return new_name
        elif func_name == "time":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["time", new_name])
            return new_name
        elif func_name == "clock":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["clock", new_name])
            return new_name
        elif func_name == "save":
            # map.save(...) — emit a SAVE record into the named map
            map_name = node.func.value.id
            if map_name not in self.maps:
                raise ValueError(f"Map {map_name} not found, known maps: {self.maps}")
            regs = []
            for arg in node.args:
                if isinstance(arg, ast.Name):
                    regs.append(self.reg_map[arg.id])
                elif isinstance(arg, ast.Attribute):
                    regs.append(self.visit_Attribute(arg))
                else:
                    regs.append(self.reg_map[self.visit(arg)])
            self.ir.append(["SAVE", map_name] + regs)
        else:
            raise NotImplementedError()

    def visit_Name(self, node):
        return node.id

    def visit_Attribute(self, node):
        # attributes may only refer to the neutrino.language alias or a map
        if node.value.id == self.nl_name or node.value.id in self.maps:
            # BUG FIX: "in4" was missing although neutrino.language declares
            # `in4` and the PTX backend recognizes IN4
            if node.attr in ("bytes", "addr", "out", "in1", "in2", "in3", "in4"):
                return node.attr.upper()
            return node.attr
        else:
            raise ValueError(f"can only refer to neutrino.language semantic but got {node.value.id}")

    def visit_Constant(self, node):
        return node

    def generic_visit(self, node):
        if type(node) not in allowed_nodes:
            raise NotImplementedError(f"{type(node).__name__} (lineno: {node.lineno})")
        super().generic_visit(node)


def parse(code: str) -> tuple[int, list[Probe], list[Map], str]:
    """Parse DSL source into (num_regs, probes, maps, callback).

    Scans the module top level for: the neutrino.language import alias,
    a CALLBACK string, annotated register declarations, @Map classes and
    @probe functions (whose bodies are lowered by NeutrinoVisitor).
    BUG FIX: the return annotation previously claimed list[Register] for the
    first element, but an int register count is returned.
    """
    tree = ast.parse(code)
    nl_name: Optional[str] = None # name of neutrino.language in the code
    regs: list[Register] = []
    num_regs: int = 0
    probes: list[Probe] = []
    callback: str = "" # not used yet, but we can use it later
    maps: list[Map] = [] # not used yet, but we can use it later

    for node in tree.body:
        if type(node) is ast.Import and node.names[0].name == "neutrino.language":
            nl_name = node.names[0].asname
        elif type(node) is ast.Assign and node.targets[0].id == "CALLBACK":
            if isinstance(node.value, ast.Constant):
                callback = node.value.value
            else:
                raise ValueError("CALLBACK must be a string constant")
        # ROBUSTNESS: isinstance guard added — a plain annotation such as
        # `x: int` is an ast.Name with no .value/.attr and used to crash here
        elif type(node) is ast.AnnAssign and isinstance(node.annotation, ast.Attribute):
            if node.annotation.value.id == nl_name and node.annotation.attr in TYPES:
                regs.append(Register(node.target.id, node.annotation.attr, node.value.value))
        elif type(node) is ast.ClassDef and node.decorator_list:
            name = node.name # take class name as map name
            decorator = node.decorator_list[0]
            if decorator.func.id == "Map":
                level, type_, size, cap, contents = None, None, 0, 1, []
                for keyword in decorator.keywords:
                    if keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "type": type_ = keyword.value.value
                    elif keyword.arg == "size": size = keyword.value.value
                    elif keyword.arg == "cap": cap = keyword.value.value
                if size % 8 != 0:
                    raise ValueError("size must be multiple of 8 to avoid misaligned address")
                if not level or not type_:
                    raise ValueError("level and type must be specified")
                if not isinstance(cap, int) and cap != "dynamic":
                    raise ValueError("cap must be an integer or 'dynamic'")
                # BUG FIX: the inner loop used `for node in node.body`,
                # shadowing the outer loop variable; renamed to `member`
                for member in node.body:
                    if type(member) is ast.AnnAssign and isinstance(member.annotation, ast.Attribute):
                        if member.annotation.value.id == nl_name and member.annotation.attr in TYPES:
                            contents.append(Register(member.target.id, member.annotation.attr, None))
                    else:
                        raise ValueError(f"Map {name} must only contain AnnAssign nodes")
                # widest dtype first so records stay naturally aligned
                ordered = sorted(contents, key=lambda reg: reg.dtype, reverse=True)
                if ordered != contents:
                    print("[warn] map contents reordered")
                # create a map object
                maps.append(Map(name=name, level=level, type=type_, size=size, cap=cap, regs=ordered))
        elif type(node) is ast.FunctionDef and node.decorator_list:
            name = node.name # take func name as probe name
            decorator = node.decorator_list[0]
            if decorator.func.id == "probe":
                pos, level, before = None, None, False
                for keyword in decorator.keywords:
                    if keyword.arg == "pos": pos = keyword.value.value
                    elif keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "before": before = keyword.value.value
                if not pos or not level: raise ValueError("pos and level must be specified")
                # lower the probe body as if it were an independent module
                visitor = NeutrinoVisitor(nl_name=nl_name, regs=[reg.name for reg in regs], maps=[map.name for map in maps])
                visitor.visit(ast.Module(body=node.body)) # Take it as independent code
                probe = Probe(name=name, pos=pos, level=level)
                if before:
                    probe.before = visitor.ir
                else:
                    probe.after = visitor.ir
                probes.append(probe)
                # NOTE(review): reg_counter is the highest allocated index and
                # already counts the declared regs; `+ len(regs)` below may
                # over-count — confirm intended register budget
                num_regs = max(num_regs, visitor.reg_counter)

    return num_regs + len(regs), probes, maps, callback

# A Simple Test Case, not really used in production
if __name__ == "__main__":
    import sys

    code = open(sys.argv[1], "r").read()

    regs, probes, maps, callback = parse(code)

    print(regs)
    print(probes)
    print(maps)
    print(callback)
+""" \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py new file mode 100644 index 0000000..051f194 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py @@ -0,0 +1,75 @@ +"""Generate the CUDA PTX Assembly, a C-style asm""" + +from neutrino.common import Register, Probe, Map + +def filter_keyword(reg: str) -> str: + if reg in {"ADDR", "BYTES", "OUT", "IN1", "IN2", "IN3", "IN4"}: + return reg + elif isinstance(reg, int): + return reg + else: + return "%" + reg + +def cvt_inst(inst: list[str]) -> str: + match inst[0]: + # ALU Instructions + case "add": + return f"add.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "sub": + return f"sub.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "mul": + return f"mul.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "div": + return f"div.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "mod": + return f"rem.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "lsh": + return f"shl.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + case "rsh": + return f"shr.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};" + # Memory Instructions + case "SAVE": + contents = (filter_keyword(reg) for reg in inst[2:]) + contents = ", ".join(contents) + return f"SAVE [ {inst[1]} ] {{ { contents } }}" # just return everything + # Other Instructions + case "mov": + return f"mov.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])};" + case "clock": + return f"mov.u64 {filter_keyword(inst[1])}, %clock64;" + case "time": + return f"mov.u64 
{filter_keyword(inst[1])}, %globaltimer;" + case "cuid": + return f"""{{ + .reg .b32 %tmp; + mov.u32 %tmp, %smid; + cvt.u64.u32 {filter_keyword(inst[1])}, %tmp; + }}""" + case _: + raise NotImplementedError(f"{inst} not yet supported") + +def gencode(probes: list[Probe]) -> list[Probe]: + # First handle the initialization of regs + + # Then handle the syntax of probes + for probe in probes: + # only change the instructions, i.e., before and after part + if probe.before is not None: + insts: list[str] = [] + for inst in probe.before: + insts.append(cvt_inst(inst)) + probe.before = "\n".join(insts) + elif probe.after is not None: + insts: list[str] = [] + for inst in probe.after: + insts.append(cvt_inst(inst)) + probe.after = "\n".join(insts) + + return probes + +if __name__ == "__main__": + probes = [ + Probe(name='thread_start', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R0']]), + Probe(name='thread_end', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R2'], ['sub', 'R1', 'R2', 'R0'], ['cuid', 'R3'], ['SAVE', 'block_sched', 'R0', 'R1', 'R3']]) + ] + print(gencode(probes)) diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py new file mode 100644 index 0000000..4c385f2 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py @@ -0,0 +1,50 @@ +"""Neutrino Probing Engine Protocol + +NOTE just a protocol for developers, don't import / export""" + +from dataclasses import dataclass +from neutrino.common import Register, Probe, Map + +__all__ = ["Ref", "load_probes", "TRACE_READING_CODE_PY"] + +@dataclass +class Ref: + """Reference for replacement""" + line: str # Original line + probe: str # Probe name for matchine + before_after: bool # True if before and False if after -> to distinguish which snippet is used + + +@dataclass 
class KernelParam:
    # one formal kernel parameter: C-level dtype string and parameter name
    dtype: str
    name: str


# NOTE following is just protocol, please implement yours, developers can
# extend other functions for their need, just keep following implemented

def get_arch() -> str:
    """get architecture for assembler"""
    ...

def dump(workdir: str, name: str, suffix: str) -> str:
    """call objdump to extract assembly from binary"""
    ...

def prune(ptx: str, entry_name: str):
    """Prune Assembly to specific entry_name"""
    ...

def probing(asm: str, probes: list[Probe]):
    """Probe the probes into asm"""
    ...

def assemble(workdir: str, name: str):
    """call assembler to turn assembly to machine code"""
    ...

def write_kernel_info(name: str, params, probe_mem_sizes: list[int],
                      workdir: str, analyze_hook: str = "", file_name: str = "kernel.info"):
    """write kernel info for hook driver to read back"""
    ...
"""Neutrino Probing Engine, CUDA Implementation"""

import os
import sys
import shutil
import subprocess
import traceback # used for print backtrace to log file instead of stdout
import toml # to load probes from environment variables
from neutrino.common import Register, Probe, Map, load
from neutrino.probe import Ref, KernelParam

# this module is invoked as a subprocess: argv[1] is the working directory
workdir = sys.argv[1] # directory contains original.bin
log = open(os.path.join(workdir, "process.log"), 'w')

# a macro like terms
SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 }

# TODO move it to global variable or configurable
def get_arch() -> str:
    """Query the compute arch of GPU 0 via nvidia-smi, e.g. 'sm_89'.

    NOTE(review): the original comment stated auto-detection sometimes fails,
    suggesting the value be hard-coded manually in that case — confirm.
    """
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
        stdout=subprocess.PIPE,
        text=True)
    # sm_version like `8.9`
    sm_version = result.stdout.split("\n")[0].strip()
    major, minor = sm_version.split(".")
    return f"sm_{major}{minor}"

def dump(workdir: str, name: str = "original", suffix: str = ".bin") -> str:
    """Extract PTX from cuda binaries (cubin or fatbin) via cuobjdump

    NOTE accept three kind of binary:
    1. fatbin @see https://docs.nvidia.com/cuda/nvfatbin/index.html
    2. cubin @see https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html
    3. PTX text file - no need to further process, just rename it to .ptx

    Writes <workdir>/<name>.ptx as a side effect and returns the PTX text.
    Raises ValueError when no PTX can be recovered from the binary.
    """
    bin_path = os.path.join(workdir, name) + suffix
    # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
    result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
    out = result.stdout
    if "ASCII text" in result.stdout: # raw PTX file, just read it all
        shutil.copyfile(bin_path, os.path.join(workdir, name) + ".ptx")
        print("[objdump] bin is ptx", file=log)
        with open(os.path.join(workdir, name) + ".ptx", "r") as outf:
            return outf.read()
    # then try cuobjdump -ptx flag
    result = subprocess.run(
        ['cuobjdump', '-ptx', bin_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True)
    out = result.stdout
    if len(result.stderr) > 0:
        print(result.stderr, file=log)
    if out.find(".version") != -1:
        start = out.index(".version") # ptx valid part starts with .version
        with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
            outf.write(out[start:])
        print("[objdump] via cuobjdump -ptx", file=log)
        return out[start:]
    else:
        # finally try cuobjdump -elf to dump elf content and check .nv_debug_ptx_txt
        result = subprocess.run(['cuobjdump', '-elf', bin_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if len(result.stderr) > 0:
            print(result.stderr, file=log)
        sections = result.stdout.split(".section ") # don't remove the space
        for section in sections:
            if section.startswith(".nv_debug_ptx_txt"): # PTX Info
                # write to the original.ptx
                start = section.index(".version")
                with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
                    outf.write(section[start:])
                print("[objdump] via cuobjdump -elf", file=log)
                return section[start:]
        # if still not found
        raise ValueError("PTX Not Found in CUBIN")

def prune(ptx: str, entry_name: str) -> tuple[str, str, str, str]:
    """ a minimum parser to truncate the ptx for specific entry

    Use this function to locate a specific entry with entry_name.
    as Single PTX objdumped usually have > 1 entry, (try cuobjdump -ptx libcublas.so)

    Returns (global_section, func_section, entry_section, entry_name) — the
    last element may be a shortened entry_name when fuzzy matching was used.

    NOTE verified on PTX from NVCC GCC Backend and LLVM PTX Backend
    """
    # first try to find entry_name and raise error if entry_name not found
    # .visible .entry and .entry corresponds to CUDA __global__
    length = len(ptx)
    # NOTE fix PyTorch problem: mangled names may be truncated in the binary,
    # so fall back to fuzzy matching on progressively shorter prefixes
    entry_loc = ptx.find(f".visible .entry {entry_name}")
    if entry_loc == -1:
        entry_loc = ptx.find(f".entry {entry_name}") # try raw .entry instead
        if entry_loc == -1:
            found = False
            for i in range(1, 30):
                # ignore last few character for fuzzy finding
                entry_loc = ptx.find(f".entry {entry_name[:-i]}") # try raw .entry instead
                # print(entry_loc, entry_name[:-i])
                if entry_loc != -1: # if find!
                    entry_name = entry_name[:-i]
                    found = True
                    break
            if not found:
                raise ValueError(f"{entry_name} not found")

    # parse the first global section - shall be included for global info
    # global info are complicated and not involved in later processing, just keep them
    start_visible_entry = ptx.find(".visible .entry") if ptx.find(".visible .entry") != -1 else length
    start_entry = ptx.find(".entry") if ptx.find(".entry") != -1 else length
    start = min(start_visible_entry, start_entry)
    # include global_section functions like __assertfail and definitions such as gmems
    global_section = ptx[:start]

    # parse the .func which corresponds to CUDA __device__, might be used by entries
    start_func = start
    func_sections = []
    # tbh only very little code arrives here, so not much overhead
    while start_func != -1:
        start_func = ptx.find(".func", start_func)
        if start_func == -1:
            break
        # function entry could be complicated, just keep them
        pos = ptx.find("{", start_func) + 1
        ket_count = 1
        # NOTE(review): unlike the entry loop below, the initial next_bra here
        # is not clamped when find returns -1 — confirm malformed/brace-less
        # tails cannot reach this loop
        next_bra = ptx.find("{", pos)
        next_ket = ptx.find("}", pos)
        # now parse end: brace matching, advancing to whichever bracket is nearer
        while ket_count > 0:
            if next_bra < next_ket:
                pos = next_bra + 1
                next_bra = ptx.find("{", pos) # find next {
                if next_bra == -1: # not found, set to max := ptx length
                    next_bra = length
                ket_count += 1
            else: # next_ket < next_bra, := close a bra
                pos = next_ket + 1
                next_ket = ptx.find("}", pos) # find next }
                if next_ket == -1: # not found, set to max := ptx length
                    next_ket = length
                ket_count -= 1
        # now pos is end_body
        func_sections.append(ptx[start_func:pos])
        start_func = pos #
    func_section = "\n".join(func_sections)

    # given entry_loc try to parse the whole body (same brace matching)
    pos = ptx.find("{", entry_loc) + 1
    ket_count = 1 # one } miss
    next_bra = ptx.find("{", pos)
    if next_bra == -1:
        next_bra = length
    next_ket = ptx.find("}", pos)
    while ket_count > 0:
        if next_bra < next_ket:
            pos = next_bra + 1
            next_bra = ptx.find("{", pos)
            if next_bra == -1: # not found, set to max := ptx length
                next_bra = length
            ket_count += 1
        elif next_bra > next_ket: # next_ket < next_bra, := close a bra
            pos = next_ket + 1 # not found, set to max := ptx length
            next_ket = ptx.find("}", pos)
            if next_ket == -1:
                next_ket = length
            ket_count -= 1

    entry_section = ptx[entry_loc:pos]

    return global_section, func_section, entry_section, entry_name

"""
NOTE: templates for thread-constant datamodel buffer calculation
These part shall be placed ONCE at the beginning of every kernel function definition
if there's any thread-constant probes

Most registers below is duplicate and will be optimized by PTXAS
TODO Optimize calculation for 1D/2D Indexing (many kernel don't use 3D Indexing
"""
COMMON_BUFFER_CALC = """// begin buffer calculation
.reg .b32 %buf<20>; // b32 reg to record access, will be optimized by ptxas
mov.u32 %buf2, %tid.x; // threadIdx.x
mov.u32 %buf3, %tid.y; // threadIdx.y
mov.u32 %buf4, %tid.z; // threadIdx.z
mov.u32 %buf5, %ntid.x; // blockDim.x
mov.u32 %buf6, %ntid.y; // blockDim.y
mov.u32 %buf7, %ntid.z; // blockDim.z
mov.u32 %buf8, %ctaid.x; // blockIdx.x
mov.u32 %buf9, %ctaid.y; // blockIdx.y
mov.u32 %buf10, %ctaid.z; // blockIdx.z
mov.u32 %buf11, %nctaid.x; // gridDim.x
mov.u32 %buf12, %nctaid.y; // gridDim.y
mad.lo.s32 %buf13, %buf6, %buf4, %buf3; // blockDim.y * threadIdx.z + threadIdx.y
mad.lo.s32 %buf15, %buf13, %buf5, %buf2; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x
mad.lo.s32 %buf16, %buf12, %buf10, %buf9; // gridDim.y * blockIdx.z + blockIdx.y
mad.lo.s32 %buf17, %buf16, %buf11, %buf8; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x
mul.lo.s32 %buf18, %buf5, %buf6; // blockDim.x * blockDim.y
mul.lo.s32 %buf19, %buf18, %buf7; // blockSize = blockDim.x * blockDim.y * blockDim.z
mad.lo.s32 %buf1, %buf17, %buf19, %buf15; // buf_idx = block_idx * blockSize + thread_idx
// end buffer calculation"""
+""" +NOTE templates for warp-constant datamodel buffer calculation +These part shall be placed ONCE at the beginning of every kernel function definition +if there's any warp-constant probes + +Most registers below is duplicate and will be optimized by PTXAS +""" +WARP_BUFFER_CALC = """// begin buffer calculation +.reg .b32 %warpbuf<21>; // b32 reg to record access, will be optimized by ptxas +.reg .pred %leader; // predicate register +.reg .pred %joint_pred; // used to store AND result of %leader and instruction operand +mov.u32 %warpbuf2, %laneid; // read lane id +setp.eq.u32 %leader, %warpbuf2, 0; // check if thread is warp leader +@%leader mov.u32 %warpbuf3, %nwarpid; // warpDim := number of warp in current group +@%leader mov.u32 %warpbuf4, %tid.x; // threadIdx.x +@%leader mov.u32 %warpbuf5, %tid.y; // threadIdx.y +@%leader mov.u32 %warpbuf6, %tid.z; // threadIdx.z +@%leader mov.u32 %warpbuf7, %ntid.x; // blockDim.x +@%leader mov.u32 %warpbuf8, %ntid.y; // blockDim.y +@%leader mov.u32 %warpbuf18, %ntid.z; // blockDim.z +@%leader mov.u32 %warpbuf9, %ctaid.x; // blockIdx.x +@%leader mov.u32 %warpbuf10, %ctaid.y; // blockIdx.y +@%leader mov.u32 %warpbuf11, %ctaid.z; // blockIdx.z +@%leader mov.u32 %warpbuf12, %nctaid.x; // gridDim.x +@%leader mov.u32 %warpbuf13, %nctaid.y; // gridDim.y +@%leader mad.lo.s32 %warpbuf14, %warpbuf8, %warpbuf6, %warpbuf5; // blockDim.y * threadIdx.z + threadIdx.y +@%leader mad.lo.s32 %warpbuf15, %warpbuf14, %warpbuf7, %warpbuf4; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x +@%leader div.s32 %warpbuf15, %warpbuf15, 32; // get persistent warpid instead of dynamic %warpid +@%leader mad.lo.s32 %warpbuf16, %warpbuf13, %warpbuf11, %warpbuf10; // gridDim.y * blockIdx.z + blockIdx.y +@%leader mad.lo.s32 %warpbuf17, %warpbuf16, %warpbuf12, %warpbuf9; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x +@%leader mul.lo.s32 %warpbuf19, %warpbuf7, %warpbuf8; +@%leader mul.lo.s32 
%warpbuf20, %warpbuf19, %warpbuf18; +@%leader div.s32 %warpbuf20, %warpbuf20, 32; +@%leader mad.lo.s32 %warpbuf1, %warpbuf17, %warpbuf20, %warpbuf15; // buf_idx = block_idx * warpSize + warpIdx +// end buffer calculation""" + +# NOTE buffer location for thread-local buffers, every probe has independent this part +THREAD_PROBE_BUFFER = """// begin {name} map +.reg .b64 %map_{name}<5>; // register group defn +mul.wide.s32 %map_{name}4, %buf1, {no_bytes}; // get buffer location, no_bytes is per thread +ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space +cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space +add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address +// end {name} map""" + +# NOTE buffer of the dynamic stuffs +THREAD_PROBE_DYNAMIC_BUFFER = """// begin {name} dynamic map +.reg .b64 %map_{name}<5>; // register group defn +.reg .b32 %cnt_{name}; // The dynamic count of buffer size +ld.param.u32 %cnt_{name}, [bytes_{name}]; // load sizes from .param state spaces +mul.wide.s32 %map_{name}4, %buf1, %cnt_{name}; // get buffer location, no_bytes is per thread +ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space +cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space +add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address +// end {name} dynamic map""" + +# NOTE buffer location for warp-local buffers, every probe has independent this part +WARP_PROBE_BUFFER = """// begin {name} map +.reg .b64 %map_{name}<5>; // register group defn +@%leader mul.wide.s32 %map_{name}4, %warpbuf1, {no_bytes}; // get buffer location, no_bytes is per thread +@%leader ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space +@%leader cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space +@%leader add.s64 
%map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address +// end {name} map""" + +# NOTE for every probe with datamodel not none +# only support .u64 and recommend use 16 bytes alignment, minimum is 8 bytes +PROBE_PARAM = ".param .u64 param_{name}" +COUNT_PARAM = ".param .u32 bytes_{name}" + +# NOTE This is a special probe applied if dynamic = True, to be filled with count_inst and count_size +COUNT_PROBE = """regs = 1 +[map.Count] +level = "thread" +type = "array" +size = 8 +cap = 1 + +[map.Count.regs] +count = [ "u64", "None", ] + +[probe.Save] +pos = "kernel" +level = "thread" +before = "mov.u64 %NR0, 0;" +after = "SAVE [ Count ] {{ %NR0 }};" + +[probe.Count] +pos = "{count_inst}" +level = "thread" +before = "add.u64 %NR0, %NR0, {count_size};" +""" + +def probing(asm: str, probes: list[Probe], maps: list[Map], regs: int) -> tuple[str, list[int]]: + """Process the probes, the core function of probing engine""" + + # NOTE parse interesting locations + # A mapping from location to probes, a probe can hook at multiple location + positions: dict[str, list[Probe]] = dict() + kernel_start_probes: list[Probe] = [] + # NOTE turn kernel:end into ret:start for better matching + for probe in probes: + # different position split by ;, and inside split by : for start/end + for position in probe.pos: + if position == "kernel": # turn into listening instructions + if probe.after is not None: + if "ret;" in positions: + positions["ret;"].append(probe) + else: + positions["ret;"] = [probe, ] + if probe.before is not None: + kernel_start_probes.append(probe) + else: + if position in positions: + positions[position].append(probe) + else: + positions[position] = [probe, ] + + # NOTE parse PTX Assembly + ptx_lines = asm.split("\n") # let's do it line by line + # first extract basic kernel signature + entry_found: bool = False # line of .entry or .visible .entry + entry_last_line : int = 0 # last line of entry, marked by () + param_end_line : int = 0 # 
last line of param declaration, for probe params + body_start_line : int = 0 # first line of body + idx = 0 + while idx < len(ptx_lines): + line = ptx_lines[idx] + if not entry_found and ".entry" in line: # entry not yet found + entry_found = True + if entry_found: # now entry is found + # first check if the entry has been closed + if ")" in line and entry_last_line == 0: + entry_last_line = idx + # if entry is closed, time for body!, another if as ) { can in one line + if body_start_line == 0 and "{" in line and entry_last_line >= 0: + body_start_line = idx + # if not yet reach the entry, then line with .param is param declaration + if ".param" in line and entry_last_line == 0: + param_end_line = idx + # here pattern matching positions TODO optimize performance here + else: + for position, probes in positions.items(): + if position in line: # BUG might mismatch parameter with confused naming + # NOTE we got a match, then every probe will insert snippet before or after the line + # this might cause idx fluctuatting if we use idx to process it + line_idx = idx # a copy to fix the insertion position + for probe in probes: + # specially handle ret;, we need to place it before ret or it won't be executed + if position == "ret;" and probe.after is not None: + ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False)) + idx += 1 + line_idx += 1 + else: + if probe.before is not None: + ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True)) + idx += 1 + line_idx += 1 + if probe.after is not None: + ptx_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False)) + idx += 1 + idx += 1 + + # Now add the probes to PTX Assembly + offset: int = 0 # adding every line need to offset 1 to make it correct + # First let's add parameters + ptx_lines[param_end_line] = ptx_lines[param_end_line] + "," # add , to indicate more param + # NOTE parameter layouts: Parameters are pointers to buffer, or buffer size + # We arange buffer 
pointers linearly in advance (u64), and later size (u32) + params_added: list[str] = [] + count_params: list[str] = [] # NOTE used for dynamic counts only + + # NOTE save the map_sizes so Hook Driver has a way to allocate the map memory + # we must make sure this is aligned with the order of parameter or will be illegal access + map_sizes: list[tuple[str, int]] = [] # + + levels: set[str] = set() + for map_ in maps: + if map_.cap != "dynamic": + map_sizes.append((map_.level, map_.cap * map_.size)) + params_added.append(PROBE_PARAM.format(name=map_.name)) + levels.add(map_.level) + else: + map_sizes.append((map_.level, -1)) + params_added.append(PROBE_PARAM.format(name=map_.name)) + count_params.append(COUNT_PARAM.format(name=map_.name)) + levels.add(map_.level) + + # else just ignore + params_added = params_added + count_params # formulate the layout + ptx_lines.insert(param_end_line + 1, ",\n".join(params_added)) + offset += 1 # in total one line is added + # Now add the probe with kernel:start -> this shall not dump anything I think + ptx_lines.insert(body_start_line + offset + 1, f".reg .u64 %NR<{regs + 1}>;") + offset += 1 + for probe in kernel_start_probes: + # NOTE kernel:start probe has no helpers and have no predicate + ptx_lines.insert(body_start_line + offset + 1, probe.before) # None is checked before + offset += 1 + # Now add the common buffer calculation + if "thread" in levels: + ptx_lines.insert(body_start_line + offset + 1, COMMON_BUFFER_CALC) + offset += 1 + if "warp" in levels: + ptx_lines.insert(body_start_line + offset + 1, WARP_BUFFER_CALC) + offset += 1 + # Now add the individual buffer calculation + # NOTE add the handle of u32 registers + for map_ in maps: + num_u32 = 0 + for reg in map_.regs: + if reg.dtype == "u32": + num_u32 += 1 + if map_.level == "thread": + if map_.cap != "dynamic": + buffer_asm = THREAD_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size)) + if num_u32 > 0: + buffer_asm = buffer_asm + f"\n.reg .u32 
%{map_.name}_u32_<{num_u32 + 1}>;" + ptx_lines.insert(body_start_line + offset + 1, buffer_asm) + offset += 1 + else: + buffer_asm = THREAD_PROBE_DYNAMIC_BUFFER.format(name=map_.name) + if num_u32 > 0: + buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;" + ptx_lines.insert(body_start_line + offset + 1, buffer_asm) + offset += 1 + elif map_.level == "warp": + if map_.cap != "dynamic": + buffer_asm = WARP_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size)) + if num_u32 > 0: + buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;" + ptx_lines.insert(body_start_line + offset + 1, buffer_asm) + offset += 1 + else: + raise NotImplementedError() + else: + raise NotImplementedError() + + name_to_map: dict[str, Map] = {m.name: m for m in maps} + # Now add the instruction listenings + for idx in range(len(ptx_lines)): + # ignore most of line that is a string! + if type(ptx_lines[idx]) == Ref: # NOTE isinstance is slow? + line: str = ptx_lines[idx].line + probe: Probe = ptx_lines[idx].probe + before_after: str = ptx_lines[idx].before_after + # parse instruction operands, operands are separated by space fundamentally + tmp = line[:line.index(";")].split(",") + operands: list[str] = [] + # NOTE handling vectorized operands with { and } + merges = [] + merging: bool = False + for operand in tmp: + if "{" in operand and not "}" in operand: + merging = True + merges.append(operand) + elif "}" in operand and not "{" in operand: + merges.append(operand) # FIX, now operand is the last one and shall be included + operands.append(",".join(merges).strip("{} ")) # we don't want {} remains + merges = [] # flush merges + merging = False # reset status + else: + operands.append(operand) if not merging else merges.append(operand) + # first operand also have pred, inst and the real first operand + remaining = operands[0].strip() if len(operands) > 0 else print(line, tmp, operands, merges) + # handle predicate -> used in final 
insertion + if "@" in remaining: + pred = remaining[:remaining.index(" ") + 1] # include the space! + remaining = remaining[remaining.index(" ") + 1:].strip() + else: + pred = "" + # TODO assert matching instruction + mem_bytes: str = None + out: str = None + if remaining.find(" ") != -1: + inst = remaining[:remaining.index(" ")] + # NOTE a helper to calculate bytes, ld and st's bytes are inferred not from operand + # but the instruction body (likewise ld.global.v2.u64) + if "ld" in inst or "st" in inst: + vec = 1 + if "v2" in inst or "x2" in inst: + vec = 2 + elif "v4" in inst or "x4" in inst: + vec = 4 + # most dtypes are u32, no worries + dtypes = ["u32", "u64", "b16", "u16", "u8", "f32", "f64", "b128", "s32", "s64", "s16", "s8", "b32", "b64", "b8"] + for dtype in dtypes: + if dtype in inst: + mem_bytes = str(vec * int(dtype[1:]) // 8) # 8 "= size" + break + out = remaining[remaining.index(" "):].strip() + if "[" in out: + # NOTE handle [ addr ] used to indicate the memory address + out = out[out.index("[") + 1 : out.index("]")] if out is not None and "[" in out else out # fix + in1: str = operands[1] if len(operands) >= 2 else None + # NOTE handle [ addr ] used to indicate the memory address + in1 = in1[in1.index("[") + 1 : in1.index("]")] if in1 is not None and "[" in in1 else in1 + in2 = operands[2] if len(operands) >= 3 else None + in3 = operands[3] if len(operands) >= 4 else None + # TODO handle some weird syntax like + 0 used meaninglessly to locate correct places + # Currently only a minimal solution + if out is not None and "+" in out: out = out[:out.find("+")] + if in1 is not None and "+" in in1: in1 = in1[:in1.find("+")] + # print(line, out, in1, in2, sep=" / ") + # now handles operand helpers by directly replacing the value + snippet = probe.before if before_after else probe.after + snippet = snippet.replace("OUT", out) if "OUT" in snippet else snippet + snippet = snippet.replace("IN1", in1) if "IN1" in snippet else snippet + snippet = 
snippet.replace("IN2", in2) if "IN2" in snippet else snippet + snippet = snippet.replace("IN3", in3) if "IN3" in snippet else snippet + # NOTE add a new helper named ADDR referencing gmem address + if "ADDR" in snippet: + if "ld" in operands[0] or "cp.async" in operands[0]: + snippet = snippet.replace("ADDR", in1) + elif "st" in operands[0]: # st has + snippet = snippet.replace("ADDR", out) + if mem_bytes is not None: + snippet = snippet.replace("BYTES", mem_bytes) if "BYTES" in snippet else snippet + # now handles STORE helpers + snippet_lines = snippet.split("\n") + # NOTE special arrangements for warp datamodel + org_pred = pred + if probe.level == "warp": + if pred == "": + pred = "@%leader " # apply filter that only leader works + else: + pred = "@%joint_pred " # will be updated %leader AND pred + + for snippet_line_idx in range(len(snippet_lines)): + snippet_line: str = snippet_lines[snippet_line_idx] + if "SAVE" in snippet_line: # only one save, at the begin of line + # SAVE [ block_sched ] { R0, R1, R3 } + map_ = name_to_map[snippet_line[snippet_line.index("[") + 1: snippet_line.index("]")].strip()] + # dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8] + items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",") + assert len(items) == len(map_.regs), f"{map_.name}.save not follow definition" + cvt_lines, u64s, u32s = [], [], [] + u32_idx = 1 # NOTE must be 1 due to some syntax issue I think + for item, reg in zip(items, map_.regs): + if reg.dtype == "u64": + u64s.append(item) + elif reg.dtype == "u32": # apply conversion + cvt_lines.append(f"{pred}cvt.u32.u64 %{map_.name}_u32_{u32_idx}, {item};\n") + u32s.append(f"%{map_.name}_u32_{u32_idx}") + u32_idx += 1 + else: + raise NotImplementedError + assert len(u32s) % 2 == 0, "Must save 2n u32 registers for memory alignment, please promote to u64" + save_lines = [] + for item_idx in range(len(u64s) // 2): + save_lines.append(f"{pred}st.global.v2.u64 
[%map_{map_.name}1], {{ {u64s[item_idx * 2]}, {u64s[item_idx * 2 + 1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;") + if len(u64s) % 2 != 0: # odd number -> one item left! + save_lines.append(f"{pred}st.global.u64 [%map_{map_.name}1], {u64s[-1]};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;") + for item_idx in range(len(u32s) // 4): + save_lines.append(f"{pred}st.global.v4.u32 [%map_{map_.name}1], {{ {u32s[item_idx * 4]}, {u32s[item_idx * 4 + 1]}, {u32s[item_idx * 4 + 2]}, {u32s[item_idx * 4 + 3]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;") + if len(u32s) % 4 != 0: # two items left... + save_lines.append(f"{pred}st.global.v2.u32 [%map_{map_.name}1], {{ {u32s[-2]}, {u32s[-1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;") + snippet_lines[snippet_line_idx] = "\n".join(cvt_lines) + "\n".join(save_lines) + else: + # or just add the pred! + # NOTE handling warp that having double buffer + if ".reg" not in snippet_line and snippet_line.strip() not in ("{", "}"): + snippet_lines[snippet_line_idx] = pred + snippet_line + else: + snippet_lines[snippet_line_idx] = snippet_line + if probe.level == "warp" and org_pred != "": + snippet_lines.insert(0, f"and.pred %tmp, %leader, {org_pred[1:]}; // joint prediction") # ignore the '@' signal at first + snippet = "\n".join(snippet_lines) + # finally replace the Ref with snippet to finish the probing! 
+ ptx_lines[idx] = snippet + + # Finally finished.1 + return "\n".join(ptx_lines) + +def assemble(workdir: str, name: str) -> None: + """compile the ptx into cubin via ptxas + NOTE: ptxas command like `ptxas -arch=sm_80 --verbose -m64 "original.ptx" -o "original.cubin"` + * This is not actually need for running because CUDA Driver cuModuleLoad can load PTX (JIT), + * But is useful for checking as ptxas --verbose can give more info for debugging + """ + ptx_path = os.path.join(workdir, name) + ".ptx" + bin_path = os.path.join(workdir, name) + ".bin" # target binary + command = ["ptxas", f'-arch={get_arch()}', '-m64', "--verbose", ptx_path, '-o', bin_path] + print(" ".join(command), file=log) + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True + ) + # print debug and verbose information to the process.log + if len(result.stderr) > 0: + print(result.stderr.decode("utf-8"), file=log) + if len(result.stdout) > 0: + print(result.stdout.decode("utf-8"), file=log) + +def parse_params(ptx: str) -> tuple[list[KernelParam], str]: + """parse kernel function parameters + @see https://docs.nvidia.com/cuda/parallel-thread-execution/#kernel-function-parameters + + NOTE this is because cuLaunchKernel receive void** as kernelParam and one can not infer + the valid no.params from void** (NVIDIA driver also use similar parsing for that) + """ + start = ptx.find("(") + name_start = ptx.rfind(" ", 0, start) + end = ptx.find(")", start) + ptx_lines = ptx[start + 1 : end].split("\n") + param_lines: list[str] = [] + params: list[KernelParam] = [] + + for line in ptx_lines: + if ".param" in line: + param_lines.append(line.strip(" ,")) + for param_line in param_lines: + tmp = param_line.split(" ") + dtype = tmp[1][1:] # .s32 .u64 ... 
+ name = tmp[-1] + params.append(KernelParam(dtype, name)) + return params, ptx[name_start + 1:start] # + 1 := ignore space + + +def write_kernel_info(name: str, params: list[KernelParam], map_sizes: list[int], + workdir: str, file_name: str = "kernel.info"): + """write kernel info to workdir/file_name""" + # TODO add support for vectorized items + with open(os.path.join(workdir, file_name), "w") as f: + # print kernel name + print(name, file=f) + # number of parameters, for parsing void** kernelParams + print(len(params), file=f) + # number of probes with memory + print(len(map_sizes), file=f) + # size of each memory section + for probe_type, size in map_sizes: + print(f"{SUPPORTED_DATAMODEL[probe_type]},{size}", file=f) + # # NOTE: print the hook here, resolve relative path + # if callback != "" and not callback.startswith("/"): + # callback = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tools", callback) + # print(callback, file=f) + # # NOTE: following are referencing stuff not really used by hook driver + # for param in params: + # print(f"{param.name},{param.dtype}", file=f) + +# ENTRY for this tool +if __name__ == "__main__": + # no argparse as the CLI is straightforward + workdir = sys.argv[1] # directory contains original.bin + kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore') # for possible case with multiple entry in one binary + + probes: dict + if len(sys.argv) > 3: # NOTE to facilitate debugging, not used in production + probe_path = sys.argv[3] + probes = toml.load(probe_path) + print(probes) + else: # the path in production + # parse the environment variable to read the probes + probe_envvar = os.environ.get("NEUTRINO_PROBES") + if probe_envvar is None: + raise ValueError("Can not read probes from envaraible 'NEUTRINO_PROBES'") + # load it via toml + probes = toml.loads(probe_envvar) + + # # filter out, probes are nested dict in TOML via [name] + # probes: Dict[str, dict] = dict() + # callback = 
probe_toml["CALLBACK"] if "callback" in probe_toml else "" + # for key, value in probe_toml.items(): + # if isinstance(value, dict): + # probes[key] = value + + # parse the environment variable for filtered out kernel, this is for + # 1. Some buggy kernels caused system fails -> many GPU error is not recoverable + # 2. Some uninterested kernels such as vectorized_elementwise for PyTorch + filter_out = os.environ.get("NEUTRINO_FILTER", "") + filter_out = filter_out.split(":") if len(filter_out) > 0 else None + print(filter_out, file=log) + + filter_in = os.environ.get("NEUTRINO_KERNEL", "") + filter_in = filter_in.split(":") if len(filter_in) > 0 else None + print(filter_in, file=log) + + # NOTE check if some probe is defined as dynamic, if so, we need to add a counter + # for these probes in different arangements + dynamic = bool(os.environ.get("NEUTRINO_DYNAMIC", 0)) + + try: + # first objdump binary to ptx + ptx = dump(workdir) + # then truncate ptx for entry_name + global_section, func_section, entry_section, _ = prune(ptx, kernel_name) + # split and process ptx lines and write kernel info + params, kernel_name = parse_params(entry_section) + + # basic logging + print(kernel_name, file=log) + if filter_in: + matched = False + for tmp in filter_in: + if tmp in kernel_name: + matched = True + if not matched: + print(f"{kernel_name} is not in {filter_in}", file=log) + exit(1) + if filter_out: + for tmp in filter_out: + if tmp != "" and tmp in kernel_name: + print(f"{kernel_name} filtered out from {filter_out}", file=log) + exit(1) + + # write pruned ptx to file + pruned_ptx = global_section + "\n" + func_section + "\n" + entry_section + with open(os.path.join(workdir, "pruned.ptx"), "w") as f: + f.write(pruned_ptx) + + # convert probes from Python Dict to data structure + probes, maps, regs = load(probes) + + # NOTE generate the trace reading code + if dynamic: + # First check the probe with size is dynamic, aka size = -1 + count_inst = [] + count_size = 0 + 
count_map = "" + for map_ in maps: + if map_.cap == "dynamic": + count_size = map_.size + count_map = map_.name + break + for probe in probes: + # NOTE there might be a bug that before and after can only save once + if probe.before is not None: + if "SAVE" in probe.before and count_map in probe.before: + count_inst = count_inst + probe.pos + if probe.after is not None: + if "SAVE" in probe.after and count_map in probe.after: + count_inst = count_inst + probe.pos + assert len(count_inst) > 0 + count_inst = ":".join(count_inst) + count_probe = COUNT_PROBE.format(count_inst = count_inst, count_size = count_size) + count_probe, count_map, count_reg = load(toml.loads(count_probe)) + count_ptx = probing(entry_section, count_probe, count_map, count_reg) + count_ptx = global_section + "\n" + func_section + "\n" + count_ptx + with open(os.path.join(workdir, "countd.ptx"), "w") as f: + f.write(count_ptx) + + map_sizes: list[tuple[str, int]] = [] + for map_ in maps: + if isinstance(map_.cap, int): + map_sizes.append((map_.level, map_.size * map_.cap)) + elif map_.cap == "dynamic": + map_sizes.append((map_.level, -1)) + else: + raise NotImplementedError + + # process ptx lines + probed_ptx = probing(entry_section, probes, maps, regs) + + # merge global and func back + probed_ptx = global_section + "\n" + func_section + "\n" + probed_ptx + + # write ptx to file + with open(os.path.join(workdir, "probed.ptx"), "w") as f: + f.write(probed_ptx) + + # params = parse_params(ptx_lines) + write_kernel_info(kernel_name, params, map_sizes, workdir) + + # compile ptx to binary, we want both probed and pruned + assemble(workdir, "probed") + assemble(workdir, "pruned") + if dynamic: + assemble(workdir, "countd") + + except Exception as e: + traceback.print_exc(file=log) + exit(1) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py 
b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py new file mode 100644 index 0000000..39bd02c --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py @@ -0,0 +1,486 @@ +"""Neutrino Probing Engine, AMD ROCm HIP Implementation + +HIP's AMDGCN (.amdgcn) has only one source: LLVM AMDGPU Backend +CITE https://llvm.org/docs/AMDGPUUsage.html + +GCNAsm is similar to x86 assembly, kernel mainly has two parts (two .text): +1. GCNAsm Code in form of .section .text +2. AMD HSA Configuration or said `.amdgpu_metadata` + +NOTE At the moment, we only support part of syntax because there's no official + documentation on syntax / semantics of GCN Assembly (only instructions). + +Important GCNAsm syntax for Neutrino developers: +1. s[0:1] holds pointer to kernargs, use s_load_dword to ld.param +2. v0 (32bit) holds 00 + threadIdx.z(30-20)+threadIDx.y(20-10)+threadIdx.x(10-0) +3. blockIdx.xyz is the next 3 registers after first three, gridDim.xyz is ttmp8/9/10 +4. It seems blockDim can not be retrieved easily from special registers, only via + +It's worth noticed that v0 and s[0:1] can be changed by developers, i.e., their +value (threadIdx.xyz) is only available at kernel begins. + +NOTE Why not fully support? +Because we can not find AMD GPUs for testing or debugging (we are not AMD). +There's nearly no cloud providers for AMD GPUs (only MI300x on runpods.io). + +And part of AMD's ISA is ridiculous, for example, until CDNA3, add two u64 +is finally supported on VGPRs, still not supported on SGPRs. Why? Tell me why? 
+""" + +from typing import List, Tuple, Optional, Dict, Set +import os +import sys +import shutil +import subprocess +import traceback # usef for print backtrace to log file instead of stdout +import toml # to load probes from envariables +import yaml # AMD GCN ASM use YAML as METADATA Storage +from dataclasses import dataclass +from neutrino.common import Register, Probe, Map, load +from neutrino.probe import Ref, KernelParam + +workdir = sys.argv[1] # directory contains original.bin +log = open(os.path.join(workdir, "process.log"), 'w') + +# a macro like terms +SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 } + +# NOTE applicable to CDNA GPUs but might not be applicable to GDNA GPUs +# TODO change to amdgpu_metadata['amdhsa.kernels'][0]['.wavefront_size'] +WARP_SIZE = 64 + +@dataclass +class KernelParam: # NOTE GCNASM has different defn + value_kind: str + size: int + +# NOTE it's risky but safe as this is a CLI tool invoked for specific kernel +amdgpu_metadata: Dict = None + +# TODO finalize rocm-smi toolchain +def get_arch() -> str: + """At the moment, we extract target arch from the assembly, but not sure + if this may leads to misleading arch for codegen, will see""" + ... 

# TODO finalize llvm-objdump toolchain
def extract(workdir: str, name: str = "original", suffix: str = ".bin") -> Optional[str]:
    """Load <workdir>/<name><suffix> and return its text if it is already assembly.

    Uses file(1) to detect ASCII text (i.e. a raw, NUL-terminated ASM dump);
    in that case the file is copied to <name>.asm and its content returned.
    NOTE(review): for a non-text binary this falls through and returns None —
    the llvm-objdump decompile path is still a TODO (see comment above).
    """
    bin_path = os.path.join(workdir, name) + suffix
    # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
    result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
    out = result.stdout  # NOTE(review): unused local, kept as-is
    if "ASCII text" in result.stdout: # raw PTX file, just read it all
        shutil.copyfile(bin_path, os.path.join(workdir, name) + ".asm")
        print("[decompile] bin is gcnasm", file=log)
        with open(os.path.join(workdir, name) + ".asm", "r") as outf:
            return outf.read()

# TODO add prune support
def prune(asm: str, entry_name: str) -> Tuple[str, str]:
    """A minimum parser to truncate the GCN ASM down to a specific entry.

    Use this function to locate a specific entry with entry_name, as a
    single .asm / .s usually has more than one entry!

    Side effect: populates the module-level `amdgpu_metadata` with the YAML
    metadata of the selected kernel (its 'amdhsa.kernels' list is reduced to
    the single matching entry).

    Returns:
        (entry_section, last_section): the ASM text of the matched entry
        (with the .amdgcn_target line guaranteed present) and the trailing
        metadata section of the file.
    """
    # First find the single-line .text section markers
    lines = asm.split("\n")
    sections = []
    target = None # assembler target like .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
    for idx in range(len(lines)):
        if ".text" in lines[idx]:
            sections.append(idx) # record the sections
        elif ".amdgcn_target" in lines[idx]:
            target = lines[idx]
    # reorganize sections: keep only section starts that contain a @function
    kernels = []
    # NOTE(review): `- 2` skips the pair (sections[-2], sections[-1]) — if the
    # intent is "all pairs except the trailing metadata section", `- 1` would
    # be expected; suspected off-by-one, confirm against a multi-kernel .s
    for idx in range(len(sections) - 2): # last section holds gcnasm
        if "@function" in "\n".join(lines[sections[idx] : sections[idx + 1]]):
            kernels.append(sections[idx])
    kernels.append(sections[-1])
    # Now locate the entry
    # TODO add rough matching!!!
    entry_section = None
    # NOTE(review): same suspected off-by-one as above (`- 2` vs `- 1`):
    # the last kernel before the sentinel sections[-1] is never searched
    for idx in range(len(kernels) - 2):
        temp = "\n".join(lines[kernels[idx] : kernels[idx + 1]])
        if entry_name in temp:
            entry_section = temp
    assert entry_section is not None, "Fail to find"
    # add the target line if it is not already part of the entry section
    if target not in entry_section:
        entry_section = entry_section.split("\n")
        # NOTE(review): no "\n" between entry_section[0] and target — the two
        # end up on one line; confirm the assembler tolerates this
        entry_section = entry_section[0] + target + "\n".join(entry_section[1:])
    # fix the metadata section
    last_section = "\n".join(lines[sections[-1]:])
    assert ".amdgpu_metadata" in last_section
    # 16 == len(".amdgpu_metadata"): slice out the YAML between the directives
    metadata = last_section[last_section.index(".amdgpu_metadata") + 16: last_section.index(".end_amdgpu_metadata") - 1] # BUG -1 is a fix
    global amdgpu_metadata
    amdgpu_metadata = yaml.safe_load(metadata)
    for kernelmeta in amdgpu_metadata['amdhsa.kernels']:
        if kernelmeta['.name'] == entry_name:
            amdgpu_metadata['amdhsa.kernels'] = [kernelmeta, ] # only want this one
            break
    return entry_section, last_section

def parse_params() -> Tuple[List[KernelParam], str]:
    """Read kernel name and parameter list from the (pruned) amdgpu_metadata.

    Must be called after prune() has populated `amdgpu_metadata`.
    """
    global amdgpu_metadata
    kernel_name = amdgpu_metadata['amdhsa.kernels'][0]['.name']
    params: List[KernelParam] = []
    for arg in amdgpu_metadata['amdhsa.kernels'][0]['.args']:
        params.append(KernelParam(arg['.value_kind'], arg['.size']))
    return params, kernel_name

"""
NOTE: templates for thread-constant datamodel buffer calculation
These part shall be placed ONCE at the beginning of every kernel function definition
if there's any thread-constant probes

Most registers below is duplicate and will be optimized by AMD Assembler
"""

THREAD_BUFFER_COMMON = """;;# begin buffer calculation
V_MOV_B32 v{thread_buff}, v0 ;;# v0 holds threadIdx.x, don't know what's threadIdx.y, threadIdx.z
;;# end buffer calculation"""

WARP_BUFFER_COMMON = """;;# begin buffer calculation
V_LSHRREV_B32_E32 v{warp_buff}, 6, v0;; # shift 6 bits := // 64
;;# end buffer calculation"""

# NOTE buffer location for thread-local buffers, every probe has independent this part
THREAD_BUFFER = """;;# begin {name} buffer
S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
V_MAD_I64_I32 v[{param_addr}], v{thread_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
;;# end {name} buffer"""

# BUG it shall be possible to move everything into sgpr, but I don't know how to locate
# warpIdx in solely SGPR, please help me
WARP_BUFFER = """;;# begin {name} buffer
S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
V_MAD_I64_I32 v[{param_addr}], v{warp_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
;;# end {name} buffer"""

def probing(asm: str, probes: List[Probe]) -> Tuple[str, List[Tuple[str, int]]]:
    """Probing the assembly — the core of the probing engine.

    Inserts probe snippets around matched instructions, allocates fresh
    SGPR/VGPR registers and kernel arguments for the probe buffers, and
    patches the .amdhsa_* directives and amdgpu_metadata to match.

    NOTE we assume probe is parsed and (security checked)

    Returns:
        (probed_asm, probe_mem_sizes) where probe_mem_sizes is a list of
        (datamodel, bytes-per-thread-or-warp) pairs, one per unique probe.
    """

    # NOTE parse interesting locations:
    # a mapping from location to probes; a probe can hook at multiple locations
    positions: Dict[str, List[Probe]] = dict()
    kernel_start_probes: List[Probe] = []
    # NOTE turn kernel:end into ret:start for better matching
    print(probes)
    for probe in probes:
        # different position split by ;, and inside split by : for start/end
        for position in probe.position:
            if position == "kernel": # turn into listening instructions
                if probe.after is not None:
                    if "s_endpgm" in positions: # AMD use s_endpgm to terminate
                        positions["s_endpgm"].append(probe)
                    else:
                        positions["s_endpgm"] = [probe, ]
                if probe.before is not None:
                    kernel_start_probes.append(probe)
            else:
                if position in positions:
                    positions[position].append(probe)
                else:
                    positions[position] = [probe, ]

    # NOTE parse GCN Assembly
    gcn_lines = asm.split("\n") # let's do it line by line
    # first extract basic kernel signature
    body_start_line : int = 0 # first line of body
    idx = 0
    # NOTE specially handle kernel start probe

    while idx < len(gcn_lines):
        line = gcn_lines[idx]
        # First try to find "; %bb.0:" — NOTE just the behavior of hipcc, not standard syntax,
        # but we don't know what's the standard syntax, there's no documentation about this...
        # maybe this is the reason why AMD product is hard to use?
        if "%bb.0" in line:
            body_start_line = idx
            # BUG move it to the real beginning before loading and saving ?
            for probe in kernel_start_probes:
                gcn_lines.insert(idx + 1, Ref(line=line, probe=probe, before_after=True)) # place after
                idx += 1
        # here pattern matching positions TODO optimize performance here
        else:
            # NOTE(review): the loop variable `probes` shadows the function
            # parameter `probes`; after this while-loop the name refers to the
            # last matched position's probe list — the later `for probe in
            # probes:` passes below may therefore not see all probes. Confirm.
            for position, probes in positions.items():
                if position in line: # BUG might mismatch parameter with confused naming
                    # NOTE we got a match, then every probe will insert snippet before or after the line
                    # this might cause idx fluctuating if we use idx to process it
                    line_idx = idx # a copy to fix the insertion position
                    for probe in probes:
                        # specially handle ret; we need to place it before ret or it won't be executed
                        if position == "s_endpgm" and probe.after is not None:
                            gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False))
                            idx += 1
                            line_idx += 1
                        else:
                            if probe.before is not None:
                                gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True))
                                idx += 1
                                line_idx += 1
                            if probe.after is not None:
                                gcn_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False))
                                idx += 1
        idx += 1

    # work with register spaces. NOTE AMD GCN Asm doesn't have a declaration syntax
    # for registers, just a flat v[0:1]; we need to manage them manually.
    # GCN Asm has two register spaces:
    # 1. VGPR (v0), holding thread-specific values
    # 2. SGPR (s0), holding warp-specific values
    # TODO we can optimize warp probes to SGPR only, avoiding VGPR usage

    # Now add the probes to the assembly
    offset: int = 0 # every inserted line shifts subsequent insert positions by 1
    probe_mem_sizes: List[Tuple[str, int]] = [] # (datamodel, bytes) per unique probe
    # TODO parse these from meta
    global amdgpu_metadata
    # NOTE here the sgpr number is wrong, there'll be 6 more, I don't know why, I can only
    # record it at the moment.
    # BUG SGPR number from metadata doesn't match the actual usage. Always
    # 6 more is used. I don't know why but let's keep it.
    sgpr_all = amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count']
    sgpr = 0
    for idx in range(len(gcn_lines)):
        # `is str` guard: gcn_lines now also holds Ref placeholders
        if type(gcn_lines[idx]) is str and ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
            sgpr = int(gcn_lines[idx].strip().split()[1])
    sgpr_diff = sgpr_all - sgpr
    vgpr = amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] # first free VGPR for new allocations
    param_off = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size']
    param_align = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_align']
    param_off = ((param_off + param_align - 1) // param_align ) * param_align # round up
    params = []
    thread_buff_vgpr, warp_buff_vgpr = None, None # conform Python scope

    processed: Set[str] = set() # avoid processing the same probe twice, which leads to errors
    datamodels: Set[str] = set()
    for probe in probes:
        if probe.name not in processed and probe.datamodel is not None:
            probe_mem_sizes.append((probe.datamodel, int(probe.cap) * int(probe.no_bytes)))
            processed.add(probe.name)
            datamodels.add(probe.datamodel)

    # emit the shared buffer-index calculation once per datamodel in use
    if "thread" in datamodels:
        thread_buff_vgpr = f"{vgpr}"
        gcn_lines.insert(body_start_line + offset + 1, THREAD_BUFFER_COMMON.format(thread_buff=thread_buff_vgpr))
        offset += 1
        vgpr += 1
    if "warp" in datamodels:
        warp_buff_vgpr = f"{vgpr}"
        gcn_lines.insert(body_start_line + offset + 1, WARP_BUFFER_COMMON.format(warp_buff=warp_buff_vgpr))
        offset += 1
        vgpr += 1

    # Now add the individual buffer calculation (one block per unique probe)
    processed = set()
    for probe in probes:
        if probe.name not in processed:
            if probe.datamodel == "thread":
                no_bytes = str(int(probe.cap) * int(probe.no_bytes))
                gcn_lines.insert(body_start_line + offset + 1,
                    THREAD_BUFFER.format(name=probe.name, no_bytes=no_bytes,
                        param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
                        thread_buff=thread_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
                probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
                offset += 1
                sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
                vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
                params.append({'.address_space': 'global', '.size': 8,
                    '.offset': param_off, '.value_kind': 'global_buffer'})
                param_off += 8 # only pass in pointers so 8bytes := 64bits
            elif probe.datamodel == "warp":
                no_bytes = str(int(probe.cap) * int(probe.no_bytes))
                gcn_lines.insert(body_start_line + offset + 1,
                    WARP_BUFFER.format(name=probe.name, no_bytes=no_bytes,
                        param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
                        warp_buff=warp_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
                probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
                offset += 1
                sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
                vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
                params.append({'.address_space': 'global', '.size': 8,
                    '.offset': param_off, '.value_kind': 'global_buffer'})
                param_off += 8 # only pass in pointers so 8bytes := 64bits
            # allocate concrete registers for the probe's declared virtual registers:
            # warp probes get SGPRs, thread probes get VGPRs
            for reg in probe.registers:
                if probe.registers[reg] == "b32":
                    if probe.datamodel == "warp":
                        probe.registers[reg] = f"s{sgpr}"
                        sgpr += 1
                    elif probe.datamodel == "thread":
                        probe.registers[reg] = f"v{vgpr}"
                        vgpr += 1
                elif probe.registers[reg] == "b64":
                    if probe.datamodel == "warp":
                        probe.registers[reg] = f"s[{sgpr}:{sgpr+1}]"
                        sgpr += 2
                    elif probe.datamodel == "thread":
                        probe.registers[reg] = f"v[{vgpr}:{vgpr+1}]"
                        vgpr += 2
            processed.add(probe.name)
    # all rest is treated as no saving

    # Now replace every Ref placeholder with the concrete probe snippet
    for idx in range(len(gcn_lines)):
        # ignore most lines — they are plain strings!
        if type(gcn_lines[idx]) == Ref: # NOTE isinstance is slow?
            line: str = gcn_lines[idx].line.strip()
            probe: Probe = gcn_lines[idx].probe
            before_after: bool = gcn_lines[idx].before_after
            # parse instruction operands; operands are separated by comma
            if ";" in line: line = line[:line.find(";")]
            tmp = line.split(",")
            operands: List[str] = []
            inst, op1 = tmp[0].split(" ")[0], tmp[0].split(" ")[-1] # mnemonic and first operand
            operands.append(op1)
            for t in tmp[1:]:
                operands.append(t.strip().split(" ")[0])
            snippet = probe.before if before_after else probe.after
            # substitute the matched instruction's operands into the snippet
            # NOTE(review): assumes the instruction has enough operands for
            # every placeholder the snippet uses — IndexError otherwise
            if "OUT" in snippet: snippet = snippet.replace("OUT", operands[0])
            if "IN1" in snippet: snippet = snippet.replace("IN1", operands[1])
            if "IN2" in snippet: snippet = snippet.replace("IN2", operands[2])
            if "IN3" in snippet: snippet = snippet.replace("IN3", operands[3])

            # Adding support for SAVE.u64 statement
            # NOTE for reading the probe afterwards
            snippet_lines = snippet.split("\n")
            for snippet_line_idx in range(len(snippet_lines)):
                snippet_line: str = snippet_lines[snippet_line_idx]
                if "SAVE" in snippet_line: # only one save, at the begin of line
                    save_lines = [] # start a new string
                    items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",")
                    # dtype is the 3 chars after "SAVE.", e.g. "u64" in "SAVE.u64"
                    dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8]
                    if dtype == "u64":
                        for item_idx in range(len(items)):
                            item_val = probe.registers[items[item_idx].strip()]
                            save_lines.append(f"\tGLOBAL_STORE_DWORDX2 v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 8")
                    elif dtype == "u32":
                        for item_idx in range(len(items)):
                            item_val = probe.registers[items[item_idx].strip()]
                            save_lines.append(f"\tGLOBAL_STORE_DWORD v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 4")
                    else:
                        raise ValueError("Only Support Saving u32 / u64")
                    snippet_lines[snippet_line_idx] = "\n".join(save_lines)
            snippet = "\n".join(snippet_lines)
            # finally rewrite the probe's virtual register names to the
            # concrete registers allocated above
            for reg in probe.registers:
                if reg in snippet:
                    snippet = snippet.replace(reg, probe.registers[reg])
            # Finally replace the placeholder with the rendered snippet
            gcn_lines[idx] = snippet

    # NOTE we need to modify the number of registers used in the meta section:
    # 1. Modify the kernarg_size  .amdhsa_kernarg_size 28
    # 2. Modify the SGPRs used    .amdhsa_next_free_sgpr 12
    # 3. Modify the VGPRs used    .amdhsa_next_free_vgpr 9
    # Something might need .amdhsa_user_sgpr_count 2
    for idx in range(len(gcn_lines)):
        if ".amdhsa_kernarg_size" in gcn_lines[idx]:
            gcn_lines[idx] = f"\t.amdhsa_kernarg_size {param_off}"
        elif ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
            gcn_lines[idx] = f"\t.amdhsa_next_free_sgpr {sgpr}"
        elif ".amdhsa_next_free_vgpr" in gcn_lines[idx]:
            gcn_lines[idx] = f"\t.amdhsa_next_free_vgpr {vgpr}"

    # NOTE also modify the amdgpu_metadata to reflect the new register/arg usage
    amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count'] = sgpr + sgpr_diff
    amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] = vgpr
    amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size'] = param_off
    amdgpu_metadata["amdhsa.kernels"][0]['.args'] += params
    # Finally finished, we might need to finalize the metadata
    return "\n".join(gcn_lines), probe_mem_sizes

    # NOTE also add new parameters


def assemble(workdir: str, name: str) -> None:
    """Assemble the GCN Asm (probed.asm) into Machine Code (probed.bin)
    NOTE AMD assembler command is part of Clang LLVM like
    clang -cc1as -triple amdgcn-amd-amdhsa -filetype obj -target-cpu gfx942
    -mrelocation-model pic -v -mllvm -amdgpu-early-inline-all=true -mllvm
    -amdgpu-function-calls=false -o probed.bin probed.asm
    """
    # TODO need to locate the clang of ROCm, unlike the uniquely-named ptxas
    asm_path = os.path.join(workdir, name) + ".asm"
    bin_path = os.path.join(workdir, name) + ".bin" # target binary
    # NOTE(review): '-amdgpu-function-calls=falsep' looks like a typo for
    # '=false' (the docstring above says 'false') — confirm before shipping
    command = ["clang", '-cc1as', '-triple', 'amdgcn-amd-amdhsa', '-filetype=obj',
               f"-target-cpu={get_arch()}", '-mrelocation-model=pic', '--verbose',
               '-mllvm', '-amdgpu-early-inline-all=true',
               '-mllvm', '-amdgpu-function-calls=falsep',
               asm_path, '-o', bin_path]
    print(" ".join(command), file=log)
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True
    )
    # print debug and verbose information to the process.log
    if len(result.stderr) > 0:
        print(result.stderr.decode("utf-8"), file=log)
    if len(result.stdout) > 0:
        print(result.stdout.decode("utf-8"), file=log)

# ENTRY for this tool
if __name__ == "__main__":
    # no argparse as the CLI is straightforward
    workdir = sys.argv[1] # directory containing original.bin
    kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore') # for the possible case of multiple entries in one binary

    if len(sys.argv) > 3: # NOTE to facilitate debugging, not used in production
        probe_path = sys.argv[3]
        probe_toml = toml.load(probe_path)
    else: # the path taken in production
        # parse the environment variable to read the probes
        probe_envvar = os.environ.get("NEUTRINO_PROBES")
        if probe_envvar is None:
            raise ValueError("Can not read probes from envaraible 'NEUTRINO_PROBES'")
        # load it via toml
        probe_toml = toml.loads(probe_envvar)

    # filter out: probes are nested dicts in TOML via [name]
    probes: Dict[str, dict] = dict()
    analyze_hook = probe_toml["analyze_hook"] if "analyze_hook" in probe_toml else ""
    for key, value in probe_toml.items():
        if isinstance(value, dict):
            probes[key] = value

    # NOTE(review): safe_load_probes is not defined or imported in this file —
    # presumably it comes from neutrino.common/probe; verify at runtime
    probes = safe_load_probes(probes)
    # apply a

    try:
        # first decompile binary to asm
        asm = extract(workdir)
        # then truncate asm for entry_name
        entry_section, meta_section = prune(asm, kernel_name)

        # split and process asm lines and write kernel info
        params, kernel_name = parse_params()

        # basic logging
        print(kernel_name,
file=log)

        # write pruned gcnasm to file: splice the (pruned) metadata YAML back
        # between the .amdgpu_metadata / .end_amdgpu_metadata directives
        meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]
        pruned_ptx = entry_section + "\n" + meta_section
        with open(os.path.join(workdir, "pruned.asm"), "w") as f:
            f.write(pruned_ptx)

        probed_asm, probe_mem_sizes = probing(entry_section, probes)

        # NOTE we need to re-serialize the metadata that probing() mutated
        # TODO split into multiple lines
        meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]

        # merge global and func back
        probed_asm = probed_asm + "\n" + meta_section

        # write probed gcnasm to file
        with open(os.path.join(workdir, "probed.asm"), "w") as f:
            f.write(probed_asm)

    except Exception as e:
        # log the full backtrace to process.log instead of stdout
        traceback.print_exc(file=log)
        exit(1)
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
new file mode 100644
index 0000000..cac4710
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
@@ -0,0 +1,611 @@
/**
 * Common Definition of Neutrino Hooked Driver
 *
 * @note Keep common.h only Linux/GNU dependencies, no other platform-specifics
 */
/* NOTE(review): the header names below were stripped (likely <...> lost in an
 * HTML/extraction pass). Judging by the trailing comments and the code, they
 * are presumably stdio.h, stdlib.h, unistd.h, time.h, dlfcn.h, stdint.h,
 * stdbool.h, elf.h, sys/wait.h, sys/stat.h, pthread.h — confirm against the
 * original source before building. */
#include // for many thing
#include // for standard library
#include // for file dump
#include // for timing
#include // for loading real shared library
#include // for uint64_t defn
#include // for true false
#include // for ELF Header
#include // for waiting subprocess
#include // for directory
#include // for mutex lock
#include "uthash.h" // for hashmap
#include "sha1.h" // for SHA1 hash algorithm

/**
 * @todo change probe type to enum for better portability
 * @todo standardize trace saving, got duplicate codes in cuda.c / hip.c
 * @todo standardize JIT interaction, got duplicate codes in cuda.c / hip.c
 */

#define PROBE_TYPE_THREAD 0
#define PROBE_TYPE_WARP 1
/* NOTE(review): macro hygiene — the dividend lacks parentheses, so
 * CDIV(x, y) expands to (x + y - 1) / (y); fine for simple arguments but
 * CDIV(a & b, c) would mis-associate. Also the whole expression is
 * unparenthesized. Flagged only, left unchanged. */
#define CDIV(a,b) (a + b - 1) / (b)

static FILE* event_log; // file pointer to event_log: NEUTRINO_TRACEDIR/MM_DD_HH_MM_SS/event.event_log

/**
 * System Configuration and Setup
 */

static void* shared_lib = NULL; // handle to real cuda driver
static char* NEUTRINO_REAL_DRIVER = NULL; // path to real cuda driver, loaded by env_var NEUTRINO_REAL_DRIVER
static char* NEUTRINO_PYTHON = NULL; // path to python exe, loaded by env_var NEUTRINO_PYTHON
static char* NEUTRINO_PROBING_PY = NULL; // path to process.py, loaded by env_var NEUTRINO_PROBING_PY
static char* NEUTRINO_CALLBACK = NULL; // callback to analyze the kernel

// directory structure
static char* RESULT_DIR = NULL; // env_var NEUTRINO_TRACEDIR/MM_DD_HH_MM_SS/result
static char* KERNEL_DIR = NULL; // env_var NEUTRINO_TRACEDIR/MM_DD_HH_MM_SS/kernel

/**
 * Benchmark mode, will include an additional launch after the trace kernel
 * Used to measure the kernel-level slowdown of Neutrino, disabled by default
 * @warning might cause CUDA_ERROR with in-place kernels, coupled with --filter if encountered
 *          this intrinsic of program and can not be resolved by Neutrino
 * @note benchmark_mem is a 256MB empty memory that will be cuMemSetD32 to 0
 *       which take the L2 Cache Space and Remove Previous L2 Cache Value,
 * @cite this is inspired by Triton do_bench and Nvidia https://github.com/NVIDIA/nvbench/
 */
static int NEUTRINO_BENCHMARK = 0;
static size_t NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE = 256e6;

/**
 * A feature to measure the memory usage of probes other than really launch the
 * profiling kernel. Useful in debugging / preventing out-of-memory errors.
 */
static int NEUTRINO_MEMUSAGE = 0;

// simple auto-increasing idx to distinguish kernels of the same name
static int kernel_idx = 0;

// start time for event_logging. Neutrino traces are named as time since start
static struct timespec start;

// verbose setting -> to prevent event_log file growing too large due to unimportant entries
static int VERBOSE = 0;

// dynamic setting -> enabling it leads to a count kernel being launched to detect the dynamic part
static int DYNAMIC = 0;

// helper macro to check dlopen/dlsym error
#define CHECK_DL() do { \
    const char *dl_error = dlerror(); \
    if (dl_error) { \
        fprintf(stderr, "%s\n", dl_error); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

/**
 * Build (and create on disk) the per-process trace directory:
 * $NEUTRINO_TRACEDIR/<MonDD_HHMMSS>_<pid>, where the timestamp is the
 * *process start* time reconstructed from /proc/self/stat + /proc/uptime
 * so that all threads/forks of one run agree on the directory name.
 * @return heap-allocated path (caller frees); exits on any failure.
 */
char* get_tracedir() {
    // First read the parent directory
    char* NEUTRINO_TRACEDIR = getenv("NEUTRINO_TRACEDIR");
    if (NEUTRINO_TRACEDIR == NULL) {
        fprintf(stderr, "Environment Variable NEUTRINO_TRACEDIR not set\n");
        exit(EXIT_FAILURE);
    }
    // check and create folder structure
    // first create NEUTRINO_TRACE_DIR
    if (access(NEUTRINO_TRACEDIR, F_OK) != 0) { // not existed or bugs
        if (mkdir(NEUTRINO_TRACEDIR, 0755) != 0) {
            perror("Can not create NEUTRINO_TRACEDIR");
            exit(EXIT_FAILURE);
        }
    }

    unsigned long long start_time_jiffies;
    unsigned long uptime_seconds;

    // 1. read the 22nd value of /proc/[pid]/stat (jiffies of proc start time)
    FILE *stat_file = fopen("/proc/self/stat", "r");
    if (!stat_file) {
        perror("Failed to open /proc/[pid]/stat");
        exit(1);
    }
    // skip the first 21 fields
    // NOTE(review): fscanf("%*s") returns 0 (not EOF) on a successful skipped
    // match, so the EOF check only fires at true end-of-file — OK, but a
    // comm field containing spaces would desync the field count; confirm.
    for (int i = 0; i < 21; i++) {
        if (fscanf(stat_file, "%*s") == EOF) {
            fclose(stat_file);
            fprintf(stderr, "Invalid /proc/self/stat format\n");
            exit(1);
        }
    }
    int read_items = fscanf(stat_file, "%llu", &start_time_jiffies); // NOTE(review): read_items never checked
    fclose(stat_file);

    // 2. get system clock frequency (Hz, usually 100)
+ */ +static void common_init(void) { + // first verify NEUTRINO_PROBE is set + char* NEUTRINO_PROBES = getenv("NEUTRINO_PROBES"); + if (NEUTRINO_PROBES == NULL) { + fprintf(stderr, "[error] envariable NEUTRINO_PROBES not set\n"); + } + // get environment variables + NEUTRINO_REAL_DRIVER = getenv("NEUTRINO_REAL_DRIVER"); + if (NEUTRINO_REAL_DRIVER == NULL) { + fprintf(stderr, "[error] envariable NEUTRINO_REAL_DRIVER not set\n"); + exit(EXIT_FAILURE); + } + NEUTRINO_PYTHON = getenv("NEUTRINO_PYTHON"); + if (NEUTRINO_PYTHON == NULL) { + fprintf(stderr, "[error] envariable NEUTRINO_PYTHON not set\n"); + exit(EXIT_FAILURE); + } + NEUTRINO_PROBING_PY = getenv("NEUTRINO_PROBING_PY"); + if (NEUTRINO_PROBING_PY == NULL) { + fprintf(stderr, "[error] envariable NEUTRINO_PROBING_PY not set\n"); + exit(EXIT_FAILURE); + } + NEUTRINO_CALLBACK = getenv("NEUTRINO_CALLBACK"); + // External Feature Controls + char* dynamic = getenv("NEUTRINO_DYNAMIC"); + if (dynamic != NULL && atoi(dynamic) != 0) { + DYNAMIC = 1; + } + char* verbose = getenv("NEUTRINO_VERBOSE"); + if (verbose != NULL && atoi(verbose) != 0) { // otherwise, default is 0 + VERBOSE = 1; + } + char* benchmark = getenv("NEUTRINO_BENCHMARK"); + if (benchmark != NULL && atoi(benchmark) != 0) { + NEUTRINO_BENCHMARK = 1; + } + char* memusage = getenv("NEUTRINO_MEMUSAGE"); + if (memusage != NULL && atoi(memusage) != 0) { + NEUTRINO_MEMUSAGE = 1; + } + // generate TRACE_DIR and create if need + char* TRACE_DIR = get_tracedir(); + fprintf(stderr, "[info] trace in %s \n", TRACE_DIR); + // RESULT_DIR put metrics + RESULT_DIR = malloc(strlen(TRACE_DIR) + 8); + sprintf(RESULT_DIR, "%s/result", TRACE_DIR); + if (mkdir(RESULT_DIR, 0755) != 0) { + perror("Can not create RESULT_DIR"); + exit(EXIT_FAILURE); + } + // KERNEL_DIR is workdirs of the probe engine + KERNEL_DIR = malloc(strlen(TRACE_DIR) + 8); + sprintf(KERNEL_DIR, "%s/kernel", TRACE_DIR); + if (mkdir(KERNEL_DIR, 0755) != 0) { + perror("Can not create KERNEL_DIR"); + 
exit(EXIT_FAILURE); + } + /** + * Dump the probe.toml to TRACE_DIR/probe.toml + */ + char* TMP_PATH = malloc(strlen(TRACE_DIR) + 20); + sprintf(TMP_PATH, "%s/probe.toml", TRACE_DIR); + FILE* probes_f = fopen(TMP_PATH, "w"); + if (probes_f == NULL) { + perror("Can open probe.toml"); + exit(EXIT_FAILURE); + } + fwrite(NEUTRINO_PROBES, sizeof(char), strlen(NEUTRINO_PROBES), probes_f); + fclose(probes_f); + /** + * Dump the trace reading code to the TRACE_DIR/read.py + */ + const char* NEUTRINO_READING = getenv("NEUTRINO_READING"); + if (NEUTRINO_READING) { + sprintf(TMP_PATH, "%s/read.py", TRACE_DIR); + FILE* reading_f = fopen(TMP_PATH, "w"); + if (reading_f != NULL) { + fwrite(NEUTRINO_READING, sizeof(char), strlen(NEUTRINO_READING), reading_f); + fclose(reading_f); + } + } + /** + * Open the event.log as the operation log of hook driver + */ + sprintf(TMP_PATH, "%s/event.log", TRACE_DIR); + event_log = fopen(TMP_PATH, "a"); + if (event_log == NULL) { + perror("Can open event.log"); + exit(EXIT_FAILURE); + } + // print metadata like pid and cmdline + fprintf(event_log, "[init] pid %d\n", getpid()); // print the process id + // get command line arguments + char cmdpath[128], cmdline[1024]; + sprintf(cmdpath, "/proc/%d/cmdline", getpid()); + FILE *cmdfile = fopen(cmdpath, "r"); + size_t len = fread(cmdline, 1, sizeof(cmdline) - 1, cmdfile); + if (len > 0) { + // Replace null characters with spaces + for (int i = 0; i < len; i++) { + if (cmdline[i] == '\0') { + cmdline[i] = ' '; + } + } + } + fclose(cmdfile); + // print the command line, helpful to correlate source code + fprintf(event_log, "[init] cmd %zu %s\n", len, cmdline); + fflush(event_log); + // load real driver shared library + shared_lib = dlopen(NEUTRINO_REAL_DRIVER, RTLD_LAZY); + CHECK_DL(); + fprintf(event_log, "[init] dl %p\n", shared_lib); + fflush(event_log); + // get the starting time + clock_gettime(CLOCK_REALTIME, &start); + free(TMP_PATH); + free(TRACE_DIR); + // don't free RESULT_DIR and KERNEL_DIR, 
we will use it later +} + +/** + * Neutrino Trace Headers being dumped + * + * Similar to most binary, Neutrino trace started with a header (trace_header_t) and + * followed by an array of section (trace_section_t) for each probe, and datas. + * @todo add section table similar to ELF for faster parsing + * @todo add a placeholder for probe type + * @todo standardize saving from cuda.c/hip.c + */ +typedef struct { + // basic launch configuration + uint32_t gridDimX; + uint32_t gridDimY; + uint32_t gridDimZ; + uint32_t blockDimX; + uint32_t blockDimY; + uint32_t blockDimZ; + uint32_t sharedMemBytes; // @todo replace with WARP_SIZE + // all above from CUDA/ROCm launch configuration + uint32_t numProbes; // number of traces exposed + // followed by an array of trace_section_t +} trace_header_t; + +typedef struct { + uint32_t size; // size of record per thread/warp in byte + uint32_t warpDiv; // warpSize for warp-level, 1 for thread-level + uint64_t offset; // offset for fseek +} trace_section_t; + +/** + * GPU Code Binary Header Definitions, supporting cubin, fatbin, text(ptx/gcn asm) + * @note ELF is standard ELF and fatbin + * @todo support .hsaco + */ + +// fat binary header defined for fatbin +// @cite https://github.com/rvbelapure/gpu-virtmem/blob/master/cudaFatBinary.h +typedef struct { + unsigned int magic; // magic numbers, checked it before + unsigned int version; // fatbin version + unsigned long long int size; // fatbin size excluding +} fatBinaryHeader; + +// the fat binary wrapper header +// @see fatbinary_section.h in cuda toolkit +typedef struct { + int magic; + int version; + unsigned long long* data; // pointer to real fatbin + void *filename_or_fatbin; /* version 1: offline filename, + * version 2: array of prelinked fatoutbuf */ +} fatBinaryWrapper; + +/** + * Binary Size Calculation based on header because code are of void* + * @note Please use unified API get_managed_code_size + */ + +#define ELF 1 +#define FATBIN 2 +#define WRAPPED_FATBIN 3 
+#define PTX 4 +#define ERROR_TYPE 0 + +static const char *code_types[] = { "error", "elf", "fatbin", "warpped_fatbin", "ptx" }; + +// check if content of void *ptr is ELF format or FatBinary Format +static int check_magic(const int magic) { + if (magic == 0x464c457f || magic == 0x7f454c46) { + return ELF; + } else if (magic == 0xba55ed50 || magic == 0x50ed55ba) { + return FATBIN; + } else if (magic == 0x466243B1 || magic == 0xB1436246) { + return WRAPPED_FATBIN; + } else { + return ERROR_TYPE; + } +} + +static unsigned long long get_elf_size(const Elf64_Ehdr *header) { + // for standard executable, use section header + size_t size = header->e_shoff + header->e_shentsize * header->e_shnum; + + // for cubin, only program header can give correct size + if (header->e_phoff + header->e_phentsize * header->e_phnum > size) + size = header->e_phoff + header->e_phentsize * header->e_phnum; + + return size; +} + +static unsigned long long get_fatbin_size(const fatBinaryHeader *header) { + // size of fatbin is given by header->size and don't forget sizeof header + return header->size + sizeof(fatBinaryHeader); +} + +static int get_managed_code_size(void** managed, size_t* size, const void* bin) { + int magic, bin_type; + // check the magic number for binary type + memcpy(&magic, bin, sizeof(int)); + bin_type = check_magic(magic); + const void *code; + if (bin_type == WRAPPED_FATBIN) { + fatBinaryWrapper wrapper; + memcpy(&wrapper, bin, sizeof(wrapper)); + fatBinaryHeader header; + memcpy(&header, wrapper.data, sizeof(header)); + *size = get_fatbin_size(&header); + code = (const void*) wrapper.data; + fprintf(event_log, "[bin] type %s size %zu\n", code_types[bin_type], *size); + } else if (bin_type == FATBIN) { + fatBinaryHeader header; + memcpy(&header, bin, sizeof(header)); + *size = get_fatbin_size(&header); + code = (const void*) bin; + fprintf(event_log, "[bin] type %s size %zu\n", code_types[bin_type], *size); + } else if (bin_type == ELF) { + Elf64_Ehdr header; + 
memcpy(&header, bin, sizeof(header)); + *size = get_elf_size(&header); + code = (const void*) bin; + fprintf(event_log, "[bin] type %s size %zu\n", code_types[bin_type], *size); + } else if (bin_type == ERROR_TYPE) { + // check whether it's text file of NULL-Terminated ASM File + // ptx must start with '//' and end with '\0' + // @todo add GCN ASM here + const char* ptx = (const char*) bin; + if (ptx[0] == '/' && ptx[1] == '/') { + *size = strlen(ptx); // naturally count till '\0' + code = (const void*) bin; + bin_type = PTX; + fprintf(event_log, "[bin] type %s size %zu\n", code_types[bin_type], *size); + } else { // still unrecognize, report the bug and terminates + fprintf(event_log, "[bin] unrecognize %d\n", magic); + return -1; + } + } + // copy the image to a new managed and protected place + *managed = malloc(*size); + memcpy(*managed, code, *size); + return 0; +} + +/** + * Hash map (uthash) as Code Cache to avoid re-probing the same GPU function, include: + * 1. Binary Map for GPU code before probe, could be library, module, function + * 2. 
Function Map for probed code, including original/pruned/probed function
 * @todo binmap logics are duplicated (update_key, update_name_key), simplify them
 */

// One entry of the binary map: maps an opaque driver handle to the managed
// copy of the GPU binary it was created from.
typedef struct {
    void* key;               // could be CUlibrary, CUmodule, CUfunction or HIP equivalent
    void* code;              // the binary code (managed copy, owned by the map)
    char* name;              // name of function (NULL until the function is resolved)
    unsigned long long size; // size of bin in bytes
    UT_hash_handle hh;       // reserved by uthash
} binmap_item;

static binmap_item* binmap = NULL; // UTHash Initialization

// Add an item to the bin hashmap; takes ownership of `code`/`name` pointers.
// Serialized via the global mutex; won't raise, always returns 0.
// NOTE(review): malloc() result is not checked before use.
int binmap_set(void* key, void* code, unsigned long long size, char* name) {
    pthread_mutex_lock(&mutex);
    binmap_item* item = (binmap_item*) malloc(sizeof(binmap_item));
    item->key = key;
    item->code = code;
    item->size = size;
    item->name = name;
    HASH_ADD_PTR(binmap, key, item);
    pthread_mutex_unlock(&mutex);
    return 0;
}

// Re-key an existing entry (e.g. CUkernel -> CUfunction), moving it within
// the hash. Returns 0 on success, -1 if old_key is not present.
int binmap_update_key(void* old_key, void* new_key) {
    pthread_mutex_lock(&mutex);
    binmap_item* item;
    HASH_FIND_PTR(binmap, &old_key, item);
    if (item != NULL) {
        HASH_DEL(binmap, item);        // must remove before mutating the key field
        item->key = new_key;
        HASH_ADD_PTR(binmap, key, item);
        pthread_mutex_unlock(&mutex);
        return 0;
    } else {
        pthread_mutex_unlock(&mutex);
        return -1;
    }
}

/**
 * Update both the name and the key, favored by cuModuleGetFunction
 * and cuLibraryGetKernel, which will create new entry to hold the
 * new key and value, but underlying binary and size will be shared
 * (the old entry stays in the map). Returns -1 if old_key is absent.
 */
int binmap_update_name_key(void* old_key, void* new_key, char* name) {
    pthread_mutex_lock(&mutex);
    binmap_item* old_item;
    HASH_FIND_PTR(binmap, &old_key, old_item);
    if (old_item != NULL) {
        binmap_item* new_item = (binmap_item*) malloc(sizeof(binmap_item));
        new_item->name = name;
        new_item->key = new_key;
        new_item->size = old_item->size;
        new_item->code = old_item->code; // shared, not copied
        HASH_ADD_PTR(binmap, key, new_item);
        pthread_mutex_unlock(&mutex);
        return 0;
    } else {
        pthread_mutex_unlock(&mutex);
        return -1;
    }
}

// Look up an entry by handle; on success fills *size/*name/*code (pointers
// into the map entry, not copies) and returns 0, otherwise returns -1.
int binmap_get(void* key, size_t* size, char** name, void** code) {
    pthread_mutex_lock(&mutex);
    binmap_item* item;
    HASH_FIND_PTR(binmap, &key, item);
    if (item != NULL) {
        *size = item->size;
        *name = item->name;
        *code = item->code;
        pthread_mutex_unlock(&mutex);
        return 0;
    } else {
        pthread_mutex_unlock(&mutex);
        return -1;
    }
}

// function map items, used as JIT code cache to avoid re-compilation
typedef struct {
    void* original;   // original CUfunction/HIPfunction (hash key)
    char* name;       // name of function, if made possible, can be NULL
    int n_param;      // number of parameters, obtained from parsing
    int n_probe;      // number of probes that would dump memory
    int* probe_sizes; // sizes of probe memory, order matches
    int* probe_types; // types of probe (thread-level vs warp-level)
    bool succeed;     // specify JIT status -> if failed, always goto backup
    void* probed;     // probed CUfunction/HIPfunction
    void* pruned;     // pruned CUfunction/HIPfunction, for benchmark only
    void* countd;     // counting CUfunction/HIPfunction, for DYNAMIC=TRUE only
    UT_hash_handle hh; // reserved by uthash
} funcmap_item_t;

static funcmap_item_t* funcmap = NULL;

// Add an item to the hashmap-based code cache; takes ownership of the
// probe_sizes/probe_types arrays. Always returns 0.
int funcmap_set(void* original, char* name, int n_param, int n_probe, int* probe_sizes, int* probe_types, bool succeed, void* probed, void* pruned, void* countd) {
    pthread_mutex_lock(&mutex);
    funcmap_item_t* item = (funcmap_item_t*) malloc(sizeof(funcmap_item_t));
    item->original = original;
    item->probed = probed;
    item->pruned = pruned;
    item->countd = countd;
    item->name = name;
    item->n_param = n_param;
    item->n_probe = n_probe;
    item->probe_sizes = probe_sizes;
    item->probe_types = probe_types;
    item->succeed = succeed; // add func status -> if failed then no need to try probing again and again
    HASH_ADD_PTR(funcmap, original, item);
    pthread_mutex_unlock(&mutex);
    return 0;
}

// Get an item from the hashmap-based code cache; fills every out-param on
// success (returns 0), returns -1 when `original` has never been cached.
int funcmap_get(void* original, char** name, int* n_param, int* n_probe, int** probe_sizes, int** probe_types, bool* succeed, void** probed, void** pruned, void** countd) {
    pthread_mutex_lock(&mutex);
    funcmap_item_t* item;
    HASH_FIND_PTR(funcmap, &original, item);
    if (item != NULL) {
        *name = item->name;
        *n_param = item->n_param;
        *n_probe = item->n_probe;
        *probe_sizes = item->probe_sizes;
        *probe_types = item->probe_types;
        *succeed = item->succeed;
        *probed = item->probed;
        *pruned = item->pruned;
        *countd = item->countd;
        pthread_mutex_unlock(&mutex);
        return 0;
    } else {
        pthread_mutex_unlock(&mutex);
        return -1;
    }
}

/**
 * hash text based on sha1 algorithm, mainly to flush kernel name, because the
 * C++ template can be long and contains weird bytes (to ASCII).
 * Returns a 40-char lowercase hex digest as a heap string.
 * @note not memory safe, remember to free pointer returned
 */
char* sha1(const char* text) {
    SHA1_CTX ctx;
    sha1_init(&ctx);
    sha1_update(&ctx, text, strlen(text));
    BYTE hash[SHA1_BLOCK_SIZE];
    sha1_final(&ctx, hash);
    char* hexed = malloc(41 * sizeof(char)); // 40 hex chars + 1 for '\0'
    sprintf(hexed, "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x",
        hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7], hash[8], hash[9],
        hash[10],hash[11],hash[12],hash[13],hash[14],hash[15],hash[16],hash[17],hash[18],hash[19]);
    return hexed;
}

/**
 * File Utilities, Read File without knowing size
 * @note remember to free pointer returned
 * NOTE(review): fopen() result is not checked -- a missing file crashes on
 * fseek(NULL,...). Also plain `inline` without `static` has fragile linkage
 * semantics in C99; and for "r" (text) mode fread may legitimately return
 * fewer bytes than ftell reported on some platforms.
 */
inline void* readf(char* path, const char* mode) {
    FILE* file = fopen(path, mode);
    fseek(file, 0, SEEK_END);
    long file_size = ftell(file);
    fseek(file, 0, SEEK_SET);
    void* ptr = malloc(file_size);
    size_t read_size = fread(ptr, 1, file_size, file);
    if (read_size != file_size)
        fprintf(stderr, "read size mismatched\n");
    fclose(file);
    return ptr;
}
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/cuda.c b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/cuda.c new file mode
100644 index 0000000..fa146a5 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/cuda.c @@ -0,0 +1,788 @@ +/** + * Neutrino Hook Driver, NVIDIA CUDA Implementation +*/ + +#include "common.h" // for common headers +#include // for cuda related definition + +/** + * Undefine some symbols updated to v2. These are some historical issue with + * 32bit machine. Now NVIDIA update them to v2 for 64bit. + */ +#undef cuMemAlloc +#undef cuStreamGetCaptureInfo +#undef cuArray3DCreate +#undef cuArray3DGetDescriptor +#undef cuArrayCreate +#undef cuArrayGetDescriptor +#undef cuCtxCreate +#undef cuCtxDestroy +#undef cuCtxPopCurrent +#undef cuCtxPushCurrent +#undef cuDevicePrimaryCtxRelease +#undef cuDevicePrimaryCtxReset +#undef cuDevicePrimaryCtxSetFlags +#undef cuDeviceTotalMem +#undef cuEventDestroy +#undef cuGetProcAddress +#undef cuGraphAddKernelNode +#undef cuGraphExecKernelNodeSetParams +#undef cuGraphExecUpdate +#undef cuGraphicsResourceGetMappedPointer +#undef cuGraphicsResourceSetMapFlags +#undef cuGraphKernelNodeGetParams +#undef cuGraphKernelNodeSetParams +#undef cuIpcOpenMemHandle +#undef cuLinkAddData +#undef cuLinkAddFile +#undef cuLinkCreate +#undef cuMemAllocHost +#undef cuMemAllocPitch +#undef cuMemcpy2DAsync +#undef cuMemcpy2DUnaligned +#undef cuMemcpy2D +#undef cuMemcpy3DAsync +#undef cuMemcpy3D +#undef cuMemcpyAtoA +#undef cuMemcpyAtoD +#undef cuMemcpyAtoHAsync +#undef cuMemcpyAtoH +#undef cuMemcpyDtoA +#undef cuMemcpyDtoDAsync +#undef cuMemcpyDtoD +#undef cuMemcpyDtoHAsync +#undef cuMemcpyDtoH +#undef cuMemcpyHtoAAsync +#undef cuMemcpyHtoA +#undef cuMemcpyHtoDAsync +#undef cuMemcpyHtoD +#undef cuMemFree +#undef cuMemGetAddressRange +#undef cuMemGetInfo +#undef cuMemHostGetDevicePointer +#undef cuMemHostRegister +#undef cuMemsetD16 +#undef cuMemsetD2D16 +#undef cuMemsetD2D32 +#undef cuMemsetD2D8 +#undef cuMemsetD32 +#undef cuMemsetD8 +#undef cuModuleGetGlobal +#undef cuStreamBatchMemOp +#undef cuStreamBeginCapture 
+#undef cuStreamDestroy +#undef cuStreamWaitValue32 +#undef cuStreamWaitValue64 +#undef cuStreamWriteValue32 +#undef cuStreamWriteValue64 +#undef cuTexRefGetAddress +#undef cuTexRefSetAddress2D +#undef cuTexRefSetAddress + +#define WARP_SIZE 32 // NVIDIA GPUs use 32 for WARP_SIZE +// used by benchmark mode +static CUdeviceptr benchmark_flush_mem = 0u; // aka NULL + +// following functions are hooked for internal usage +CUresult (*real_cuModuleLoadData)(CUmodule*, const void*) = NULL; +CUresult (*real_cuModuleLoadDataEx)(CUmodule*, const void*, unsigned int, CUjit_option*, void**) = NULL; +CUresult (*real_cuModuleGetFunction)(CUfunction*, CUmodule, const char*) = NULL; +CUresult (*real_cuKernelGetFunction)(CUfunction*, CUkernel) = NULL; +CUresult (*real_cuLibraryGetKernel)(CUkernel*, CUlibrary, const char*) = NULL; +CUresult (*real_cuLibraryGetModule)(CUmodule*, CUlibrary) = NULL; +CUresult (*real_cuLibraryLoadData)(CUlibrary*, const void*, CUjit_option*, void**, unsigned int, CUlibraryOption*, void**, unsigned int) = NULL; +CUresult (*real_cuLaunchKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void**, void**) = NULL; +CUresult (*real_cuMemAlloc_v2)(CUdeviceptr*, size_t) = NULL; +CUresult (*real_cuMemFree_v2)(CUdeviceptr) = NULL; +CUresult (*real_cuModuleLoad)(CUmodule*, const char*) = NULL; +CUresult (*real_cuModuleLoadFatBinary)(CUmodule*, const void*) = NULL; +CUresult (*real_cuLaunchKernelEx)(const CUlaunchConfig*, CUfunction, void**, void**) = NULL; + +// helper macro to check cuda error +#define CUDA_CHECK(cmd) do { \ + CUresult result = cmd; \ + if (result != CUDA_SUCCESS) { \ + const char *msg; \ + real_cuGetErrorName(result, &msg); \ + printf("Neutrino fail: %s:%d '%s'\n", \ + __FILE__, __LINE__, msg); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +// include auto-generated signatures for unmodified functions +// @note signature.c will be auto generated by parse.py +#include 
"signature.c" + +/** + * Function to initialize the environment, including + * * init the cuda driver module via dlopen + * * init the file system as specified above + * * init the hashmap for binaries and CUfunction + * * init commonly used functions like real_cuModuleLoad... + * + * @note this will be called only once when any hooked driver function is called + */ +static void init(void) { + pthread_once(&mutex_is_initialized, mutex_init); + // init() is critical section to be protected + pthread_mutex_lock(&mutex); + if (shared_lib != NULL) { // then it has been initialized by another + pthread_mutex_unlock(&mutex); + return; + } + common_init(); // init common modules + // load hooked function of Neutrino + real_cuModuleLoadData = dlsym(shared_lib, "cuModuleLoadData"); + real_cuModuleLoadDataEx = dlsym(shared_lib, "cuModuleLoadDataEx"); + real_cuModuleGetFunction = dlsym(shared_lib, "cuModuleGetFunction"); + real_cuKernelGetFunction = dlsym(shared_lib, "cuKernelGetFunction"); + real_cuLibraryGetKernel = dlsym(shared_lib, "cuLibraryGetKernel"); + real_cuLibraryGetModule = dlsym(shared_lib, "cuLibraryGetModule"); + real_cuLibraryLoadData = dlsym(shared_lib, "cuLibraryLoadData"); + real_cuLaunchKernel = dlsym(shared_lib, "cuLaunchKernel"); + real_cuMemAlloc_v2 = dlsym(shared_lib, "cuMemAlloc_v2"); + real_cuMemFree_v2 = dlsym(shared_lib, "cuMemFree_v2"); + real_cuModuleLoad = dlsym(shared_lib, "cuModuleLoad"); + real_cuModuleLoadFatBinary = dlsym(shared_lib, "cuModuleLoadFatBinary"); + real_cuLaunchKernelEx = dlsym(shared_lib, "cuLaunchKernelEx"); + init_unmodified(); // init unmodified functions, defined in signature.c + CHECK_DL(); // checking if any dl error presented + // initialzie the L2 Flush Memory if benchmark is enabled + if (NEUTRINO_BENCHMARK) { + fprintf(event_log, "[benchmark] ENABLED L2 Flush Size %ld\n", NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE); + real_cuMemAlloc_v2(&benchmark_flush_mem, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE); + } + fprintf(event_log, "[init] 
success\n"); + // leaving critical section, unlock + pthread_mutex_unlock(&mutex); + return; +} + +/** + * Module Management: cuModuleXXX + * @see https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html + * + * This is to track the lowering pass from code (binary on disk) to CUfunction + */ + +// Triton use this to load data +CUresult cuModuleLoadData(CUmodule* module, const void* image) { + if (shared_lib == NULL) { init(); } + + // call the real function, after this, module will be valid + CUresult result = real_cuModuleLoadData(module, image); + + fprintf(event_log, "[mod] cuModuleLoadData %d module %p image %p \n", result, *module, image); + + void* managed; + size_t size; + if (get_managed_code_size(&managed, &size, image) != -1) { + binmap_set(*module, managed, size, NULL); // name = NULL as we don't know it now + } + + return result; +} + +CUresult cuLibraryLoadData(CUlibrary* library, const void* code, CUjit_option* jitOptions, void** jitOptionsValues, unsigned int numJitOptions, CUlibraryOption* libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions) { + if (shared_lib == NULL) { init(); } + + CUresult result = real_cuLibraryLoadData(library, code, jitOptions, jitOptionsValues, numJitOptions, libraryOptions, libraryOptionValues, numLibraryOptions); + fprintf(event_log, "[mod] cuLibraryLoadData %d lib %p code %p\n", result, *library, code); + + // update to hashmap + void* managed; + size_t size; + if (get_managed_code_size(&managed, &size, code) != -1) { + binmap_set(*library, managed, size, NULL); // name = NULL as we don't know it now + } + + return result; +} + +CUresult cuModuleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void** optionValues) { + if (shared_lib == NULL) { init(); } + + CUresult ret = real_cuModuleLoadDataEx(module, image, numOptions, options, optionValues); + + fprintf(event_log, "[mod] cuModuleLoadDataEx mod %p code %p\n", *module, image); + + void* 
managed; + size_t size; + if (get_managed_code_size(&managed, &size, image) != -1) { + binmap_set(*module, managed, size, NULL); // name = NULL as we don't know it now + } + + return ret; +} + +// JAX use this API, but they don't pass in fatbin but cubin, so a wrong API to use... +CUresult cuModuleLoadFatBinary(CUmodule* module, const void* fatCubin) { + if (shared_lib == NULL) { init(); } + + CUresult result = real_cuModuleLoadFatBinary(module, fatCubin); // call the symbol + + fprintf(event_log, "[mod] cuModuleLoadFatBinary mod %p code %p\n", *module, fatCubin); + + void* managed; + size_t size; + if (get_managed_code_size(&managed, &size, fatCubin) != -1) { + binmap_set(*module, managed, size, NULL); // name = NULL as we don't know it now + } + return result; +} + +// @todo handle the multiple function with different name problem +CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) { + if (shared_lib == NULL) { init(); } + + // first update the name + size_t len = strlen(name); + char* managed_name = malloc(len); + memcpy(managed_name, name, len); + + // call real function + CUresult result = real_cuModuleGetFunction(hfunc, hmod, name); + + fprintf(event_log, "[mod] cuModuleGetFunction func %p mod %p name %s\n", *hfunc, hmod, name); + + // then update the key from module to function + if (binmap_update_name_key(hmod, *hfunc, managed_name) == -1) { + fprintf(event_log, "[hash] cuModuleGetFunction failed-update %p %p %s\n", hmod, *hfunc, managed_name); + } + + return result; +} + +CUresult cuKernelGetFunction(CUfunction* pFunc, CUkernel kernel) { + if (shared_lib == NULL) { init(); } + + CUresult result = real_cuKernelGetFunction(pFunc, kernel); + + fprintf(event_log, "[mod] cuKernelGetFunction %p %p\n", *pFunc, kernel); + + // then update the key from kernel to function + if (binmap_update_key(kernel, *pFunc) == -1) { + fprintf(event_log, "[hash] cuKernelGetFunction failed-update %p %p\n", kernel, *pFunc); + } + + return result; +} + 
// Hook: resolve a kernel from a library and clone the library's binmap entry
// under the CUkernel key with an owned copy of the name.
CUresult cuLibraryGetKernel(CUkernel* pKernel, CUlibrary library, const char* name) {
    if (shared_lib == NULL) { init(); }

    // first update the name
    size_t len = strlen(name);
    // NOTE(review): malloc(len)/memcpy(len) stores a string WITHOUT its '\0'
    // terminator, yet managed_name is printed with %s below and kept in the
    // binmap -- should be malloc(len + 1) copying len + 1 bytes.
    char* managed_name = malloc(len);
    memcpy(managed_name, name, len);

    CUresult result = real_cuLibraryGetKernel(pKernel, library, name);

    fprintf(event_log, "[mod] cuLibraryGetKernel kernel %p lib %p name %s\n", *pKernel, library, name);

    // then update the key from library to kernel
    if (binmap_update_name_key(library, *pKernel, managed_name) == -1) {
        fprintf(event_log, "[hash] cuLibraryGetKernel failed-update %p %p %s\n", library, *pKernel, managed_name);
    }

    return result;
}

// Hook: re-key the binmap entry from CUlibrary to the resolved CUmodule.
CUresult cuLibraryGetModule(CUmodule* pMod, CUlibrary library) {
    if (shared_lib == NULL) { init(); }

    CUresult result = real_cuLibraryGetModule(pMod, library);

    fprintf(event_log, "[mod] cuLibraryGetModule %d mod %p lib %p\n", result, *pMod, library);

    // then update the key from library to kernel
    if (binmap_update_key(library, *pMod) == -1) {
        fprintf(event_log, "[hash] cuLibraryGetModule failed-update %p %p\n", library, *pMod);
    }

    return result;
}

/**
 * Execution Control, cuLaunchXXX and cuFuncXXX
 * @see https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html
 *
 * aims at providing runtime probing support
 */

/* The core hook. On first sight of a CUfunction it dumps the original binary,
 * forks a Python probing subprocess to generate probed/pruned (and optionally
 * counting) variants, JIT-loads them, and caches everything in the funcmap.
 * Subsequent launches run the probed kernel with extra trace-buffer arguments,
 * dump the buffers to RESULT_DIR, and optionally benchmark against the pruned
 * kernel. Any failure falls back to launching the original kernel (backup:).
 * NOTE(review): start_event/end_event are created on every call and never
 * destroyed (cuEventDestroy is absent) -- events leak per launch. */
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) {
    if (shared_lib == NULL) { init(); }

    // for time measurement to understand the overhead of Neutrino
    CUevent start_event, end_event;
    CUDA_CHECK(real_cuEventCreate(&start_event, CU_EVENT_DEFAULT));
    CUDA_CHECK(real_cuEventCreate(&end_event, CU_EVENT_DEFAULT));

    CUDA_CHECK(real_cuEventRecord(start_event, hStream)); // use the stream specified in param

    float prologue_time, kernel_time, epilogue_time; // time
    CUresult result;
    CUfunction probed, pruned; // countd is only used when DYNAMIC == True
    char* kernel_name;
    int n_param, n_probe;
    int* probe_sizes; // size of probes
    int* probe_types; // type of probes
    bool succeed; // jit status
    // @note for dynamic buffer, i.e., only when DYNAMIC=true
    CUfunction countd = NULL;
    int n_count = 0, count_size = 0; // count_size is used only when DYNAMIC == True

    // try obtain the kernel compiled or raise compilation process
    // @note count and record is only valid if succeed == true
    if (funcmap_get((void*)f, &kernel_name, &n_param, &n_probe, &probe_sizes, &probe_types, &succeed, (void**)&probed, (void**)&pruned, (void**)&countd) == -1) {
        fprintf(event_log, "[exec] funcmap-not-find %p\n", f);
        fflush(event_log);
        // here try to get binary from binmap and start JIT compile
        size_t size;
        void* bin;
        if (binmap_get(f, &size, &kernel_name, &bin) == -1) { // not found the binary, fall back
            fprintf(event_log, "[probe] can't-find %p\n", f);
            funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL, NULL); // set dummy with status FALSE
            goto backup;
        } else {
            fprintf(event_log, "[probe] find %p name %s bin %p size %zu\n", f, kernel_name, bin, size);
            fflush(event_log);
            // create a directory under the kernel directory with kernel_name
            // @note Linux has limit on directory length 255, replace it to sh1 so 20 char
            // @bugfix PyTorch kernel name usually is extremely long :(
            // @bugfix Triton autotune leads to a set of kernel with same name -> use counter to differentiate
            char *tmp = sha1(kernel_name);
            char *folder_name = (char*) malloc(5 + strlen(tmp));
            sprintf(folder_name, "%d_%s", kernel_idx, tmp);
            free(tmp);
            kernel_idx++;
            fprintf(event_log, "[probe] rename %s %s\n", kernel_name, folder_name);
            char* dir = malloc(strlen(KERNEL_DIR) + strlen(folder_name) + 10);
            sprintf(dir, "%s/%s", KERNEL_DIR, folder_name);
            if (mkdir(dir, 0755) == 0) {
                fprintf(event_log, "[probe] mkdir %s\n", dir);
            } else {
                fprintf(event_log, "[probe] can't-mkdir %s\n", dir);
                funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL, NULL); // set dummy with status FALSE
                goto backup;
            }
            // create original.bin and write the binary to it
            char* path = malloc(strlen(dir) + 15);
            sprintf(path, "%s/original.bin", dir);
            FILE* original_bin = fopen(path, "wb");
            if (original_bin == NULL) {
                fprintf(event_log, "[probe] can't-open %s\n", path);
                funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL, NULL); // set dummy with status FALSE
                goto backup;
            }
            fwrite(bin, size, 1, original_bin);
            fclose(original_bin);
            fprintf(event_log, "[probe] write %s\n", path);
            // create subprocess to run process.py, be aware of multi-processing
            pid_t pid = fork();
            if (pid < 0) {
                fprintf(event_log, "[probe] can't-folk\n");
                funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL, NULL); // set dummy with status FALSE
                goto backup;
            } else if (pid == 0) { // child process, run python process.py kernel name
                // python process.py <dir> <kernel_name>
                execlp(NEUTRINO_PYTHON, NEUTRINO_PYTHON, NEUTRINO_PROBING_PY, dir, kernel_name, NULL);
                exit(EXIT_FAILURE); // reach here only if exec error -> failure
            } else { // parent process, wait for child
                fprintf(event_log, "[probe] subproc %s %s %s %s\n", NEUTRINO_PYTHON, NEUTRINO_PROBING_PY, dir, kernel_name);
                int status;
                waitpid(pid, &status, 0);
                if (status != EXIT_SUCCESS) {
                    fprintf(event_log, "[probe] python failed\n");
                    funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL, NULL); // set dummy with status FALSE
                    goto backup;
                } else {
                    fprintf(event_log, "[probe] python succeed\n");
                }
            }
            // read the kernel.info from file system
            sprintf(path, "%s/kernel.info", dir);
            char* kernel_info = readf(path, "r");
            // poor parser for kernel.info: first line is the (possibly mangled)
            // kernel name, then n_param, n_probe, then one "type,size" per probe
            // @todo add checking alignment
            char* kernel_end = strchr(kernel_info, '\n');
            *kernel_end = '\0';
            kernel_name = kernel_info;
            char* start = kernel_end + 1;
            sscanf(start, " %d\n%d\n", &n_param, &n_probe);
            // read sizes and types of probe
            probe_sizes = malloc(n_probe * sizeof(int));
            probe_types = malloc(n_probe * sizeof(int));
            char* strptr = strchr(strchr(start, '\n') + 1, '\n') + 1;
            for (int idx = 0; idx < n_probe; idx++) {
                sscanf(strptr, "%d,%d\n", &probe_types[idx], &probe_sizes[idx]);
                strptr = strchr(strptr, '\n') + 1;
            }
            // // @note read process hook, not yet checked
            // char* info_end = strchr(strptr, '\n');
            // *info_end = '\0';
            // callback = strptr;
            // here read the
            fprintf(event_log, "[probe] read %s name %s n_param %d n_probe %d \n", path, kernel_name, n_param, n_probe);
            // load probed.bin -> for collecting runtime info
            sprintf(path, "%s/probed.bin", dir);
            void* probed_bin = readf(path, "rb");
            // load pruned.bin -> for benchmark
            sprintf(path, "%s/pruned.bin", dir);
            void* pruned_bin = readf(path, "rb");
            // then load the binary to module
            CUmodule probed_mod, pruned_mod;
            // then get function with the SAME name -> we distinguish via Module
            CUDA_CHECK(real_cuModuleLoadData(&probed_mod, probed_bin));
            CUDA_CHECK(real_cuModuleGetFunction(&probed, probed_mod, kernel_name));
            CUDA_CHECK(real_cuModuleLoadData(&pruned_mod, pruned_bin));
            CUDA_CHECK(real_cuModuleGetFunction(&pruned, pruned_mod, kernel_name));
            if (DYNAMIC) {
                sprintf(path, "%s/countd.bin", dir);
                void* countd_bin = readf(path, "rb");
                CUmodule countd_mod;
                CUDA_CHECK(real_cuModuleLoadData(&countd_mod, countd_bin));
                CUDA_CHECK(real_cuModuleGetFunction(&countd, countd_mod, kernel_name));
            }
            // add record to hashmap to avoid re-compile
            funcmap_set(f, kernel_name, n_param, n_probe, probe_sizes, probe_types, true, probed, pruned, countd);
            fprintf(event_log, "[probe] finish %p name %s n_param %d\n", f, kernel_name, n_param);
            fflush(event_log);
            // free memory before we leave
            free(dir);
            free(path);
            // NOTE(review): kernel_name points INTO kernel_info, and kernel_name
            // was just stored in the funcmap -- freeing kernel_info here leaves
            // the cached name dangling (use-after-free on later lookups/logs).
            free(kernel_info);
            free(probed_bin);
            free(pruned_bin);
            free(folder_name);
            // don't free(probe_sizes) -> used by func-map!!!
            succeed = true;
        }
    }
    // expose the original param
    fprintf(event_log, "[exec] funcmap-find %p %s\n", f, succeed ? "success" : "fail");
    // check the jit status, if failed, goto backup
    if (!succeed) { goto backup; }

    // @bugfix add timestamp to match with readings from high-level integration (PyTorch)
    // NOTE(review): 1e9 is a double, so the ns timestamp goes through floating
    // point and may lose precision; 1000000000LL would be exact.
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    long long time = ts.tv_nsec + ts.tv_sec * 1e9;
    fprintf(event_log, "[exec] %lld param ", time);
    for (int i = 0; i < n_param; i++) {
        // @note print raw value -> help check raw number but mostly pointers...
        fprintf(event_log, "%llx ", *(CUdeviceptr*)kernelParams[i]);
    }
    fprintf(event_log, "\n");
    fprintf(event_log, "[exec] grid %u %u %u block %u %u %u shared %u\n", gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes);
    fflush(event_log);

    const size_t gridSize = gridDimX * gridDimY * gridDimZ;
    const size_t blockSize = blockDimX * blockDimY * blockDimZ;
    const size_t warpSize = CDIV(blockSize, WARP_SIZE); // warps per block, rounded up

    /**
     * Handling Dynamic Memory Size Requirements
     * @bug COUNT will not work for kernels with inplace modification, i.e.,
     *   the computed result will pollute next launch, usually seen in the
     *   decode kernels like topP/topK, please use a large enough constant
     * @note We may turns to a ring buffer implementation for dynamic buffers,
     *   but the trouble is how to have least interruption to frontned job
     *   Particularly under the case that PCIe speed <<< Memory Bandwidth
     * @todo Support Multiple Dynamic Buffer Allocation (by count many times)
     */
    if (DYNAMIC) {
        n_count = 1; // Let's support 1 dynamic first
        // first set the attributes
        CUDA_CHECK(real_cuFuncSetAttribute(countd, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
        // second set the memory sizes: one uint64 counter per thread
        const size_t size_counter = gridSize * blockSize * sizeof(uint64_t);
        uint64_t *h_counter = malloc(size_counter);
        CUdeviceptr d_counter;
        CUDA_CHECK(real_cuMemAlloc_v2(&d_counter, size_counter));
        CUDA_CHECK(real_cuMemsetD32_v2(d_counter, 0, size_counter / 4UL));
        void** count_args = malloc((n_param + 1) * sizeof(void*));
        memcpy(count_args, kernelParams, n_param * sizeof(void*)); // copy the raw parameters
        count_args[n_param] = &d_counter;
        result = real_cuLaunchKernel(countd, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
                blockDimZ, sharedMemBytes, hStream, count_args, extra);
        // cuMemcpy is a blocked call
        result = real_cuMemcpyDtoH_v2(h_counter, d_counter, size_counter);
        if (result != CUDA_SUCCESS)
            goto backup;
        // count_size := max per-thread count across the whole grid
        for (int idx = 0; idx < gridSize * blockSize; idx++)
            count_size = (count_size > h_counter[idx]) ? count_size : h_counter[idx];
        fprintf(event_log, "[count] size: %d \n", count_size);
        // clean memory before we leave
        free(h_counter);
        free(count_args);
        CUDA_CHECK(real_cuMemFree_v2(d_counter));
    }

    // here start to calculate memory size for every probe based on grid, block and probe_sizes
    // formula similar to ndarray based on grid, block / warp
    size_t *probe_real_sizes = malloc(n_probe * sizeof(size_t));
    size_t total_probe_sizes = 0;
    for (int idx = 0; idx < n_probe; idx++) {
        if (probe_types[idx] == PROBE_TYPE_THREAD) {
            if (probe_sizes[idx] != -1) { // static size: bytes per thread
                probe_real_sizes[idx] = gridSize * blockSize * probe_sizes[idx];
                fprintf(event_log, "[exec] grid %zu block %zu probe %d total %zu\n", gridSize, blockSize, probe_sizes[idx], probe_real_sizes[idx]);
            } else { // -1 means dynamic: use the measured count_size
                probe_real_sizes[idx] = gridSize * blockSize * count_size;
                fprintf(event_log, "[exec] grid %zu block %zu probe %d total %zu\n", gridSize, blockSize, count_size, probe_real_sizes[idx]);
            }
        } else if (probe_types[idx] == PROBE_TYPE_WARP) {
            probe_real_sizes[idx] = gridSize * warpSize * probe_sizes[idx];
            fprintf(event_log, "[exec] grid %zu warp %zu probe %d total %zu\n", gridSize, warpSize, probe_sizes[idx], probe_real_sizes[idx]);
        }
        total_probe_sizes += probe_real_sizes[idx];
    }

    fprintf(event_log, "[exec] probe-mem %zu (bytes)\n", total_probe_sizes);

    // if NEUTRINO_MEMUSAGE, don't execute, just leave
    if (NEUTRINO_MEMUSAGE) {
        free(probe_real_sizes); // free the allocated
        goto backup;
    }

    // Allocate Memory on Host and Device
    void** h_probe_mems = malloc(n_probe * sizeof(void*));
    CUdeviceptr* d_probe_mems = malloc(n_probe * sizeof(CUdeviceptr));
    for (int idx = 0; idx < n_probe; idx++) {
        h_probe_mems[idx] = malloc(probe_real_sizes[idx]);
        CUDA_CHECK(real_cuMemAlloc_v2(&d_probe_mems[idx], probe_real_sizes[idx]));
        CUDA_CHECK(real_cuMemsetD32_v2(d_probe_mems[idx], 0, probe_real_sizes[idx] / 4UL));
    }

    // @note argument layout is (n_param + n_probe) * sizeof(void*), n_param is parsed inside ptx
    void** probe_args = malloc((n_param + n_probe + n_count) * sizeof(void*));
    // first copy the raw parameters
    memcpy(probe_args, kernelParams, n_param * sizeof(void*));
    for (int idx = 0; idx < n_probe; idx++) {
        probe_args[n_param + idx] = &d_probe_mems[idx]; // offset with n_param -> place later
    }
    for (int idx = 0; idx < n_count; idx++) {
        probe_args[n_param + n_probe + idx] = &count_size; // similar offset
    }

    /**
     * @note set the shared memory size. If the kernel shared memory size exceed a limit (usually half)
     * of the physical SMEM size (per SM), then cuLaunchKernel will raise CUDA_ERROR_INVALID_VALUE, we
     * need to manually set via cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared)
     *
     * @details Neutrino JIT Function is considered a new one and can not inherit original setup...
     */
    CUDA_CHECK(real_cuFuncSetAttribute(probed, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
    if (NEUTRINO_BENCHMARK) {
        // flush L2 before the timed run so caches are cold
        real_cuMemsetD32_v2(benchmark_flush_mem,0, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE / 4UL);
    }
    CUDA_CHECK(real_cuEventRecord(end_event, hStream)); // use the stream specified in param
    CUDA_CHECK(real_cuEventSynchronize(end_event));
    CUDA_CHECK(real_cuEventElapsedTime(&prologue_time, start_event, end_event));
    CUDA_CHECK(real_cuEventRecord(start_event, hStream)); // use the stream specified in param
    // launch kernel by call real_cuLaunchKernel function
    result = real_cuLaunchKernel(probed, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
            blockDimZ, sharedMemBytes, hStream, probe_args, extra);
    CUDA_CHECK(real_cuEventRecord(end_event, hStream)); // use the stream specified in param
    CUDA_CHECK(real_cuEventSynchronize(end_event)); // kernel ends at this line
    // calculate the real kernel time
    CUDA_CHECK(real_cuEventElapsedTime(&kernel_time, start_event, end_event));
    CUDA_CHECK(real_cuEventRecord(start_event, hStream)); // use the stream specified in param
    if (result != CUDA_SUCCESS) {
        for (int idx = 0; idx < n_probe; idx++) {
            free(h_probe_mems[idx]);
            CUDA_CHECK(real_cuMemFree_v2(d_probe_mems[idx]));
        }
        free(h_probe_mems);
        free(d_probe_mems);
        free(probe_real_sizes);
        free(probe_args);
        fprintf(event_log, "[exec] failed %d\n", result);
        goto backup;
    } else {
        fprintf(event_log, "[exec] succeed %d\n", result);
    }

    // On benchmark, we don't save results because we're testing the kernel
    if (NEUTRINO_BENCHMARK) {
        goto leave;
    }

    /**
     * Saving Trace to disk
     * @todo Standardize this part in common.h because it's platform-agnostic!
     */
    for (int idx = 0; idx < n_probe; idx++) {
        CUDA_CHECK(real_cuMemcpyDtoH_v2(h_probe_mems[idx], d_probe_mems[idx], probe_real_sizes[idx]));
    }
    // create dump file named by seconds elapsed since `start`
    // NOTE(review): `start` is not declared in this function -- presumably a
    // global struct timespec set at library init (common.h); verify.
    char* DUMP_FILE_NAME = malloc(strlen(RESULT_DIR) + 20);
    struct timespec end;
    clock_gettime(CLOCK_REALTIME, &end);
    double elapsed = ((end.tv_sec * 1e9 + end.tv_nsec) - (start.tv_sec * 1e9 + start.tv_nsec)) / 1e9;
    sprintf(DUMP_FILE_NAME, "%s/%.6f.bin", RESULT_DIR, elapsed);
    FILE *fp = fopen(DUMP_FILE_NAME, "wb");
    if (!fp) {
        // NOTE(review): this early return leaks h_probe_mems/d_probe_mems,
        // probe_args, DUMP_FILE_NAME and the CUDA events.
        fprintf(event_log, "[exec] can't-save %s\n", DUMP_FILE_NAME);
        return CUDA_SUCCESS;
    }
    // write header to file
    trace_header_t header = { gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, n_probe };
    fwrite(&header, sizeof(header), 1, fp);
    // write sections to file
    size_t offset = sizeof(header) + n_probe * sizeof(trace_section_t);
    for (int idx = 0; idx < n_probe; idx++) {
        trace_section_t section;
        section.size = probe_sizes[idx] != -1 ? probe_sizes[idx] : count_size;
        section.warpDiv = (probe_types[idx] == PROBE_TYPE_WARP) ? WARP_SIZE : 1;
        section.offset = offset;
        offset += gridSize * blockSize * section.size / section.warpDiv;
        // NOTE(review): "§ion" below is a mis-encoded "&section" (HTML
        // entity &sect; introduced by extraction) -- the original argument is
        // the address of the local `section`.
        fwrite(§ion, sizeof(section), 1, fp);
    }
    // write data
    for (int idx = 0; idx < n_probe; idx++) {
        fwrite(h_probe_mems[idx], 1, probe_real_sizes[idx], fp);
    }
    // close file
    fclose(fp);
    fprintf(event_log, "[exec] save %s size %zu\n", DUMP_FILE_NAME, offset);

leave:
    // on leave
    // free allocated memory before leave
    for (int idx = 0; idx < n_probe; idx++) {
        free(h_probe_mems[idx]);
        CUDA_CHECK(real_cuMemFree_v2(d_probe_mems[idx]));
    }
    CUDA_CHECK(real_cuEventRecord(end_event, hStream)); // use the stream specified in param
    CUDA_CHECK(real_cuEventSynchronize(end_event));
    CUDA_CHECK(real_cuEventElapsedTime(&epilogue_time, start_event, end_event));

    if (NEUTRINO_BENCHMARK) {
        // On benchmark mode, we
        // @note it seems this will launch a kernel implicitly to clear L2 cache
        real_cuMemsetD32_v2(benchmark_flush_mem,0, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE / 4UL);
        // here Neutrino use pruned ptx being compiled with exactly the same configuration (assmbler & optimization) with probed
        float original_time;
        CUDA_CHECK(real_cuFuncSetAttribute(pruned, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
        CUDA_CHECK(real_cuEventRecord(start_event, hStream)); // use the stream specified in param
        // launch original kernel with original parameter
        result = real_cuLaunchKernel(pruned, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
        CUDA_CHECK(real_cuEventRecord(end_event, hStream)); // use the stream specified in param
        CUDA_CHECK(real_cuEventSynchronize(end_event));
        // calculate the real kernel time
        CUDA_CHECK(real_cuEventElapsedTime(&original_time, start_event, end_event));
        fprintf(event_log, "[benchmark] prologue %f kernel %f epilogue %f original %f impact %f %d\n", prologue_time, kernel_time, epilogue_time, original_time, kernel_time / original_time, result);
    } else {
        // In normal mode, report the prologue, kernel, epilogue and impact ratio
        fprintf(event_log, "[exec] prologue %f kernel %f epilogue %f ratio %f\n", prologue_time, kernel_time, epilogue_time, (prologue_time + kernel_time + epilogue_time) / kernel_time);
        // Also create subprocess for analyze routine when the callback is a .py script
        if (NEUTRINO_CALLBACK && strlen(NEUTRINO_CALLBACK) >= 3 && strcmp(NEUTRINO_CALLBACK + strlen(NEUTRINO_CALLBACK) - 3, ".py") == 0) {
            pid_t pid = fork();
            if (pid < 0) {
                fprintf(event_log, "[probe] can't-folk\n");
            } else if (pid == 0) { // child process, run python process.py kernel name
                // python <callback>.py <dump file>
                execlp(NEUTRINO_PYTHON, NEUTRINO_PYTHON, NEUTRINO_CALLBACK, DUMP_FILE_NAME, NULL);
                exit(EXIT_FAILURE); // reach here only if exec error -> failure
            } else { // parent process, wait for child
                fprintf(event_log, "[callback] subproc %s %s %s\n", NEUTRINO_PYTHON, NEUTRINO_CALLBACK, DUMP_FILE_NAME);
                int status;
                waitpid(pid, &status, 0);
                if (status != EXIT_SUCCESS) {
                    fprintf(event_log, "[callback] failed\n");
                } else {
                    fprintf(event_log, "[callback] succeed\n");
                }
            }
        }
        free(DUMP_FILE_NAME);
    }

    free(h_probe_mems);
    free(d_probe_mems);
    free(probe_real_sizes);
    free(probe_args);
    fflush(event_log); // make sure all logs are written before we go
    return CUDA_SUCCESS; // reach here must be CUDA_SUCCESS

backup:
    // fall back to original version
    fprintf(event_log, "[exec] backup %u %u %u block %u %u %u shared %u\n", gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes);
    result = real_cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
    fflush(event_log);
    return result;
}

/**
 * Memory API that helps debugging memory operation errors
 */
CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize) {
    if (shared_lib == NULL) { init(); }

    CUresult result = real_cuMemAlloc_v2(dptr,
bytesize);
    fprintf(event_log, "[mem] cuMemAlloc_v2 %d dptr %llx bytesize %zu\n", result, *dptr, bytesize);
    return result;
}

// Hooked cuMemFree_v2: forwards to the real driver, then logs the result code
// and the freed device pointer so memory errors can be correlated with the trace.
CUresult cuMemFree_v2(CUdeviceptr dptr) {
    if (shared_lib == NULL) { init(); }

    CUresult result = real_cuMemFree_v2(dptr);
    fprintf(event_log, "[mem] cuMemFree_v2 %d dptr %llx\n", result, dptr);
    return result;
}

/**
 * Following functions shall also be hooked but we don't observe any workload
 * calling them, thus having a [info] section for tracing, add if needed
 */
// Hooked cuModuleLoad: pure passthrough with an [info] log entry.
// NOTE(review): unlike cuModuleLoadData, this path does not register the
// module in the binmap -- presumably file-loaded modules are not traced; confirm.
CUresult cuModuleLoad(CUmodule* module, const char* fname) {
    if (shared_lib == NULL) { init(); }

    CUresult result = real_cuModuleLoad(module, fname); // call the symbol
    fprintf(event_log, "[info] cuModuleLoad %d\n", result);
    return result;
}


/**
 * Unmodified part of code, automatically generated by parse.py
 * usually we don't trace these API, just print a event_log to indicate they're used
 * if there's any weird behavior caused by Neutrino (unlikely), we can have a look
 */
#include "unmodified.c" // include the auto-generated code

/**
 * @note this function is intentionally masked out by Neutrino because it might
 * let the process jump out of Neutrino's Hook Driver directly to the real driver
 */
/*
CUresult cuGetProcAddress_v2(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) {
    if (real_cuGetProcAddress_v2 == NULL)
        init();

    CUresult ret;

    if (strcmp(symbol, "cuGetProcAddress") == 0) {
        CUresult (*cuGetProcAddress_v2_ptr)(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) = cuGetProcAddress_v2;
        *pfn = cuGetProcAddress_v2_ptr;
        ret = CUDA_SUCCESS;
        fprintf(event_log, "[pass] cuGetProcAddress_v2 %d %s %d return-myself\n", ret, symbol, cudaVersion); // unexpected func call
    } else if (strcmp(symbol, "cuGetExportTable") == 0) {
        // CUresult (*cuGetExportTable_ptr)(const void**, const CUuuid*) =
cuGetExportTable; + // *pfn = cuGetExportTable_ptr; + ret = CUDA_ERROR_INVALID_VALUE; + fprintf(event_log, "[pass] cuGetProcAddress_v2 %d %s %d return-ours\n", ret, symbol, cudaVersion); // unexpected func call + } else { + ret = real_cuGetProcAddress_v2(symbol, pfn, cudaVersion, flags, symbolStatus); + fprintf(event_log, "[pass] cuGetProcAddress_v2 %d %s %d\n", ret, symbol, cudaVersion); // unexpected func call + } + + return ret; +} +*/ \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/hip.c b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/hip.c new file mode 100644 index 0000000..0fee397 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/hip.c @@ -0,0 +1,586 @@ +/** + * Neutrino Hook Driver, AMD Implementation + * + * @note experimental support only + */ + +#include "common.h" // for common headers +#include // for AMD HIP Hooked APIs + +#define WARP_SIZE 64 // @bug some RDNA use 32 as WARP_SIZE + +// used by benchmark mode +static hipDeviceptr_t benchmark_flush_mem = NULL; + +// following functions are hooked for internal usage +hipError_t (*real_hipModuleLoadData)(hipModule_t*, const void*) = NULL; +hipError_t (*real_hipModuleLoadDataEx)(hipModule_t*, const void*, unsigned int, hipJitOption*, void**) = NULL; +hipError_t (*real_hipModuleGetFunction)(hipFunction_t*, hipModule_t, const char*) = NULL; +hipError_t (*real_hipModuleLaunchKernel)(hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void**, void**) = NULL; +hipError_t (*real_hipMalloc)(void**, size_t) = NULL; +hipError_t (*real_hipFree)(void*) = NULL; +hipError_t (*real_hipModuleLoad)(hipModule_t*, const char*) = NULL; +// following are for weird parsing +hipError_t (*real_hipEventRecord)(hipEvent_t, hipStream_t) = NULL; +const char* (*real_hipGetErrorName)(hipError_t) = NULL; +const 
char* (*real_hipGetErrorString)(hipError_t) = NULL;
const char* (*real_hipApiName)(uint32_t) = NULL;
const char* (*real_hipKernelNameRef)(const hipFunction_t) = NULL;
const char* (*real_hipKernelNameRefByPtr)(const void*, hipStream_t) = NULL;

// include auto-generated signatures for unmodified functions
// @note signature.c will be auto generated by parse.py
#include "signature.c"

// Abort the process (with file:line and the HIP error name) when a hooked
// driver call fails.
// NOTE(review): real_hipDrvGetErrorName is neither declared nor dlsym'd in
// this file -- presumably it comes from the auto-generated signature.c;
// confirm, otherwise this macro does not link.
#define HIP_CHECK(cmd) do { \
    hipError_t result = cmd; \
    if (result != hipSuccess) { \
        const char *msg; \
        real_hipDrvGetErrorName(result, &msg); \
        printf("Neutrino fail: %s:%d '%s'\n", \
               __FILE__, __LINE__, msg); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Resolve the real HIP entry points from the preloaded driver library and
// perform one-time setup. Called lazily from every hook via the
// `if (shared_lib == NULL) init();` guard.
static void init(void) {
    common_init(); // init common modules
    real_hipModuleLoadData = dlsym(shared_lib, "hipModuleLoadData");
    real_hipModuleLoadDataEx = dlsym(shared_lib, "hipModuleLoadDataEx");
    real_hipModuleGetFunction = dlsym(shared_lib, "hipModuleGetFunction");
    real_hipModuleLaunchKernel = dlsym(shared_lib, "hipModuleLaunchKernel");
    real_hipMalloc = dlsym(shared_lib, "hipMalloc");
    real_hipFree = dlsym(shared_lib, "hipFree");
    real_hipModuleLoad = dlsym(shared_lib, "hipModuleLoad");
    real_hipEventRecord = dlsym(shared_lib, "hipEventRecord");
    real_hipGetErrorName = dlsym(shared_lib, "hipGetErrorName");
    real_hipGetErrorString = dlsym(shared_lib, "hipGetErrorString");
    real_hipApiName = dlsym(shared_lib, "hipApiName");
    real_hipKernelNameRef = dlsym(shared_lib, "hipKernelNameRef");
    real_hipKernelNameRefByPtr = dlsym(shared_lib, "hipKernelNameRefByPtr");
    init_unmodified(); // init unmodified functions, defined in signature.c
    CHECK_DL(); // checking if any dl error presented
    // initialize the L2 flush memory if benchmark mode is enabled
    if (NEUTRINO_BENCHMARK) {
        fprintf(event_log, "[benchmark] ENABLED L2 Flush Size %d\n", NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE);
        real_hipMalloc(&benchmark_flush_mem, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE);
    }
    fprintf(event_log, "[init] success\n");
}

+/** + * Module Management + * @see https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hip_runtime_api/modules/module_management.html + * + * HIP Module Management is simpler than CUDA, only Module -> Function + */ + +hipError_t hipModuleLoadData(hipModule_t* module, const void* image) { + if (shared_lib == NULL) { init(); } + + hipError_t ret = real_hipModuleLoadData(module, image); + + fprintf(event_log, "[mod] hipModuleLoadData mod %p code %p\n", *module, image); + + // update to hashmap + void* managed; + size_t size; + if (get_managed_code_size(&managed, &size, image) != -1) { + binmap_set(*module, managed, size, NULL); // name = NULL as we don't know it now + } + + return ret; +} + +hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, hipJitOption* options, void** optionValues) { + if (shared_lib == NULL) { init(); } + + hipError_t ret = real_hipModuleLoadDataEx(module, image, numOptions, options, optionValues); + + fprintf(event_log, "[mod] hipModuleLoadData mod %p code %p\n", *module, image); + + // update to hashmap + void* managed; + size_t size; + if (get_managed_code_size(&managed, &size, image) != -1) { + binmap_set(*module, managed, size, NULL); // name = NULL as we don't know it now + } + return ret; +} + +hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname) { + if (shared_lib == NULL) { init(); } + + // first update the name + size_t len = strlen(kname); + char* managed_name = malloc(len); + memcpy(managed_name, kname, len); + + // call real function + hipError_t result = real_hipModuleGetFunction(function, module, kname); + + fprintf(event_log, "[mod] hipModuleGetFunction func %p mod %p name %s\n", *function, module, kname); + + // then update the key from module to function + if (binmap_update_name_key(module, *function, managed_name) == -1) { + fprintf(event_log, "[hash] hipModuleGetFunction failed-update %p %p %s\n", module, *function, managed_name); + 
} + + return result; +} + +hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t stream, void** kernelParams, void** extra) { + if (shared_lib == NULL) { init(); } + + hipEvent_t start_event, end_event; + HIP_CHECK(real_hipEventCreate(&start_event)); + HIP_CHECK(real_hipEventCreate(&end_event)); + + HIP_CHECK(real_hipEventRecord(start_event, stream)); // use the stream specified in param + + float prologue_time, kernel_time, epilogue_time; + hipErrot_t result; + hipFunction_t probed, pruned; + char* kernel_name; + int n_param, n_probe; + int* probe_sizes; // size of probes + int* probe_types; // type of probes + bool succeed; // jit status + char* analyze_hook; // path to python script + + // try obtain the kernel compiled or raise compilation process + // @note count and record is only valid if succeed == true + if (funcmap_get(f, &kernel_name, &n_param, &n_probe, &probe_sizes, &probe_types, &succeed, &probed, &pruned) == -1) { + fprintf(event_log, "[exec] funcmap-not-find %p\n", f); + fflush(event_log); // we need many fflush to avoid trace not printed + // here try to get binary from binmap and start JIT compile + size_t size; + void* bin; + if (binmap_get(f, &size, &kernel_name, &bin) == -1) { // not found the binary, fall back + fprintf(event_log, "[jit] can't-find %p\n", f); + fflush(event_log); + funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL); // set dummy with status FALSE + goto backup; + } else { + fprintf(event_log, "[jit] find %p name %s bin %p size %zu\n", f, kernel_name, bin, size); + fflush(event_log); + // create a directory under the kernel directory with kernel_name + // @note Linux has limit on directory length 255, replace it to sh1 so 20 char + // @bugfix PyTorch kernel name usually is extremely long :( + // @bugfix Triton autotune leads to a set of 
kernel with same name -> use counter to differentiate + char *tmp = sha1(kernel_name); + char *folder_name = (char*) malloc(5 + strlen(tmp)); + sprintf(folder_name, "%d_%s", kernel_idx, tmp); + free(tmp); + kernel_idx++; + fprintf(event_log, "[jit] rename %s %s\n", kernel_name, folder_name); + fflush(event_log); + char* dir = malloc(strlen(KERNEL_DIR) + strlen(folder_name) + 10); + sprintf(dir, "%s/%s", KERNEL_DIR, folder_name); + if (mkdir(dir, 0755) == 0) { + fprintf(event_log, "[jit] mkdir %s\n", dir); + } else { + fprintf(event_log, "[jit] can't-mkdir %s\n", dir); + funcmap_set(f, kernel_name, 0, 0, NULL, NULL, false, NULL, NULL); // set status FALSE to prevent recompile fault + goto backup; + } + // create original.bin and write the binary to it + char* path = malloc(strlen(dir) + 15); + sprintf(path, "%s/original.bin", dir); + FILE* original_bin = fopen(path, "wb"); + if (original_bin == NULL) { + fprintf(event_log, "[jit] can't-open %s\n", path); + funcmap_set(f, kernel_name, 0, 0, NULL, NULL, false, NULL, NULL); // set status FALSE to prevent recompile fault + goto backup; + } + fwrite(bin, size, 1, original_bin); + fclose(original_bin); + fprintf(event_log, "[jit] write %s\n", path); + // create subprocess to run process.py, be aware of multi-processing + pid_t pid = fork(); + if (pid < 0) { + fprintf(event_log, "[jit] can't-folk\n"); + funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL); // set status FALSE to prevent recompile fault + goto backup; + } else if (pid == 0) { // child process, run python process.py kernel name + // python process.py + execlp(NEUTRINO_PYTHON, NEUTRINO_PYTHON, NEUTRINO_PROBING_PY, dir, kernel_name, NULL); + exit(EXIT_FAILURE); // reach here only if exec error -> failure + } else { // parent process, wait for child + fprintf(event_log, "[jit] subproc %s %s %s %s\n", NEUTRINO_PYTHON, NEUTRINO_PROBING_PY, dir, kernel_name); + int status; + waitpid(pid, &status, 0); + if (status != EXIT_SUCCESS) { + fprintf(event_log, 
"[jit] python failed\n"); + funcmap_set(f,kernel_name, 0, 0, NULL, NULL, false, NULL, NULL); // set dummy with status FALSE + goto backup; + } else { + fprintf(event_log, "[jit] python succeed\n"); + } + } + // read the kernel.info from file system + sprintf(path, "%s/kernel.info", dir); + char* kernel_info = readf(path, "r"); + // poor parser for kernel.info + // @todo add checking alignment + char* kernel_end = strchr(kernel_info, '\n'); + *kernel_end = '\0'; + kernel_name = kernel_info; + char* start = kernel_end + 1; + sscanf(start, " %d\n%d\n", &n_param, &n_probe); + // read sizes and types of probe + probe_sizes = malloc(n_probe * sizeof(int)); + probe_types = malloc(n_probe * sizeof(int)); + char* strptr = strchr(strchr(start, '\n') + 1, '\n') + 1; + for (int idx = 0; idx < n_probe; idx++) { + sscanf(strptr, "%d,%d\n", &probe_types[idx], &probe_sizes[idx]); + strptr = strchr(strptr, '\n') + 1; + } + // @note read process hook, not yet checked + char* info_end = strchr(strptr, '\n'); + *info_end = '\0'; + analyze_hook = strptr; + // here read the + fprintf(event_log, "[jit] read %s name %s n_param %d n_probe %d analyze_hook %s\n", path, kernel_name, n_param, n_probe, analyze_hook); + // load probed.bin -> for collecting runtime info + sprintf(path, "%s/probed.bin", dir); + void* probed_bin = readf(path, "rb"); + // load pruned.bin -> for benchmark + sprintf(path, "%s/pruned.bin", dir); + void* pruned_bin = readf(path, "rb"); + // then load the binary to module + hipModule_t probed_mod, pruned_mod; + HIP_CHECK(real_hipModuleLoadData(&probed_mod, probed_bin)); + HIP_CHECK(real_hipModuleGetFunction(&probed, probed_mod, kernel_name)); + HIP_CHECK(real_hipModuleLoadData(&pruned_mod, pruned_bin)); + HIP_CHECK(real_hipModuleGetFunction(&pruned, pruned_mod, kernel_name)); + // add record to hashmap to avoid re-compile + funcmap_set(f, kernel_name, n_param, n_probe, probe_sizes, probe_types, true, probed, pruned); + fprintf(event_log, "[jit] finish %p name %s n_param 
%d\n", f, kernel_name, n_param); + fflush(event_log); + // free memory before we leave + free(dir); + free(path); + free(kernel_info); + free(probed_bin); + free(pruned_bin); + free(folder_name); + // don't free(probe_sizes) -> used by func-map!!! + succeed = true; + } + } + // expose the original param + fprintf(event_log, "[exec] funcmap-find %p %s\n", f, succeed ? "success" : "fail"); + // check the jit status, if failed, goto backup + if (!succeed) { goto backup; } + + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + long long time = ts.tv_nsec + ts.tv_sec * 1e9; + fprintf(event_log, "[exec] %lld param ", time); + for (int i = 0; i < n_param; i++) { + // @note print raw value -> help check raw number but mostly pointers... + fprintf(event_log, "%llx ", *(hipDeviceptr_t*)kernelParams[i]); + } + fprintf(event_log, "\n"); + fprintf(event_log, "[exec] grid %u %u %u block %u %u %u shared %u\n", gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes); + fflush(event_log); + + // here start to calculate memory size for every probe based on grid, block and probe_sizes + // formula similar to ndarray based on grid, block / warp + const size_t gridSize = gridDimX * gridDimY * gridDimZ; + const size_t blockSize = blockDimX * blockDimY * blockDimZ; + const size_t warpSize = CDIV(blockSize, WARP_SIZE); + size_t *probe_real_sizes = malloc(n_probe * sizeof(size_t)); + size_t total_probe_sizes = 0; + for (int idx = 0; idx < n_probe; idx++) { + if (probe_types[idx] == PROBE_TYPE_THREAD) { + probe_real_sizes[idx] = gridSize * blockSize * probe_sizes[idx]; + fprintf(event_log, "[exec] grid %zu block %zu probe %d total %zu\n", gridSize, blockSize, probe_sizes[idx], probe_real_sizes[idx]); + } else if (probe_types[idx] == PROBE_TYPE_WARP) { + probe_real_sizes[idx] = gridSize * warpSize * probe_sizes[idx]; + fprintf(event_log, "[exec] grid %zu warp %zu probe %d total %zu\n", gridSize, warpSize, probe_sizes[idx], probe_real_sizes[idx]); + } + 
total_probe_sizes += probe_real_sizes[idx]; + } + + fprintf(event_log, "[exec] probe-mem %zu (bytes)\n", total_probe_sizes); + + // Allocate Memory on Host and Device + void** h_probe_mems = malloc(n_probe * sizeof(void*)); + hipDeviceptr_t* d_probe_mems = malloc(n_probe * sizeof(hipDeviceptr_t)); + for (int idx = 0; idx < n_probe; idx++) { + h_probe_mems[idx] = malloc(probe_real_sizes[idx]); + HIP_CHECK(real_hipMalloc(&d_probe_mems[idx], probe_real_sizes[idx])); + HIP_CHECK(real_hipMemsetD32(d_probe_mems[idx], 0, probe_real_sizes[idx] / 4UL)); + } + + // @note argument layout is (n_param + n_probe) * sizeof(void*), n_param is parsed inside ptx + void** probe_args = malloc((n_param + n_probe) * sizeof(void*)); + memcpy(probe_args, kernelParams, n_param * sizeof(void*)); // copy the raw parameters + for (int idx = 0; idx < n_probe; idx++) { + probe_args[n_param + idx] = &d_probe_mems[idx]; // offset with n_param -> place later + } + /** + * @note set the shared memory size. If the kernel shared memory size exceed a limit (usually half) + * of the physical SMEM size (per SM), then hipModuleLaunchKernel will raise CUDA_ERROR_INVALID_VALUE, we + * need to manually set via cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared) + * + * @details Neutrino JIT Function is considered a new one and can not inherit original setup... 
+ */ + HIP_CHECK(real_hipFuncSetAttribute(probed, hipFuncAttributeMaxDynamicSharedMemorySize, sharedMemBytes)); + if (NEUTRINO_BENCHMARK) { + real_hipMemsetD32(benchmark_flush_mem, 0, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE / 4UL); + } + HIP_CHECK(real_hipEventRecord(end_event, stream)); // use the stream specified in param + HIP_CHECK(real_hipEventSynchronize(end_event)); + HIP_CHECK(real_hipEventElapsedTime(&prologue_time, start_event, end_event)); + HIP_CHECK(real_hipEventRecord(start_event, stream)); // use the stream specified in param + // launch kernel by call real_hipModuleLaunchKernel function + result = real_hipModuleLaunchKernel(probed, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + blockDimZ, sharedMemBytes, stream, probe_args, extra); + HIP_CHECK(real_hipEventRecord(end_event, stream)); // use the stream specified in param + HIP_CHECK(real_hipEventSynchronize(end_event)); + // calculate the real kernel time + HIP_CHECK(real_hipEventElapsedTime(&kernel_time, start_event, end_event)); + HIP_CHECK(real_hipEventRecord(start_event, stream)); // use the stream specified in param + if (result != hipSuccess) { + for (int idx = 0; idx < n_probe; idx++) { + free(h_probe_mems[idx]); + HIP_CHECK(real_hipFree(d_probe_mems[idx])); + } + free(h_probe_mems); + free(d_probe_mems); + free(probe_real_sizes); + free(probe_args); + fprintf(event_log, "[exec] failed %d\n", result); + goto backup; + } else { + fprintf(event_log, "[exec] succeed %d\n", result); + } + + // dump to disk + for (int idx = 0; idx < n_probe; idx++) { + HIP_CHECK(real_hipMemcpyDtoH(h_probe_mems[idx], d_probe_mems[idx], probe_real_sizes[idx])); + } + // create dump file + char* DUMP_FILE_NAME = malloc(strlen(RESULT_DIR) + 20); + struct timespec end; + clock_gettime(CLOCK_REALTIME, &end); + double elapsed = ((end.tv_sec * 1e9 + end.tv_nsec) - (start.tv_sec * 1e9 + start.tv_nsec)) / 1e9; + sprintf(DUMP_FILE_NAME, "%s/%.6f.bin", RESULT_DIR, elapsed); + FILE *fp = fopen(DUMP_FILE_NAME, "wb"); + if (!fp) 
{ + fprintf(event_log, "[exec] can't-save %s\n", DUMP_FILE_NAME); + return hipSuccess; // only can't save, still success in execution + } + // write header to file + trace_header_t header; + // gridDim: uint3 + header.gridDimX = gridDimX; + header.gridDimY = gridDimY; + header.gridDimZ = gridDimZ; + // blockDim: uint3 + header.blockDimX = blockDimX; + header.blockDimY = blockDimY; + header.blockDimZ = blockDimZ; + // sharedMemBytes and numProbes + header.sharedMemBytes = sharedMemBytes; + header.numProbes = n_probe; + fwrite(&header, sizeof(header), 1, fp); + // write sections to file + size_t offset = sizeof(header) + n_probe * sizeof(trace_section_t); + for (int idx = 0; idx < n_probe; idx++) { + trace_section_t section; + section.size = probe_real_sizes[idx]; + section.offset = offset; + offset += section.size; + fwrite(§ion, sizeof(section), 1, fp); + } + // write data + for (int idx = 0; idx < n_probe; idx++) { + fwrite(h_probe_mems[idx], 1, probe_real_sizes[idx], fp); + } + // close file + fclose(fp); + fprintf(event_log, "[exec] save %s size %zu\n", DUMP_FILE_NAME, offset); + // free allocated memory before leave + for (int idx = 0; idx < n_probe; idx++) { + free(h_probe_mems[idx]); + HIP_CHECK(real_hipFree(d_probe_mems[idx])); + } + + // on leave + HIP_CHECK(real_hipEventRecord(end_event, stream)); // use the stream specified in param + HIP_CHECK(real_hipEventSynchronize(end_event)); + HIP_CHECK(real_hipEventElapsedTime(&epilogue_time, start_event, end_event)); + if (NEUTRINO_BENCHMARK) { + real_hipMemsetD32(benchmark_flush_mem,0, NEUTRINO_BENCHMARK_FLUSH_MEM_SIZE / 4UL); + // here Neutrino use pruned ptx being compiled with exactly the same configuration (assmbler & optimization) with probed + float original_time; + HIP_CHECK(real_hipFuncSetAttribute(pruned, hipFuncAttributeMaxDynamicSharedMemorySize, sharedMemBytes)); + HIP_CHECK(real_hipEventRecord(start_event, stream)); // use the stream specified in param + // launch original kernel with original 
parameter + result = real_hipModuleLaunchKernel(pruned, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra); + HIP_CHECK(real_hipEventRecord(end_event, stream)); // use the stream specified in param + HIP_CHECK(real_hipEventSynchronize(end_event)); + // calculate the real kernel time + HIP_CHECK(real_hipEventElapsedTime(&original_time, start_event, end_event)); + fprintf(event_log, "[benchmark] prologue %f kernel %f epilogue %f original %f impact %f %d\n", prologue_time, kernel_time, epilogue_time, original_time, kernel_time / original_time, result); + } else { + fprintf(event_log, "[exec] prologue %f kernel %f epilogue %f ratio %f\n", prologue_time, kernel_time, epilogue_time, (prologue_time + kernel_time + epilogue_time) / kernel_time); + } + + // @note do the analyze_hook if having + if (strlen(analyze_hook) >= 3 && strcmp(analyze_hook + strlen(analyze_hook) - 3, ".py") == 0) { + pid_t pid = fork(); + if (pid < 0) { + fprintf(event_log, "[jit] can't-folk\n"); + } else if (pid == 0) { // child process, run python process.py kernel name + execlp(NEUTRINO_PYTHON, NEUTRINO_PYTHON, analyze_hook, DUMP_FILE_NAME, NULL); + exit(EXIT_FAILURE); // reach here only if exec error -> failure + } else { // parent process, wait for child + fprintf(event_log, "[analyze] subproc %s %s %s\n", NEUTRINO_PYTHON, analyze_hook, DUMP_FILE_NAME); + int status; + waitpid(pid, &status, 0); + if (status != EXIT_SUCCESS) { + fprintf(event_log, "[analyze] failed\n"); + } else { + fprintf(event_log, "[analyze] succeed\n"); + } + } + } + + free(h_probe_mems); + free(d_probe_mems); + free(probe_real_sizes); + free(probe_args); + + return hipSuccess; // reach here must be hipSuccess + +backup: + // fall back to original version + fprintf(event_log, "[exec] backup %u %u %u block %u %u %u shared %u\n", gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes); + result = real_hipModuleLaunchKernel(f, gridDimX, gridDimY, 
gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra); + return result; +} + +/** + * Memory API, hooked to be helpful in checking allocated memory region + */ + +hipError_t hipMalloc(void **ptr, size_t size) { + if (shared_lib == NULL) { init(); } + + hipError_t result = real_hipMalloc(ptr, size); + + fprintf(event_log, "[mem] hipMalloc %d dptr %llx bytesize %zu\n", result, *ptr, size); + + return result; +} + + +hipError_t hipFree(void *ptr) { + if (shared_lib == NULL) { init(); } + + hipError_t result = real_hipFree(ptr); + + fprintf(event_log, "[mem] cuMemFree_v2 %d dptr %llx\n", result, ptr); + + return result; +} + +// verify if this API is used +hipError_t hipModuleLoad(hipModule_t *module, const char *fname) { + if (shared_lib == NULL) { init(); } + + hipError_t result = real_hipModuleLoad(module, fname); + + fprintf(event_log, "[mod] hipModuleLoad %d mod %llx name %s\n", *module, fname); + + return result; +} + +// just for some parsing error, don't understand why HIP has such weird API +hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { + if (shared_lib == NULL) { init(); } + hipError_t result = real_hipEventRecord(event, stream); + return result; +} + +const char* hipGetErrorName(hipError_t hip_error) { // Hi AMD, why hip_error here? + if (shared_lib == NULL) { init(); } + hipError_t result = real_hipGetErrorName(hip_error); + return result; +} + +const char* hipGetErrorString(hipError_t hipError) { // Hi AMD, why hipError here? 
+ if (shared_lib == NULL) { init(); } + hipError_t result = real_hipGetErrorString(hipError); + return result; +} + +const char* hipApiName(uint32_t id) { + if (shared_lib == NULL) { init(); } + hipError_t result = real_hipApiName(id); + return result; +} + +const char* hipKernelNameRef(const hipFunction_t f) { + if (shared_lib == NULL) { init(); } + hipError_t result = real_hipKernelNameRef(f); + return result; +} + +const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream) { + if (shared_lib == NULL) { init(); } + hipError_t result = real_hipKernelNameRefByPtr(hostFunction, stream); + return result; +} + +/** + * Unmodified part of code, automatically generated by parse.py + * usually we don't trace these API, just print a event_log to indicate they're used + * if there's any weird behavior caused by Neutrino (unlikely), we can have a look + */ +#include "unmodified.c" // include the auto-generated code + +// Following are some (now) undocumented API from HIP v4.2 +// but they're still in use, at least rocBLAS (dependent of PyToch) will use it + +// @see https://github.com/ROCm/hip/blob/rocm-4.2.x/rocclr/hip_platform.cpp#L76 +void** __hipRegisterFatBinary(const void* data) { + if (shared_lib == NULL) { init(); } + void** (real___hipRegisterFatBinary*)(const void*) = dlsym(shared_lib, "__hipRegisterFatBinary"); + + void** fatbin = real___hipRegisterFatBinary(data); + if (VERBOSE) { fprintf(event_log, "[info] __hipRegisterFatBinary %p %p\n", *fatbin, data); } + return fatbin; +} + +// @see https://github.com/ROCm/hip/blob/rocm-4.2.x/rocclr/hip_platform.cpp#L87 +void __hipRegisterFunction( + void** modules, + const void* hostFunction, + char* deviceFunction, + const char* deviceName, + unsigned int threadLimit, + uint3* tid, // There's no official C style Impl of uint3 and dim3, I refer to + uint3* bid, // https://rocm.docs.amd.com/projects/HIP/en/docs-5.7.0/reference/kernel_language.html#short-vector-types + dim3* blockDim, // so as the dim3 + 
dim3* gridDim, // https://rocm.docs.amd.com/projects/HIP/en/docs-5.7.0/reference/kernel_language.html#dim3 + int* wSize) { + if (shared_lib == NULL) { init(); } + void (real___hipRegisterFunction*)(void**, const void*, char*, const char*, unsigned int, uint3*, uint3*, dim3*, dim3*, int*) = dlsym(shared_lib, "__hipRegisterFunction"); + + real___hipRegisterFunction(modules, hostFunction, deviceFunction, deviceName, threadLimit, tid, bid, blockDim, gridDim, wSize); + if (VERBOSE) { fprintf(event_log, "[info] __hipRegisterFunction %p %p\n", *modules, hostFunction); } +} + +void __hipUnregisterFatBinary(void** modules) { + if (shared_lib == NULL) { init(); } + void (real___hipUnregisterFatBinary*)(void**) = dlsym(shared_lib, "__hipUnregisterFatBinary"); + + real___hipUnregisterFatBinary(modules); + if (VERBOSE) { fprintf(event_log, "[info] __hipUnregisterFatBinary %p\n", *modules); } +} \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/parse.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/parse.py new file mode 100644 index 0000000..53cfbd0 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/parse.py @@ -0,0 +1,311 @@ +"""Code Generator for Unmodified Driver Functions and Symbols + +NOTE Support CUDA / ROCm + +How it works? 
All Driver symbols are exposed with signatures like:
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);

Parsing these symbols allows us to generate a mask driver like:
```c
CUresult (*real_cuDeviceGetName)(char*, int, CUdevice) = NULL;

CUresult cuDeviceGetName(char* name, int len, CUdevice dev) {
    if (shared_lib == NULL) { init(); }
    CUresult ret = real_cuDeviceGetName(name, len, dev);
    return ret;
}
```

But CUDA symbols might be versioned slightly: e.g. cuMemAlloc now has two
symbols, cuMemAlloc (actually a macro) and cuMemAlloc_v2 (real but not in cuda.h),
so we need to check libcuda.so to compensate for the missed symbols (assuming the
signature is identical to the unversioned one).

NOTE You may see warnings about lacking symbols for functions such as
cuGL, cudbg, cuEGL, cuMem, cuProfiler, cuVDP, but they can mostly be ignored.
"""
from typing import NamedTuple, List, Tuple, Dict
import sys
import os
import subprocess

# Filled in from argv in __main__; NOTE(review): annotated `str` but initialized
# to None, so effectively Optional[str] until CLI parsing runs.
LIB_PATH: str = None
HEADER_PATH: str = None
# Output files, written next to this script:
#   unmodified.c -> generated pass-through wrappers
#   signature.c  -> real_* function-pointer declarations + init_unmodified()
UNMODIFIED_C_NAME = "unmodified.c"
SIGNATURE_C_NAME = "signature.c"

# NOTE list of modified symbols -> handled by modified.c, so excluded from code generation
MODIFIED_FUNCTIONS: Dict[str, List[str]] = {
    "cu": ["cuMemAlloc_v2", "cuMemFree_v2", "cuModuleLoadData", "cuModuleGetFunction",
           "cuKernelGetFunction", "cuLibraryGetKernel", "cuLibraryGetModule",
           "cuLibraryLoadData", "cuLaunchKernel", "cuGetProcAddress_v2", "cuGetProcAddress",
           "cuModuleLoadDataEx", "cuModuleLoad", "cuModuleLoadFatBinary", "cuLaunchKernelEx"],
    "hip": ["hipModuleLoadData", "hipModuleLoadDataEx", "hipModuleGetFunction",
            "hipModuleLaunchKernel", "hipMalloc", "hipFree", "hipModuleLoad", "hipKernelNameRef",
            "hipEventRecord", "hipGetErrorName", "hipGetErrorString", "hipApiName",
            "hipKernelNameRefByPtr", "hip_init"]  # "hip_init" is weird in hip_runtime_api.h
}

# C-code templates for the generated pass-through wrappers, one per target.
CODEGEN_TEMPLATE: Dict[str, str] = {
    "cu": """
CUresult {func_name}({param_list}) {{
    if (shared_lib == NULL) {{ init(); }}
    CUresult err = real_{func_name}({param_val_list}); // call the real
    if (VERBOSE) {{
        fprintf(event_log, "[info] {func_name} %d\\n", err);
        fflush(event_log); // block until output written for debugging
    }}
    return err;
}}""",
    "hip": """
hipError_t {func_name}({param_list}) {{
    if (shared_lib == NULL) {{ init(); }}
    hipError_t err = real_{func_name}({param_val_list}); // call the real
    if (VERBOSE) {{
        fprintf(event_log, "[info] {func_name} %d\\n", err);
        fflush(event_log); // block until output written for debugging
    }}
    return err;
}}"""
}

# Templates for the real_* function-pointer declarations in signature.c.
SIGNATURE_TEMPLATE = {
    "cu": 'CUresult (*real_{func_name})({param_list}) = NULL;',
    "hip": 'hipError_t (*real_{func_name})({param_list}) = NULL;'
}

# One dlsym line per symbol, emitted inside init_unmodified().
INIT_TEMPLATE = ' real_{func_name} = dlsym(shared_lib, "{func_name}");'

# Return-type marker used to recognize API declarations in the header.
IDENTIFIERS = {
    "cu": "CUresult CUDAAPI",
    "hip": "hipError_t"
}

class Parameter(NamedTuple):
    # A single formal parameter: C type text and variable name.
    type_name: str
    var_name: str

class Signature(NamedTuple):
    # A parsed function declaration: name plus ordered parameter list.
    func_name: str
    params: List[Parameter]

class VersionedSymbol(NamedTuple):
    # NOTE(review): declared but apparently unused in this file — candidate for removal.
    name: str
    version: str

def parse_parameter(param: str) -> Parameter:
    """Split one C parameter like `char *name` into (type, name)."""
    # Split the parameter into type and name
    param_parts = param.rsplit(' ', 1)
    if len(param_parts) == 2:
        type_name = param_parts[0].strip()
        var_name: str = param_parts[1].strip()
        if var_name.startswith("*"):  # avoid ptr * at variable side
            num_star = var_name.rfind("*") + 1
            type_name = type_name + "*" * num_star
            var_name = var_name[num_star:]
    else:
        # No space: a type with no parameter name (e.g. `void`).
        type_name = param_parts[0].strip()
        var_name = ''

    return Parameter(type_name, var_name)

def parse_function_signature(signature: str) -> Signature:
    """Parse a full C declaration string into a Signature."""
    # Remove the trailing semicolon
    signature = signature.strip().rstrip(';\n')

    # Find the opening parenthesis for parameters
    paren_index = signature.find('(')
    func_name = signature[:paren_index].strip()
    space_index = func_name.rfind(' ')
    func_name = func_name[space_index + 1:]
    # Multi-line declarations may leave a newline inside the name; strip past it.
    if "\n" in func_name:
        space_index = func_name.rfind('\n')
        func_name = func_name[space_index + 1:]

    # Extract parameters
    params_str = signature[paren_index + 1:].strip()
    params_str = params_str[:-1].strip()  # Remove closing parenthesis

    # Parse parameters
    param_list: List[Parameter] = []
    if params_str:
        # Split by commas, considering pointer types
        param_parts = []
        current_param = ''
        depth = 0

        for char in params_str:
            if char == ',' and depth == 0:
                param_parts.append(current_param.strip())
                current_param = ''
            else:
                current_param += char
                # Track angle-bracket nesting so commas inside it don't split.
                if char == '<':
                    depth += 1
                elif char == '>':
                    depth -= 1

        # Add the last parameter
        if current_param:
            param_parts.append(current_param.strip())

        for param in param_parts:
            param = param.strip()
            if param:
                param_list.append(parse_parameter(param))

    return Signature(func_name, param_list)

def parse_symbol(nm_line: str) -> str:
    """Extract the symbol name from one `nm -D` output line ('' for blank lines)."""
    if len(nm_line.strip()) != 0:
        symbol = nm_line.rsplit(" ", 1)[1]
        if "@" in symbol:  # NOTE remove version tag @
            symbol = symbol.split("@")[0]
        return symbol
    else:
        return ""

def parse_version_symbol(symbol: str) -> Tuple[str, str]:
    """Split a versioned symbol like cuMemAlloc_v2 into ('cuMemAlloc', '_v2')."""
    if symbol[0] != "_" and "_" in symbol:  # FIX __hip
        name, version = symbol.split("_", 1)
        return name, "_"+version
    else:
        return symbol, ""

def gencode(signature: Signature, template: str) -> str:
    """Render the pass-through wrapper C code for one signature."""
    param_list = []
    param_type_list = []
    param_val_list = []
    for param in signature.params:
        param_type_list.append(param.type_name)
        param_val_list.append(param.var_name)
        param_list.append(param.type_name + " " + param.var_name)
    return template.format(
        func_name = signature.func_name,
        param_list = ", ".join(param_list),
        param_type_list = ", ".join(param_type_list),
        param_val_list = ", ".join(param_val_list)
    )

def gensignature(signature: Signature, template: str) -> str:
    """Render the real_* function-pointer declaration for one signature."""
    param_list = []
    param_type_list = []
    param_val_list = []
    for param in signature.params:
        param_type_list.append(param.type_name)
param_val_list.append(param.var_name) + param_list.append(param.type_name + " " + param.var_name) + return template.format( + func_name = signature.func_name, + param_list = ", ".join(param_list) + ) + +def geninit(signature: Signature) -> str: + return INIT_TEMPLATE.format(func_name = signature.func_name) + +if __name__ == "__main__": + # parse cli param if given, usage is python parse.py CUDA_HEADER_PATH, CUDA_LIB_PATH + if len(sys.argv) >= 3: + HEADER_PATH = sys.argv[1] + LIB_PATH = sys.argv[2] + else: + print("Usage: python parse.py ") + exit(1) + + unmodified_c = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), UNMODIFIED_C_NAME), "w") + signature_c = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), SIGNATURE_C_NAME), "w") + + print(f"[INFO] use {HEADER_PATH} and {LIB_PATH}", file=sys.stderr) + + if HEADER_PATH.endswith("cuda.h") and "libcuda.so" in LIB_PATH: + target = "cu" + elif HEADER_PATH.endswith("hip_runtime_api.h") and "libamdhip64.so" in LIB_PATH: + target= "hip" + else: # can add more target for the future + raise ValueError(f"[error] {LIB_PATH} is not supported") + + signatures: List[Signature] = [] + + # parse cuda.h to extract cuda headers + with open(HEADER_PATH, "r") as header_file: + headers = header_file.readlines() + idx = 0 + start_idx, ending_idx = 0, 0 + + if target == "cu": + for idx in range(len(headers)): + # BUG FIX weird definition in cuda.h + if "#define CUDAAPI" in headers[idx]: + start_idx = idx + elif "CUDA API versioning support" in headers[idx]: + ending_idx = idx + break + elif target == "hip": + for idx in range(len(headers)): + if " * @defgroup API HIP API" in headers[idx]: + start_idx = idx + elif '} /* extern "c" */' in headers[idx]: + ending_idx = idx + break + idx = start_idx + # print(f"start: {start_idx}, end: {ending_idx}", file=sys.stderr) + identifier = IDENTIFIERS[target] + while idx < ending_idx: + if identifier in headers[idx] and "typedef" not in headers[idx]: + end_idx = idx + 1 + if 
";" in headers[idx]: # a full signature + parsed_signature = parse_function_signature(headers[idx]) + signatures.append(parsed_signature) + else: + while ";" not in headers[end_idx]: + end_idx += 1 + parsed_signature = parse_function_signature("".join(headers[idx:end_idx+1])) + signatures.append(parsed_signature) + idx = end_idx + else: + idx += 1 + + + # extract missing symbols from libcuda.so + symbols_so: List[str] = [] + result = subprocess.run(["nm", "-D", LIB_PATH], stdout=subprocess.PIPE, text=True) + so_log = result.stdout.split("\n") + for line in so_log: + symbol = parse_symbol(line) + if symbol.startswith(target): # target is also prefix of API name :) + symbols_so.append(symbol) + + # get the symbols missed in our cuda lib + parsed_symbols = {signature.func_name: signature for signature in signatures} + missed_symbols = [symbol for symbol in symbols_so if symbol not in parsed_symbols] + print(f"[INFO] Extract {len(signatures)} Symbols from {HEADER_PATH}", file=sys.stderr) + + for symbol in missed_symbols: + # try to extract symbol and version + raw_symbol_name, version = parse_version_symbol(symbol) + # check if raw_symbol in parsed_symbols + if raw_symbol_name in parsed_symbols: + # versioned symbol share the same parameter list + raw_symbol = parsed_symbols[raw_symbol_name] + signatures.append(Signature(func_name=symbol, params=raw_symbol.params)) + else: + print(f"[WARNING] can't resolve {symbol}", file=sys.stderr) + + print(f"[INFO] Resolved {len(signatures)} Symbols from {len(symbols_so)} Symbols in {LIB_PATH}", file=sys.stderr) + + print("// auto-generated by parse.py, used with modified.c", file=unmodified_c) + print("// auto-generated by parse.py, used with modified.c", file=signature_c) + inits = [] + for signature in signatures: + if signature.func_name not in MODIFIED_FUNCTIONS[target]: # REMOVE MODIFIED + code = gencode(signature, CODEGEN_TEMPLATE[target]) + print(code, file=unmodified_c) + signautre_ = gensignature(signature, 
SIGNATURE_TEMPLATE[target]) + print(signautre_, file=signature_c) + init_ = geninit(signature) + inits.append(init_) + + print("\nstatic void init_unmodified(void) {", file=signature_c) + print("\n".join(inits), file=signature_c) + print("}", file=signature_c) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/preload.c b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/preload.c new file mode 100644 index 0000000..608c2b2 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/preload.c @@ -0,0 +1,122 @@ +/** + * A customized preload shared library to redirect driver (libcuda.so) + * 1. redirect driver interaction so don't need to modify host env (/usr/lib/...) + * 2. filter out proprietary product such as cuBLAS to conform updated NVIDIA EULA + * + * @note Proprietary NVIDIA Softwares includes: + * cublas/curand/cufft/cusparse/cusolver/optix/... + * but at most case, only cuBLAS if you used PyTorch or other AIML workload + */ +#define _GNU_SOURCE +#include // for dynamic library +#include // for I/O +#include // for strcmp +#include // for backtrace and backtrace_symbols +#include // for malloc and free +#include // for timing terms + +#ifndef STACK_TRACE_SIZE +#define STACK_TRACE_SIZE 5 +#endif + +#ifndef DL_VERBOSE +#define DL_VERBOSE 0 +#endif + +// Pointer to GLIBC dlopen function, by dlsym(RTLD_NEXT, "dlopen") +static void* (*real_dlopen)(const char *filename, int flags) = NULL; + +static char* NEUTRINO_REAL_DRIVER = NULL; +static char* NEUTRINO_HOOK_DRIVER = NULL; +static char* NEUTRINO_DRIVER_NAME = NULL; + +/** + * Provides a hook on both statically or dynamically loading shared library + * by overwriting dlopen with the same signature as GLIBC dlopen + * @cite https://man7.org/linux/man-pages/man3/dlopen.3.html + * + * This will leads to 2 dlopen function in the search space of executable: + * 1. 
our dlopen as follows, will be chosen automatically as LD_PRELOAD + * 2. standard c library's dlopen, will be masked but still can be referred + * if RTLD_NEXT flag is specified + * + * This ensure FULL COVERAGE because dlopen must be linked statically to + * enable dynamic linking (via dlopen) -> a puzzle in UNIX-like OS + */ +void* dlopen(const char *filename, int flags) { + // original (GLIBC) dlopen still exists in search space + // but is less prefered as LD_PRELOAD mask it + // using dlsym with RTLD_NEXT we can extract GLIBC dlopen. + if (!real_dlopen) + real_dlopen = dlsym(RTLD_NEXT, "dlopen"); + + if (!NEUTRINO_DRIVER_NAME) { + NEUTRINO_DRIVER_NAME = getenv("NEUTRINO_DRIVER_NAME"); + // fprintf(stderr, "[info] NEUTRINO_DRIVER_NAME: %s\n", NEUTRINO_DRIVER_NAME); + } + + if (filename != NULL && (strstr(filename, NEUTRINO_DRIVER_NAME) != NULL)) { + + // Check if it's libcublas.so backtrace + // @see https://man7.org/linux/man-pages/man3/backtrace.3.html + void* array[STACK_TRACE_SIZE]; + int size = backtrace(array, STACK_TRACE_SIZE); + char** strings = backtrace_symbols(array, size); + int call_from_cublas = 0; + if (strings != NULL){ + for (int i = 0; i < size; i++) { + // we will add ALL Nvidia Propietray Product here + if (strstr(strings[i], "libcublas") != NULL) { + call_from_cublas = 1; + break; + } + } + } + free(strings); + void* ptr; + if (call_from_cublas) { + if (NEUTRINO_REAL_DRIVER == NULL) { + NEUTRINO_REAL_DRIVER = getenv("NEUTRINO_REAL_DRIVER"); + if (NEUTRINO_REAL_DRIVER == NULL) { // fault + fprintf(stderr, "[error] NEUTRINO_REAL_DRIVER not set\n"); + exit(1); + } + } + ptr = real_dlopen(NEUTRINO_REAL_DRIVER, flags); + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + long long time = ts.tv_nsec + ts.tv_sec * 1e9; + // printf("[info] %lld cublas use real: %s %p %d\n", time, NEUTRINO_REAL_DRIVER, ptr, flags); + fflush(stdout); + } else { + char* NEUTRINO_HOOK_DRIVER = getenv("NEUTRINO_HOOK_DRIVER"); + if (NEUTRINO_HOOK_DRIVER == NULL) 
{ + fprintf(stderr, "[error] NEUTRINO_HOOK_DRIVER not set\n"); + ptr = real_dlopen(filename, flags); // try to backup + } + // @note fix the multiple initialization bug + ptr = real_dlopen(NEUTRINO_HOOK_DRIVER, flags | RTLD_GLOBAL); + // fprintf(stderr, "[dlopen] %s : %d, %p", NEUTRINO_HOOK_DRIVER, flags | RTLD_GLOBAL, ptr); + if (DL_VERBOSE) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + long long time = ts.tv_nsec + ts.tv_sec * 1e9; + printf("[info] %lld use hooked: %s %p %d\n", time, NEUTRINO_HOOK_DRIVER, ptr, flags); + fflush(stdout); + } + } + return ptr; + } else { // not interested, just let them go via loading the correct + // Call the original dlopen + void* ptr = real_dlopen(filename, flags); + // Print the name of the module being loaded + if (DL_VERBOSE) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + long long time = ts.tv_nsec + ts.tv_sec * 1e9; + printf("[info] %lld Loading: %s %p %d\n", time, filename, ptr, flags); + fflush(stdout); + } + return ptr; + } +} \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/sha1.h b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/sha1.h new file mode 100644 index 0000000..a9e174b --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/sha1.h @@ -0,0 +1,165 @@ +/** + * SHA-1 Algorithm used to place kernel with long long long name + * exceeds file system limitation on 255 character + * + * @cite https://github.com/B-Con/crypto-algorithms/blob/master/sha1.h + */ + + #ifndef SHA1_H + #define SHA1_H + + /*************************** HEADER FILES ***************************/ + #include + #include + #include + + /****************************** MACROS ******************************/ + #define SHA1_BLOCK_SIZE 20 // SHA1 outputs a 20 byte digest + #define ROTLEFT(a, b) ((a << b) | (a >> (32 - b))) + + /**************************** DATA TYPES 
****************************/
 typedef unsigned char BYTE;             // 8-bit byte
 typedef unsigned int  WORD;             // 32-bit word, change to "long" for 16-bit machines

 typedef struct {
     BYTE data[64];                      // pending (not yet transformed) input block
     WORD datalen;                       // bytes currently buffered in data[]
     unsigned long long bitlen;          // total message length in bits
     WORD state[5];                      // running hash state h0..h4
     WORD k[4];                          // round constants
 } SHA1_CTX;

 /*********************** FUNCTION DECLARATIONS **********************/
 // NOTE(review): these are full definitions (not declarations) without `static`;
 // including this header from more than one translation unit would produce
 // duplicate-symbol link errors — confirm the project includes it only once.

 // Process one 64-byte block of input, updating ctx->state.
 void sha1_transform(SHA1_CTX *ctx, const BYTE data[])
 {
     WORD a, b, c, d, e, i, j, t, m[80];

     // Message schedule: 16 big-endian words from the block, expanded to 80.
     for (i = 0, j = 0; i < 16; ++i, j += 4)
         m[i] = (data[j] << 24) + (data[j + 1] << 16) + (data[j + 2] << 8) + (data[j + 3]);
     for ( ; i < 80; ++i) {
         m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]);
         m[i] = (m[i] << 1) | (m[i] >> 31);  // rotate left by 1
     }

     a = ctx->state[0];
     b = ctx->state[1];
     c = ctx->state[2];
     d = ctx->state[3];
     e = ctx->state[4];

     // Four rounds of 20 iterations, each with its own f-function and constant.
     for (i = 0; i < 20; ++i) {
         t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i];
         e = d;
         d = c;
         c = ROTLEFT(b, 30);
         b = a;
         a = t;
     }
     for ( ; i < 40; ++i) {
         t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i];
         e = d;
         d = c;
         c = ROTLEFT(b, 30);
         b = a;
         a = t;
     }
     for ( ; i < 60; ++i) {
         t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d)) + e + ctx->k[2] + m[i];
         e = d;
         d = c;
         c = ROTLEFT(b, 30);
         b = a;
         a = t;
     }
     for ( ; i < 80; ++i) {
         t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i];
         e = d;
         d = c;
         c = ROTLEFT(b, 30);
         b = a;
         a = t;
     }

     ctx->state[0] += a;
     ctx->state[1] += b;
     ctx->state[2] += c;
     ctx->state[3] += d;
     ctx->state[4] += e;
 }

 // Initialize the context with the standard SHA-1 IV and round constants.
 void sha1_init(SHA1_CTX *ctx)
 {
     ctx->datalen = 0;
     ctx->bitlen = 0;
     ctx->state[0] = 0x67452301;
     ctx->state[1] = 0xEFCDAB89;
     ctx->state[2] = 0x98BADCFE;
     ctx->state[3] = 0x10325476;
     ctx->state[4] = 0xc3d2e1f0;
     ctx->k[0] = 0x5a827999;
     ctx->k[1] = 0x6ed9eba1;
     ctx->k[2] = 0x8f1bbcdc;
     ctx->k[3] = 0xca62c1d6;
 }

 // Absorb len bytes, transforming whenever a full 64-byte block accumulates.
 void sha1_update(SHA1_CTX *ctx, const BYTE data[], size_t len)
 {
     size_t i;

     for (i = 0; i < len; ++i) {
         ctx->data[ctx->datalen] = data[i];
         ctx->datalen++;
         if (ctx->datalen == 64) {
             sha1_transform(ctx, ctx->data);
             ctx->bitlen += 512;          // one full block = 512 bits
             ctx->datalen = 0;
         }
     }
 }

 // Pad the remaining input, append the bit length, and emit the 20-byte digest.
 void sha1_final(SHA1_CTX *ctx, BYTE hash[])
 {
     WORD i;

     i = ctx->datalen;

     // Pad whatever data is left in the buffer.
     if (ctx->datalen < 56) {
         ctx->data[i++] = 0x80;           // mandatory 1-bit marker
         while (i < 56)
             ctx->data[i++] = 0x00;
     }
     else {
         // Not enough room for the 8-byte length: pad, transform, start a fresh block.
         ctx->data[i++] = 0x80;
         while (i < 64)
             ctx->data[i++] = 0x00;
         sha1_transform(ctx, ctx->data);
         memset(ctx->data, 0, 56);
     }

     // Append to the padding the total message's length in bits and transform.
     ctx->bitlen += ctx->datalen * 8;
     ctx->data[63] = ctx->bitlen;
     ctx->data[62] = ctx->bitlen >> 8;
     ctx->data[61] = ctx->bitlen >> 16;
     ctx->data[60] = ctx->bitlen >> 24;
     ctx->data[59] = ctx->bitlen >> 32;
     ctx->data[58] = ctx->bitlen >> 40;
     ctx->data[57] = ctx->bitlen >> 48;
     ctx->data[56] = ctx->bitlen >> 56;
     sha1_transform(ctx, ctx->data);

     // Since this implementation uses little endian byte ordering and MD uses big endian,
     // reverse all the bytes when copying the final state to the output hash.
     for (i = 0; i < 4; ++i) {
         hash[i]      = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
         hash[i + 4]  = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
         hash[i + 8]  = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
         hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
         hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
     }
 }


 #endif
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/uthash.h b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/uthash.h
new file mode 100644
index 0000000..cf889f9
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/uthash.h
@@ -0,0 +1,1144 @@
/**
 * UTHash used for JIT Code Cache in Neutrino
 */

/*
Copyright (c) 2003-2022, Troy D.
Hanson https://troydhanson.github.io/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.3.0 + +#include /* memcmp, memset, strlen */ +#include /* ptrdiff_t */ +#include /* exit */ + +#if defined(HASH_DEFINE_OWN_STDINT) && HASH_DEFINE_OWN_STDINT +/* This codepath is provided for backward compatibility, but I plan to remove it. */ +#warning "HASH_DEFINE_OWN_STDINT is deprecated; please use HASH_NO_STDINT instead" +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#elif defined(HASH_NO_STDINT) && HASH_NO_STDINT +#else +#include /* uint8_t, uint32_t */ +#endif + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. 
*/ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__MCST__) /* Elbrus C Compiler */ +#define DECLTYPE(x) (__typeof(x)) +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while (0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while (0) +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a,n) memset(a,'\0',n) +#endif +#ifndef uthash_strlen +#define uthash_strlen(s) strlen(s) +#endif + +#ifndef HASH_FUNCTION +#define HASH_FUNCTION(keyptr,keylen,hashv) HASH_JEN(keyptr, keylen, hashv) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a,b,n) memcmp(a,b,n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) do {} while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) do { (oomed) = 1; } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result in lost memory, hash tables are unusable */ + 
+#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5U /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhp */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl,elp) ((UT_hash_handle*)(void*)(((char*)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ +do { \ + struct UT_hash_handle *_hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ +} while (0) + +#define HASH_VALUE(keyptr,keylen,hashv) \ +do { \ + HASH_FUNCTION(keyptr, keylen, hashv); \ +} while (0) + +#define HASH_FIND_BYHASHVALUE(hh,head,keyptr,keylen,hashval,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval)) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], keyptr, keylen, hashval, out); \ + } \ + } \ +} while (0) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8UL) + (((HASH_BLOOM_BITLEN%8UL)!=0UL) ? 
1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl,oomed) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ +} while (0) + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0) + +#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8U] |= (1U << ((idx)%8U))) +#define HASH_BLOOM_BITTEST(bv,idx) ((bv[(idx)/8U] & (1U << ((idx)%8U))) != 0) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl,oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) 1 +#define HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh,head,oomed) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } else { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + 
HASH_BLOOM_MAKE((head)->hh.tbl, oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } \ + ) \ + } \ + } \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,replaced,cmpfcn) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn); \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add,replaced) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add); \ +} while (0) + +#define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced); \ +} while (0) + +#define HASH_REPLACE_INORDER(hh,head,fieldname,keylen_in,add,replaced,cmpfcn) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced, cmpfcn); \ +} while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ +do { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + (head)->hh.tbl->tail = &((add)->hh); \ +} while (0) + +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + do { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) { \ + break; \ + } \ + } 
while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + char *_hs_saved_head = (char*)(head); \ + do { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + if (!(oomed)) { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + if (oomed) { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } else { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } else { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ +} while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ +} while (0) + +#endif + + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh,head,keyptr,keylen_in,hashval,add,cmpfcn) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + 
IF_HASH_NONFATAL_OOM( if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + void *_hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } else { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } else { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ +} while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh,head,keyptr,keylen_in,add,cmpfcn) \ +do { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, _hs_hashv, add, cmpfcn); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn) + +#define HASH_ADD_INORDER(hh,head,fieldname,keylen_in,add,cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh,head,keyptr,keylen_in,hashval,add) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (const void*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM( if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, 
"HASH_ADD_KEYPTR_BYHASHVALUE"); \ +} while (0) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv,num_bkts,bkt) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1U)); \ +} while (0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. 
+ */ +#define HASH_DELETE(hh,head,delptr) \ + HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh,head,delptrhh) \ +do { \ + const struct UT_hash_handle *_hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } else { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = _hd_hh_del->next; \ + } else { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ +} while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ +do { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ +} while (0) +#define HASH_ADD_STR(head,strfield,add) \ +do { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ +} while (0) +#define HASH_REPLACE_STR(head,strfield,add,replaced) \ +do { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ +} while (0) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) 
\ + HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_REPLACE_INT(head,intfield,add,replaced) \ + HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_REPLACE_PTR(head,ptrfield,add,replaced) \ + HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) +#define HASH_DEL(head,delptr) \ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#include <stdio.h> /* fprintf, stderr */ +#define HASH_OOPS(...) do { fprintf(stderr, __VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head,where) \ +do { \ + struct UT_hash_handle *_thh; \ + if (head) { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char *_prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", \ + (where), (void*)_thh->hh_prev, (void*)_prev); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", \ + (where), (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", \ + (where), (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev != (char*)_thh->prev) { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", \ + (where), (void*)_thh->prev,
(void*)_prev); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = (_thh->next ? HH_FROM_ELMT((head)->hh.tbl, _thh->next) : NULL); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid app item count %u, actual %u\n", \ + (where), (head)->hh.tbl->num_items, _count); \ + } \ + } \ +} while (0) +#else +#define HASH_FSCK(hh,head,where) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include <unistd.h> to get the prototype for write(2). */ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ +do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, (unsigned long)fieldlen); \ +} while (0) +#else +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Note (x<<5+x)=x*33. */ +#define HASH_BER(key,keylen,hashv) \ +do { \ + unsigned _hb_keylen = (unsigned)keylen; \ + const unsigned char *_hb_key = (const unsigned char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen-- != 0U) { \ + (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; \ + } \ +} while (0) + + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx + * (archive link: https://archive.is/Ivcan ) + */ +#define HASH_SAX(key,keylen,hashv) \ +do { \ + unsigned _sx_i; \ + const unsigned char *_hs_key = (const unsigned char*)(key); \ + hashv = 0; \ + for (_sx_i=0; _sx_i < keylen; _sx_i++) { \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + } \ +} while (0) +/* FNV-1a variation */ +#define HASH_FNV(key,keylen,hashv) \ +do { \ + unsigned _fn_i; \ + const unsigned char *_hf_key = (const unsigned char*)(key); \ + (hashv) = 2166136261U; \ + for (_fn_i=0; _fn_i < keylen; _fn_i++) { \ + hashv = hashv ^
_hf_key[_fn_i]; \ + hashv = hashv * 16777619U; \ + } \ +} while (0) + +#define HASH_OAT(key,keylen,hashv) \ +do { \ + unsigned _ho_i; \ + const unsigned char *_ho_key=(const unsigned char*)(key); \ + hashv = 0; \ + for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ +} while (0) + +#define HASH_JEN_MIX(a,b,c) \ +do { \ + a -= b; a -= c; a ^= ( c >> 13 ); \ + b -= c; b -= a; b ^= ( a << 8 ); \ + c -= a; c -= b; c ^= ( b >> 13 ); \ + a -= b; a -= c; a ^= ( c >> 12 ); \ + b -= c; b -= a; b ^= ( a << 16 ); \ + c -= a; c -= b; c ^= ( b >> 5 ); \ + a -= b; a -= c; a ^= ( c >> 3 ); \ + b -= c; b -= a; b ^= ( a << 10 ); \ + c -= a; c -= b; c ^= ( b >> 15 ); \ +} while (0) + +#define HASH_JEN(key,keylen,hashv) \ +do { \ + unsigned _hj_i,_hj_j,_hj_k; \ + unsigned const char *_hj_key=(unsigned const char*)(key); \ + hashv = 0xfeedbeefu; \ + _hj_i = _hj_j = 0x9e3779b9u; \ + _hj_k = (unsigned)(keylen); \ + while (_hj_k >= 12U) { \ + _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ + + ( (unsigned)_hj_key[2] << 16 ) \ + + ( (unsigned)_hj_key[3] << 24 ) ); \ + _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ + + ( (unsigned)_hj_key[6] << 16 ) \ + + ( (unsigned)_hj_key[7] << 24 ) ); \ + hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ + + ( (unsigned)_hj_key[10] << 16 ) \ + + ( (unsigned)_hj_key[11] << 24 ) ); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12U; \ + } \ + hashv += (unsigned)(keylen); \ + switch ( _hj_k ) { \ + case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); /* FALLTHROUGH */ \ + case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); /* FALLTHROUGH */ \ + case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); /* FALLTHROUGH */ \ + case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); /* FALLTHROUGH */ \ + case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); /* FALLTHROUGH */ 
\ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); /* FALLTHROUGH */ \ + case 5: _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); /* FALLTHROUGH */ \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); /* FALLTHROUGH */ \ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); /* FALLTHROUGH */ \ + case 1: _hj_i += _hj_key[0]; /* FALLTHROUGH */ \ + default: ; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ +} while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,hashv) \ +do { \ + unsigned const char *_sfh_key=(unsigned const char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0U; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits (_sfh_key+2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + break; \ + default: ; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= 
hashv << 25; \ + hashv += hashv >> 6; \ +} while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,hashval,out) \ +do { \ + if ((head).hh_head != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } else { \ + (out) = NULL; \ + } \ + while ((out) != NULL) { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) { \ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } else { \ + (out) = NULL; \ + } \ + } \ +} while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head,hh,addhh,oomed) \ +do { \ + UT_hash_bucket *_ha_head = &(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) \ + && !(addhh)->tbl->noexpand) { \ + HASH_EXPAND_BUCKETS(addhh,(addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + HASH_DEL_IN_BKT(head,addhh); \ + } \ + ) \ + } \ +} while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head,delhh) \ +do { \ + UT_hash_bucket *_hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ +} while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. 
Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. + * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(hh,tbl,oomed) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + if (!_he_new_buckets) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero(_he_new_buckets, \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets+1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets*2U)-1U)) != 0U) ? 
1U : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) { \ + _he_thh = (tbl)->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh != NULL) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) ? \ + ((tbl)->ineff_expands+1U) : 0U; \ + if ((tbl)->ineff_expands > 1U) { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ +} while (0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. 
*/ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + if (_hs_q == NULL) { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || ((_hs_qsize != 0U) && (_hs_q != NULL))) { \ + if (_hs_psize == 0U) { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } else if ((_hs_qsize == 0U) || (_hs_q == NULL)) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else if ((cmpfcn( \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_q)) \ + )) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail != NULL ) { \ + _hs_tail->next = ((_hs_e != NULL) ? \ + ELMT_FROM_HH((head)->hh.tbl, _hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) { \ + _hs_e->prev = ((_hs_tail != NULL) ? 
\ + ELMT_FROM_HH((head)->hh.tbl, _hs_tail) : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ +} while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary hash + * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if ((src) != NULL) { \ + for (_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh != NULL; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + IF_HASH_NONFATAL_OOM( int _hs_oomed = 0; ) \ + _dst_hh = (UT_hash_handle*)(void*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + } \ + ) \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + } \ + ) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if ((head) != NULL) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ +} while (0) + +#define HASH_OVERHEAD(hh,head) \ + (((head) != NULL) ? ( \ + (size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + \ + (HASH_BLOOM_BYTELEN))) : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((*(char**)(&(tmp)))=(char*)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((*(char**)(&(tmp)))=(char*)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#else +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((tmp)=DECLTYPE(el)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((tmp)=DECLTYPE(el)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head != NULL)?((head)->hh.tbl->num_items):0U) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. 
(If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. + */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. 
When expansion is inhibited + * the hash will still work, albeit no longer in constant time. */ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + uint8_t bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + const void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched.py new file mode 100644 index 0000000..4a328a9 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched.py @@ -0,0 +1,25 @@ +from neutrino import probe, Map +import neutrino.language as nl + +CALLBACK = "block_sched_callback.py" # for trace analysis + +# declare maps for persistence +@Map(level="warp", type="array", size=16, cap=1) +class block_sched: + start: nl.u64 + elapsed: nl.u32 + cuid: nl.u32 + +# declare probe registers shared across probes +start: nl.u64 = 0 # starting clock +elapsed: nl.u64 = 0 # elapsed time, initialized to 0 + +# define probes with decorator +@probe(pos="kernel", 
level="warp", before=True) +def thread_start(): + start = nl.clock() + +@probe(pos="kernel", level="warp") +def thread_end(): + elapsed = nl.clock() - start + block_sched.save(start, elapsed, nl.cuid()) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched_callback.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched_callback.py new file mode 100644 index 0000000..d66e16a --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/block_sched_callback.py @@ -0,0 +1,74 @@ +# Neutrino Auto-Generated Code for Trace Reading +import struct +from typing import NamedTuple, List, Tuple, Dict +from neutrino import TraceHeader, TraceSection + + +class block_sched(NamedTuple): + start: int + elapsed: int + cuid: int + + +def parse(path: str) -> Tuple[TraceHeader, List[TraceSection], Dict[str, List[List[NamedTuple]]]]: + with open(path, "rb") as f: + header: TraceHeader = TraceHeader(*struct.unpack("iiiiiiii", f.read(32))) + sections: List[TraceSection] = [] + for _ in range(header.numProbes): + sections.append(TraceSection(*struct.unpack("IIQ", f.read(16)))) + gridSize = header.gridDimX * header.gridDimY * header.gridDimZ + blockSize = header.blockDimX * header.blockDimY * header.blockDimZ + records: Dict[str, List[List[NamedTuple]]] = dict() + + # Read block_sched + records["block_sched"] = [] + f.seek(sections[0].offset) + for i in range(gridSize): + records["block_sched"].append([]) + for j in range(blockSize // sections[0].warpDiv): + records["block_sched"][-1].append([]) + for k in range(sections[0].size // 16): + records["block_sched"][i][j].append(block_sched(*struct.unpack("qII", f.read(16)))) + + return header, sections, records +# END of Neutrino Auto-Generated Code for Trace Reading +import sys +import numpy as np +header, sections, records_map = parse(sys.argv[1]) # filled by path to trace + +records = 
records_map["block_sched"] + +unique_sms = set() +for block in records: + unique_sms.add(block[0][0].cuid) + +sm_timelines = [] +for _ in range(len(unique_sms)): + sm_timelines.append([]) +sched_times = [0.0] * len(unique_sms) +work_times = [0.0] * len(unique_sms) + +for cur in records: + # print(sm_timelines[cur[0].cuid]) + sched_out = False + cuid = cur[0][0].cuid + if len(sm_timelines[cuid]) > 0: + for block in sm_timelines[cuid]: + if block.start + block.elapsed <= cur[0][0].start: + # if cur[0].lstart - (block.lstart + block.elapse) < 100000: + # print(cur[0], block) + sched_times[cuid] += cur[0][0].start - (block.start + block.elapsed) + sm_timelines[cuid].remove(block) + sm_timelines[cuid].append(cur[0][0]) + work_times[cuid] += cur[0][0].elapsed + sched_out = True + break + if not sched_out: + sm_timelines[cuid].append(cur[0][0]) + work_times[cuid] += cur[0][0].elapsed + break + else: + sm_timelines[cuid].append(cur[0][0]) + work_times[cuid] += cur[0][0].elapsed + +print(f"No.block:{header.gridDimX * header.gridDimY * header.gridDimZ} Running:{int(np.array(work_times).mean())} Scheduling:{int(np.array(sched_times).mean())}(cycle)") diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat new file mode 100644 index 0000000..e34a2a8 Binary files /dev/null and b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat differ diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.cc b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.cc new file mode 100644 index 0000000..86b6dac --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.cc @@ -0,0 +1,128 @@ +/** + * A Faster C++ STL Based Sparsifying + */ + +#include +#include +#include +#include +#include +#include +#include +#include 
+#include + +// Mask out last 16bit of 64bit address -> 16MB Page Size +#define PAGE_MASK 0xFFFFFFFFFFFF0000 + +typedef struct { + // basic launch configuration + uint32_t gridDimX; + uint32_t gridDimY; + uint32_t gridDimZ; + uint32_t blockDimX; + uint32_t blockDimY; + uint32_t blockDimZ; + uint32_t sharedMemBytes; + // all above from CUDA/ROCm launch configuration + uint32_t numProbes; // number of traces exposed + // followed by an array of trace_section_t +} trace_header_t; + +// @todo add a placeholder for probe level, aka warp/thread +typedef struct { + uint32_t size; // size of record per thread/warp in byte + uint32_t warpDiv; // warpSize for warp-level, 1 for thread-level + uint64_t offset; // offset for fseek +} trace_section_t; + +typedef struct { + uint64_t clock; + uint64_t addrs; +} dmat_t; + +int main(int argc, char* argv[]) { + if (argc < 3) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + FILE* inputf = fopen(argv[1], "rb"); + if (inputf == NULL) { + fprintf(stderr, "can't open input %s \n", argv[1]); + return EXIT_FAILURE; + } + + FILE* outf = fopen(argv[2], "wb"); + if (outf == NULL) { + fprintf(stderr, "can't open output %s \n", argv[2]); + return EXIT_FAILURE; + } + + trace_header_t header; + size_t elements_read; + elements_read = fread(&header, sizeof(header), 1, inputf); + + uint32_t gridSize = header.gridDimX * header.gridDimY * header.gridDimZ; + uint32_t blockSize = header.blockDimX * header.blockDimY * header.blockDimZ; + + trace_section_t section; + elements_read = fread(&section, sizeof(section), 1, inputf); + + uint64_t size = section.size; + uint64_t offset = section.offset; + + fprintf(stderr, "[info] size: %lu, gridSize: %u, blockSize: %u, offset %lu, numProbes: %u\n", size, gridSize, blockSize, offset, header.numProbes); + + // use fseek to locate the section starting point + fseek(inputf, offset, SEEK_SET); + + // allocate buffer size to contain the record, here we know it's uint64_t + void* content =
(void*) malloc(size * gridSize * blockSize); + elements_read = fread(content, size * gridSize * blockSize, 1, inputf); + + // I am SORRY I have to use C++ Standard Template Library Containers + // page_reference_map := time -> page -> count + std::unordered_map> page_reference_map; + std::unordered_set pages; + + uint64_t max_clock = 0; + for (int blockIdx = 0; blockIdx < gridSize; blockIdx++) { + for (int threadIdx = 0; threadIdx < blockSize; threadIdx++) { + // Here we know every record takes 16 bytes + for (int recordIdx = 0; recordIdx < (size / 16); recordIdx++) { + dmat_t record = *(dmat_t*)(content); + if (record.clock != ~0) { // valid record + max_clock = (record.clock > max_clock) ? record.clock : max_clock; + uint64_t page = record.addrs & PAGE_MASK; + page_reference_map[record.clock][page]++; // accumulate the offset + pages.insert(page); + } + content += sizeof(dmat_t); // anyway offset by 16 bytes + } + } + } + + // now let's dump it to disk + size_t num_clocks = page_reference_map.size(), num_pages = pages.size(); + + fprintf(stderr, "\n[info] num_pages: %lu, num_clocks: %lu, max_clock: %lu\n", num_pages, num_clocks, max_clock); + + fwrite(&num_pages, sizeof(num_pages), 1, outf); + fwrite(&num_clocks, sizeof(num_clocks), 1, outf); + + std::vector page_vec(pages.begin(), pages.end()); // set -> vector + fwrite(page_vec.data(), sizeof(uint64_t), num_pages, outf); + + for (const auto& [clock, pages_clock] : page_reference_map) { + fwrite(&clock, sizeof(clock), 1, outf); + uint64_t size = pages_clock.size(); + fwrite(&size, sizeof(size), 1, outf); + for (const auto& [page, count] : pages_clock) { + fwrite(&page, sizeof(page), 1, outf); + fwrite(&count, sizeof(count), 1, outf); + } + } + fclose(outf); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.py new file mode 
100644 index 0000000..ea023ef --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat.py @@ -0,0 +1,22 @@ +from neutrino import probe, Map +import neutrino.language as nl + +CALLBACK = "dmat_callback.py" + +@Map(level="thread", type="array", size=16, cap="dynamic") +class DMAT: + clock: nl.u64 + addr: nl.u64 + +start: nl.u64 = 0 +mem_clock: nl.u64 = 0 + +# define probes with decorator +@probe(pos="kernel", level="thread", before=True) +def thread_start(): + start = nl.clock() + +@probe(pos="ld.global:st.global:cp.async.cg:cp.async.ca", level="thread") +def memory_access(): + mem_clock = nl.clock() - start + DMAT.save(mem_clock, nl.addr) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat_callback.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat_callback.py new file mode 100644 index 0000000..2efd80a --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/dmat_callback.py @@ -0,0 +1,244 @@ +"""Analyze the DMAT Output""" +import os +import sys +import subprocess +import struct +from typing import List, Dict, Tuple, Optional +from dataclasses import dataclass + +try: + import numpy as np + import matplotlib.pyplot as plt + from matplotlib.colors import ListedColormap, hex2color + import matplotlib.patches as mpatches +except: + import pip + pip.main(["install", "numpy"]) + import numpy as np + pip.main(["install", "matplotlib"]) + import matplotlib.pyplot as plt + from matplotlib.colors import ListedColormap, hex2color + import matplotlib.patches as mpatches + +# configure the x:y ratio and their scaling +Y = 10 +X = 16 +GRIDX, GRIDY = X * 10, Y * 10 +DPI = 200 + +def sparsify(path: str) -> str: + """Sparsify the raw trace to page reference map format + NOTE This involves calling external C++ program named `dmat.cc`""" + # First try to locate the .cc, it shall be 
in the same folder as this script + out_path = path[:path.index(".bin")] + ".dmat" + if os.path.exists(out_path): + print(f"[note] {out_path} exists") + return out_path + CURDIR = os.path.dirname(os.path.realpath(__file__)) + contents = os.listdir(CURDIR) + # Built it with g++ if it's not built + cmd = ["g++", os.path.join(CURDIR, "dmat.cc"), "-o", os.path.join(CURDIR, "dmat"), + "-O3", "-std=c++17"] + # print(" ".join(cmd)) + if not "dmat" in contents: + subprocess.check_call(cmd) + # Now call it to apply the C++ program + subprocess.check_call([os.path.join(CURDIR, "dmat"), path, out_path]) + return out_path + +@dataclass +class Param: + """We use shape as str to facilitate hash""" + ptr: int + size: Optional[int] = 0 + shape: Optional[str] = "" + name: Optional[str] = "" + +def read(path: str) -> List[Param]: + """Read the metadata """ + with open(os.path.join(os.path.dirname(os.path.dirname(path)), "event.log"), "r", encoding='utf-8', errors='ignore') as f: + event_logs = f.read().split("\n") + # try to find the last + raw_params = set() + for line in event_logs[::-1]: + # NOTE remove jit.py because Triton will implicitly call many funcs + if "[exec]" in line and "param" in line: + tmp = line.split(" ")[:-1] + for param in tmp[3:]: + raw_params.add(int(param, base=16)) + break # JUST THE LAST RECORD + with open(os.path.join(os.path.dirname(os.path.dirname(path)), "tensor.trace"), "r") as f: + tensor_traces = f.read().split("\n") + found = set() + params: List[Param] = [] + for line in tensor_traces[::-1]: + if line.startswith("[call]") or line.startswith("[ret]"): + record = line.split(" ") # split by 2 spaces + ptr=int(record[4]) + if ptr not in found: + params.append(Param( + ptr = int(record[4]), + size = int(record[3]), + shape = record[2], + name = record[5] + )) + found.add(ptr) + return params + +# use our own colormap -> support up to 6 level and 6 colors +colors = [ + ListedColormap([(0, 0, 0, 0), hex2color("#ccddf7"), hex2color("#99baef"), 
hex2color("#6698e6"), hex2color("#3375de"), hex2color("#0053d6")]), # blue + ListedColormap([(0, 0, 0, 0), hex2color("#e6d5f9"), hex2color("#cdaaf3"), hex2color("#b380ed"), hex2color("#9a55e7"), hex2color("#812be1")]), # purple + ListedColormap([(0, 0, 0, 0), hex2color("#ccf1cc"), hex2color("#99e499"), hex2color("#66d666"), hex2color("#33c933"), hex2color("#00bb00")]), # green + ListedColormap([(0, 0, 0, 0), hex2color("#f0d1cd"), hex2color("#e1a39b"), hex2color("#d17469"), hex2color("#c24637"), hex2color("#b31805")]), # red + ListedColormap([(0, 0, 0, 0), hex2color("#f0e0d6"), hex2color("#e0c1ad"), hex2color("#d1a383"), hex2color("#c1845a"), hex2color("#b26531")]), # yellow + # ListedColormap(), # +] + +def plot(path: str, params: List[Param]): + """Draw the DMAT Plot""" + unique_pages: List[int] = [] + page_reference_map: Dict[int, Dict[int, int]] = dict() + max_clock: int = 0 + with open(path, "rb") as f: + num_pages, num_clocks = struct.unpack("QQ", f.read(16)) + # print(num_pages, num_clocks) + for _ in range(num_pages): + unique_pages.append(struct.unpack("Q", f.read(8))[0]) + for _ in range(num_clocks): + clock = struct.unpack("Q", f.read(8))[0] + max_clock = max(clock, max_clock) + size = struct.unpack("Q", f.read(8))[0] + page_reference_map[clock] = dict() + for _ in range(size): + data = f.read(12) + if len(data) == 12: + page, count = struct.unpack("QI", data) + page_reference_map[clock][page] = count + + unique_pages = sorted(unique_pages) + # print(unique_pages) + + # print(unique_pages) + # now pages are sorted ascendingly -> distinguish into groups + page_group_start = [unique_pages[0]] + page_group_sizes = [] + current_size = 1 + for i in range(1, len(unique_pages)): + if unique_pages[i] - unique_pages[i - 1] > 2 ** 16: # new group + page_group_sizes.append(current_size) + page_group_start.append(unique_pages[i]) + current_size = 1 + else: # prev group + current_size += 1 + page_group_sizes.append(current_size) + # print(page_group_start) + # 
print(page_group_sizes) + # group name is the starting address and + page_to_id = {page: i for i, page in enumerate(unique_pages)} + + # need to have a grid + page_to_gridy = len(unique_pages) // (GRIDY - 1) + clock_to_gridx = max_clock // (GRIDX - 1) + + # Flatten the record + #clocks: List[int] = [] + #page_ids: List[int] = [] + counts: List[int] = [] + param_matches: Dict[int, Tuple[List[int], List[int], List[int], str, str]] = {i: ([], [], [], p.shape, p.name) for i, p in enumerate(params)} # page_id: [(clock, param_index, shape)] + + # NOTE Fix: Add a unmatched group + param_matches[len(param_matches)] = ([], [], [], "Unknown", "Unknown") + + # group_matches: Dict[int, Tuple[List[int], List[int], List[int]]] = {i: ([], [], []) for i in range(len(page_group_start))} # page_id -> group_start, group_size + for clock, items in page_reference_map.items(): + if clock < 5000000: # a useless filter for safety + for page, count in items.items(): + page_id = page_to_id[page] + #clocks.append(clock) + #page_ids.append(page_id) + #counts.append(count) + matched = False + for i, param in enumerate(params): + if param.size > 0 and param.ptr <= page <= param.ptr + param.size: # size is raw bytes + param_matches[i][0].append(clock) + param_matches[i][1].append(page_id) + param_matches[i][2].append(count) + matched = True + if not matched: + param_matches[len(param_matches) - 1][0].append(clock) + param_matches[len(param_matches) - 1][1].append(page_id) + param_matches[len(param_matches) - 1][2].append(count) + else: + print(f"Find Weird Data {clock}", file=sys.stderr) # might be bugs + + # print(param_matches) + + # filter out unused group + plotted_matches: List[Tuple[List[int], List[int], List[int], str, str]] = [] + for match in param_matches.values(): + if len(match[0]) > 0: + plotted_matches.append(match) + + # print(len(plotted_matches)) + dist: List[np.ndarray] = [] + + # Create the figure and axis + fig, ax = plt.subplots(figsize=(X, Y), dpi=DPI) + + n = 
min(len(plotted_matches), 5) + + sys.stdout.write('\r') + sys.stdout.write(f"Ploting Tensors: [{' '*5*n}] 0/{n}") + sys.stdout.flush() + + for i in range(n): # at most 5 now + tmp = np.zeros((Y * 10 + 1, X * 10 + 1), dtype=np.int32) + for clock, page, count in zip(plotted_matches[i][0], plotted_matches[i][1], plotted_matches[i][2]): + tmp[Y * 10 - page // page_to_gridy, clock // clock_to_gridx] += count + # max_ = tmp.max() + dist.append(tmp.flatten()) + boundaries = np.percentile(tmp[tmp != 0], [20, 40, 60, 80]) + boundaries = np.concatenate(([0], boundaries)) + # cut into five region based on percentile + temp = np.zeros_like(tmp) + for j in range(len(boundaries)): + temp[boundaries[j] < tmp] = j + 1 + ax.imshow(temp, cmap=colors[i]) + + sys.stdout.write('\r') + sys.stdout.write(f"Ploting Tensors: [{'='*5*(i+1)}{' '*5*(n-(i+1))}] {i+1}/{n}") + sys.stdout.flush() + + # Set the ticks and labels + ax.set_xticks(np.arange(0, X * 10, 10)) + ax.set_xticklabels([f'{int(max_clock / X * i)}' for i in range(X)], rotation=45) + ax.set_yticks(np.arange(0, Y * 10, 10)) + ax.set_yticklabels([f'{int(len(unique_pages) / Y * i)}' for i in range(Y, 0, -1)]) + + # Manually draw grid lines + for x in range(X * 10 + 1): # Vertical lines + ax.axvline(x - 0.5, color='lightgrey', linewidth=0.4) + + for y in range(Y * 10 + 1): # Horizontal lines + ax.axhline(y - 0.5, color='lightgrey', linewidth=0.4) + + # Create handles for the legend + handles = [mpatches.Patch(color=colors[i].colors[-1], label=f'Ptr {i}: {plotted_matches[i][3]}, {plotted_matches[i][4]}') for i in range(min(len(plotted_matches), 5))] + plt.legend(handles=handles, title=f"Tensor", loc='lower left') + + plt.title('Page Reference Map') + plt.xlabel('Clock') + plt.ylabel('Pages') + + # Save the figure + plt.tight_layout() + plt.savefig(path[:path.index(".dmat")] + ".png") + plt.close(fig) + print(f'\n[info] save to {path[:path.index(".dmat")] + ".png"}') + +if __name__ == "__main__": + path = sys.argv[1] + sparsified = 
sparsify(path) + params = read(path) + plot(sparsified, params) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes.py new file mode 100644 index 0000000..89f069e --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes.py @@ -0,0 +1,29 @@ +from neutrino import probe, Map +import neutrino.language as nl + +CALLBACK = "gmem_bytes_analysis.py" + +@Map(level="thread", type="array", size=8, cap=1) +class GMEMBytes: + sync_bytes: nl.u32 + async_bytes: nl.u32 + +sync_bytes: nl.u64 = 0 +async_bytes: nl.u64 = 0 + +@probe(level="thread", pos="kernel", before=True) +def init(): + sync_bytes = 0 + async_bytes = 0 + +@probe(level="thread", pos="ld.global:st.global") +def record_sync(): + sync_bytes += nl.bytes + +@probe(level="thread", pos="cp.async.ca:cp.async.cg") +def record_async(): + async_bytes += nl.bytes + +@probe(level="thread", pos="kernel") +def save(): + GMEMBytes.save(sync_bytes, async_bytes) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes_analysis.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes_analysis.py new file mode 100644 index 0000000..a82e588 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/gmem_bytes_analysis.py @@ -0,0 +1,40 @@ +# Neutrino Auto-Generated Code for Trace Reading +import struct +from typing import NamedTuple, List, Tuple +from neutrino import TraceHeader, TraceSection + +class saving(NamedTuple): + sync_bytes: int + async_bytes: int + + +def parse(path: str) -> Tuple[TraceHeader, List[TraceSection], List[List[saving]]]: + with open(path, "rb") as f: + gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, 
numProbes = struct.unpack("iiiiiiii", f.read(32)) + header: TraceHeader = TraceHeader(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, numProbes) + assert header.numProbes == 1 # currently only one saving probe is supported + sections: List[TraceSection] = [] + for _ in range(header.numProbes): + size, offset = struct.unpack("QQ", f.read(16)) + sections.append(TraceSection(size, offset)) + gridSize = header.gridDimX * header.gridDimY * header.gridDimZ + blockSize = header.blockDimX * header.blockDimY * header.blockDimZ + records: List[List[saving]] = [] + for i in range(gridSize): + records.append([]) + for j in range(blockSize): + sync_bytes, async_bytes = struct.unpack("II", f.read(8)) + records[i].append(saving(sync_bytes, async_bytes)) + return header, sections, records +# END OF GENERATED CODE +import sys +header, sections, records = parse(sys.argv[1]) # filled by path to trace + +gridSize = header.gridDimX * header.gridDimY * header.gridDimZ +blockSize = header.blockDimX * header.blockDimY * header.blockDimZ +gmem_bytes = 0 +for i in range(gridSize): + for j in range(blockSize): + gmem_bytes += records[i][j].sync_bytes + records[i][j].async_bytes + +print(f"gmem_bytes:{gmem_bytes}") \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/program_progress.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/program_progress.py new file mode 100644 index 0000000..8919c47 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/program_progress.py @@ -0,0 +1,10 @@ +from neutrino import probe, Map +import neutrino.language as nl + +@Map(level="thread", type="array", size=8, cap=128) +class Sample: + clock: nl.u64 + +@probe(level="thread", pos="bra") +def bra_sample(): + Sample.save(nl.clock()) \ No newline at end of file diff --git 
a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/tensorop.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/tensorop.py new file mode 100644 index 0000000..c708d5c --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/tools/tensorop.py @@ -0,0 +1,20 @@ +from neutrino import probe, Map +import neutrino.language as nl + +@Map(level="thread", type="array", size=8, cap=1) +class TensorOpCount: + count: nl.u64 + +counter: nl.u64 = 0 + +@probe(level="thread", pos="kernel", before=True) +def init(): + counter = 0 + +@probe(level="thread", pos="mma.sync.aligned") +def count(): + counter += 1 + +@probe(level="thread", pos="kernel") +def save(): + TensorOpCount.save(counter) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/__init__.py new file mode 100644 index 0000000..c676b7c --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/__init__.py @@ -0,0 +1,41 @@ +"""Neutrino 3rd Party Utilities + +Please place each util as a module under neutrino.utils +Following are some common utilities""" + +def get_tracedir() -> str: + """Get or Create (if not yet here) the tracedir + + NOTE Impl of this shall match src/common.h !""" + import os + import time + from datetime import datetime + + neutrino_dir = os.getenv("NEUTRINO_TRACEDIR") + assert neutrino_dir is not None, "NEUTRINO_TRACEDIR must be set" + if not os.path.isdir(neutrino_dir): + os.mkdir(neutrino_dir) + + # 1. read the 22nd value of /proc/[pid]/stat (jiffies of proc start time) + with open("/proc/self/stat", "r") as f: + jiffies = int(f.read().split()[21]) + + # 2. get system clock frequency (Hz, usually 100MHz) + clk_tck = os.sysconf(os.sysconf_names["SC_CLK_TCK"]) + + # 3. 
read the systme boot time (second, since 1970) + with open("/proc/uptime", "r") as f: + uptime_seconds = int(float(f.read().split()[0])) + + # 4. compute absolute timestamp of proc boot time and format + # NOTE we convert time() and uptime to int to match C algorithm, + # or it's likely to have two folder with 1 second difference + procstart = int(time.time()) - uptime_seconds + (jiffies / clk_tck) + procstart = datetime.fromtimestamp(procstart) + formatted = procstart.strftime("%b%d_%H%M%S") + "_" + str(os.getpid()) + trace_dir = os.path.join(neutrino_dir, formatted) + + if not os.path.isdir(trace_dir): + os.mkdir(trace_dir) + + return trace_dir \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/tensortrace.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/tensortrace.py new file mode 100644 index 0000000..257291d --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/tensortrace.py @@ -0,0 +1,80 @@ +"""Obtain High-Level Tensor Information from PyTorch or other framework + +USAGE `TensorTrace` for with-statement and `tensortrace` for function wrapper +TODO Support JAX via arr.__cuda_array_interface__['data'] for .data_ptr() +INTERNAL Use Python's built-in sys.settrace to track call frames""" + +import sys +import os +import time +from typing import Callable, TextIO +from functools import wraps +import torch +from neutrino.utils import get_tracedir + +__all__ = ["TensorTrace", "tensortrace"] + +def get_time() -> int: + """Python Equivalent of C Style Get Time: + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + long long time = ts.tv_nsec + ts.tv_sec * 1e9; + """ + ts = time.clock_gettime(time.CLOCK_REALTIME) + seconds = int(ts) + nanoseconds = int((ts - seconds) * 1e9) + return nanoseconds + seconds * int(1e9) + +# We use a closure to specify the holder +def callframe_tracer_wrapper(f: TextIO) -> Callable: 
+ + # a trace function conforms sys.settrace() interface + def trace_calls(frame, event, arg): + if event == 'call' and "jit" not in frame.f_code.co_filename: + code = frame.f_code + func_name = code.co_name + func_filename = code.co_filename + func_line_no = frame.f_lineno + for i, varname in enumerate(code.co_varnames): + if i < frame.f_code.co_argcount: + if isinstance(frame.f_locals[varname], torch.Tensor): + print(f"[call] {get_time()} {frame.f_locals[varname].shape} {frame.f_locals[varname].untyped_storage().nbytes()} {frame.f_locals[varname].data_ptr()} {varname} {func_name} {func_filename}:{func_line_no}", flush=True, file=f) + # NOTE might not need return because we pause the exec so it shall stall in call + # elif event == 'return' and "jit" not in code.co_filename: + # code = frame.f_code + # func_name = code.co_name + # func_filename = code.co_filename + # func_line_no = frame.f_lineno + # if isinstance(arg, torch.Tensor): + # print(f"[ret] {get_time()} {arg.shape} {arg.untyped_storage().nbytes()} {arg.data_ptr()} {func_name} {func_filename}:{func_line_no}", flush=True, file=f) + # elif isinstance(arg, tuple): + # for arg_i in arg: + # if isinstance(arg_i, torch.Tensor): + # print(f"[ret] {get_time()} {arg_i.shape} {arg_i.untyped_storage().nbytes()} {arg_i.data_ptr()} {func_name} {func_filename}:{func_line_no}", flush=True, file=f) + + return trace_calls + + return trace_calls + +class TensorTrace: + """A context manager to trace call call stacks""" + def __enter__(self): + trace_file: TextIO + if os.getenv("NEUTRINO_TRACEDIR") is not None: + trace_dir = get_tracedir() + # print(f"[info] tensor trace in {os.path.join(trace_dir, 'tensor.trace')}", file=sys.stderr) + trace_file = open(os.path.join(trace_dir, "tensor.trace"), "w+") + else: + trace_file = sys.stderr + sys.settrace(callframe_tracer_wrapper(f=trace_file)) + + def __exit__(self, exc_type, exc_value, traceback): + sys.settrace(None) # clear the trace function + +def tensortrace(func: Callable) 
-> Callable: + """A decorator to apply TensorTrace to a function.""" + @wraps(func) + def wrapper(*args, **kwargs): + with TensorTrace(): + return func(*args, **kwargs) + return wrapper \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/trace_reading.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/trace_reading.py new file mode 100644 index 0000000..cedc3a5 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/trace_reading.py @@ -0,0 +1,81 @@ +"""Automatic Code Generation based on the Map definitions for reading trace files. +TODO update to numpy.struct""" + +from neutrino.common import load + +__all__ = ["gen_reading_code"] + +# NOTE Template for Generating Trace Reading Code +TRACE_READING_PY = """# Neutrino Auto-Generated Code for Trace Reading +import struct +from typing import NamedTuple +from neutrino import TraceHeader, TraceSection + +{MAP_DEFNS} + +def parse(path: str) -> tuple[TraceHeader, list[TraceSection], dict[str, list[list[NamedTuple]]]]: + with open(path, "rb") as f: + header: TraceHeader = TraceHeader(*struct.unpack("iiiiiiii", f.read(32))) + sections: list[TraceSection] = [] + for _ in range(header.numProbes): + sections.append(TraceSection(*struct.unpack("IIQ", f.read(16)))) + gridSize = header.gridDimX * header.gridDimY * header.gridDimZ + blockSize = header.blockDimX * header.blockDimY * header.blockDimZ + records: dict[str, list[list[NamedTuple]]] = dict() +{TRACE_READINGS} + return header, sections, records +# END of Neutrino Auto-Generated Code for Trace Reading""" + +TRACE_STRUCT_CODE_PY = """ +class {MAP_NAME}(NamedTuple): +{CONTENT} +""" + +TRACE_PARSING_PY = """ + # Read {MAP_NAME} + records["{MAP_NAME}"] = [] + f.seek(sections[{INDEX}].offset) + for i in range(gridSize): + records["{MAP_NAME}"].append([]) + for j in range(blockSize // sections[{INDEX}].warpDiv): + 
records["{MAP_NAME}"][-1].append([]) + for k in range(sections[{INDEX}].size // {BYTES}): + records["{MAP_NAME}"][i][j].append({MAP_NAME}(*struct.unpack("{FORMAT_STRING}", f.read({BYTES})))) +""" + +def gen_reading_code(probe: dict) -> str: + """Generate the code for reading the trace file""" + _, maps, _ = load(probe) + trace_structs = [] + trace_readings = [] + + for index, map in enumerate(maps): + content = [] + format_string = "" + reading_bytes = 0 + for reg in map.regs: + content.append(f" {reg.name}: int") + format_string += "q" if reg.dtype == 'u64' else "I" + reading_bytes += 8 if reg.dtype == 'u64' else 4 + trace_structs.append(TRACE_STRUCT_CODE_PY.format( + MAP_NAME=map.name, + CONTENT="\n".join(content) + )) + trace_readings.append(TRACE_PARSING_PY.format( + MAP_NAME=map.name, + FORMAT_STRING=format_string, + BYTES=reading_bytes, + INDEX=index, + )) + + return TRACE_READING_PY.format( + MAP_DEFNS="\n".join(trace_structs), + TRACE_READINGS="\n".join(trace_readings) + ) + +if __name__ == "__main__": + # Example usage + import sys + import toml + probe = toml.load(sys.argv[1]) + print(gen_reading_code(probe)) \ No newline at end of file diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/verifier.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/utils/verifier.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/pyproject.toml b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/pyproject.toml new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/setup.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/setup.py new file mode 100644 index 0000000..6ae50b2 --- /dev/null +++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/setup.py @@ -0,0 +1,47 @@ +import sys +import os +import subprocess +from 
setuptools import setup, find_packages +from setuptools.command.install import install + +class BuildNeutrino(install): + def run(self): + CURDIR = os.path.dirname(os.path.realpath(__file__)) + # run the building script + subprocess.check_output([sys.executable, os.path.join(CURDIR, "neutrino", "build.py")]) + install.run(self) + + +setup( + name='neutrino', + version='0.1.0', + packages=find_packages(), + # package_dir={'': 'neutrino'}, # Set src as the root for packages + package_data={'build': ['*'], 'tools': ['*']}, + include_package_data=True, # Include files specified in MANIFEST.in + install_requires=[ + 'toml', + ], + py_modules=["neutrino"], + entry_points={ + 'console_scripts': [ + 'neutrino = neutrino.cli:main', # Links 'myentry' command to `main` function in `myentry.py` + ], + }, + author='Neutrino Team', # Anonymous Name + author_email='anonymous@example.com', # Anonymous Email + description='Something', + long_description=open('README.md').read(), + long_description_content_type='text/markdown', + url='https://github.com/neutrino-gpu/neutrino', + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: POSIX :: Linux', + ], + cmdclass={'install': BuildNeutrino}, + python_requires='>=3.10', # Specify the Python version requirement + setup_requires=[ + 'toml', + ], +) \ No newline at end of file