| + +### Pythonic Tracing DSL +```python +from neutrino import probe, Map +import neutrino.language as nl +CALLBACK = "block_sched.py" # for trace analysis +# declare maps for persistence +@Map(level="warp", type="array", size=16, cap=1) +class block_sched: + start: nl.u64 + elapsed: nl.u32 + cuid: nl.u32 +# declare probe registers shared across probes +start: nl.u64 = 0 # starting clock +elapsed: nl.u64 = 0 # elapsed time, initialized to 0 +# define probes with decorator +@probe(pos="kernel", level="warp", before=True) +def thread_start(): + start = nl.clock() +@probe(pos="kernel", level="warp") +def thread_end(): + elapsed = nl.clock() - start + block_sched.save(start, elapsed, nl.cuid()) +``` + + | ++ +### Direct Assembly wrapped in TOML +```toml +# CUDA PTX Assembly Example +callback="block_sched.py" +[ map.block_sched ] +type = "array" +level = "warp" +size = "16" +cap = "1" +[ probe.thread_start_thread_end ] +position = "kernel" +level = "warp" +register = {"u32": 2, "u64": 3} +before = """.reg .b64 %PD<3>; +.reg .b32 %P<2>; +mov.u64 %PD0, %clock64;""" +after = """mov.u64 %PD1, %clock64; +sub.u64 %PD1, %PD1, %PD0; +cvt.u32.u64 %P1, %PD1; +mov.u32 %P2, %smid; +SAVE [ block_sched ] {%PD0, %P1, %P2};""" +``` + + | +
| + +### Hardware + + +| Hardware Platform | Support Status | +| --- | --- | +| NVIDIA/CUDA/PTX | ✅ Supported | +| AMD/ROCm/GCNAsm | 🛠️ Testing | +| General/OpenCL/SPIR-V | 🚀 Planning | + + | ++ +### Software + +| Software Framework | Status | +| --- | --- | +| cuBLAS/cuFFT/cuSparse... | ❌ (no plan for supporting) | +| CUTLASS | ✅ (with macro in building) | +| PyTorch family (torchvision...) | ✅ (with custom build) | +| JAX | ✅ (with envariable in runtime) | +| Triton | ✅ | + + | +
+
+The source code is placed in the following structure:
+
+```
+neutrino
+├── language # DSL and Compiler, Still in Testing
+│ ├── __init__.py # DSL Primitive
+│ ├── compiler.py # Exported Compiler API
+│ ├── frontend.py # Parser and AST Transformer
+│ ├── gcn.py # AMD ROCm Codegen Backend
+│ └── ptx.py # CUDA PTX Codegen Backend
+├── probe # Probe Engine
+│ ├── __init__.py # Common Definition and Utilities
+│ ├── cuda.py # CUDA PTX Impl
+│ └── hip.py # AMD ROCm Impl
+├── src # Hook Driver
+│ ├── common.h # Platform-agnostic Definition (GNU-only)
+│ ├── cuda.c # CUDA Impl (NVIDIA-related)
+│ ├── hip.c # ROCm Impl (AMD-related)
+│ ├── preload.c # Injector via LD_PRELOAD
+│ ├── parse.py # Generate Unhook API (NVIDIA/AMD)
+│ ├── sha1.h # third-parties header-only library
+│ └── uthash.h # third-parties header-only library
+├── build.py # Builder for driver in src/
+├── cli.py # Command Line Interface Entry
+├── common.py # Common Internal API not for User import
+└── __init__.py # Common Defn for user import like probe, Map
+```
+
+The overall structure is clean and approachable; we welcome developers to hack the system for their needs. Raise issues if you need help.
+
+## More
+
+* How to write my probe? Check the [Probe Writing Guide](https://open-neutrino.github.io/docs/write-probes).
+* How are probes executed? Check the [Probe Execution Model](https://open-neutrino.github.io/docs/execute-model).
+* How to read the neutrino trace? Check the [Trace File Structure](https://open-neutrino.github.io/docs/read-trace).
+* How does Neutrino work and how to extend it? Check the [Reference and Internals](https://open-neutrino.github.io/docs/system-workflow).
+* How good is Neutrino? Check the [Utilities and Extensions](https://open-neutrino.github.io/docs/analysis-code)
+
+## Citation
+If you used Neutrino in your research, please cite the paper below. And we welcome you to send us a link to your paper.
+```
+@inproceedings{huang2025neutrino,
+ author = {Songlin Huang and Chenshu Wu},
+ title = {Neutrino: Fine-grained GPU Kernel Profiling via Programmable Probing},
+ booktitle = {19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 25)},
+ year = {2025},
+ url = {https://www.usenix.org/conference/osdi25/presentation/huang-songlin},
+ publisher = {USENIX Association},
+}
+```
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py
new file mode 100644
index 0000000..2ff4f9b
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/analysis.py
@@ -0,0 +1,54 @@
+# Neutrino Generated Code for Reading Trace
+import struct
+from typing import NamedTuple, List, Tuple
+from neutrino import TraceHeader, TraceSection
class block_sched(NamedTuple):
    """One trace record emitted by the block_sched probe."""
    start: int    # starting clock (stored as u64 in the trace file)
    elapsed: int  # elapsed cycles (stored as u32 in the trace file)
    cuid: int     # compute-unit / SM id (stored as u32 in the trace file)
def parse(path: str):
    """Parse a neutrino trace file into (header, sections, records).

    Layout: a 32-byte header (8 x i32), one (size, offset) entry (2 x u64)
    per probe, then gridSize x blockSize fixed 16-byte records.
    """
    with open(path, "rb") as f:
        # splat the unpacked tuple -- TraceHeader declares 8 scalar fields,
        # passing the tuple itself would raise a TypeError
        header: TraceHeader = TraceHeader(
            *struct.unpack("iiiiiiii", f.read(32)))
        sections: List[TraceSection] = []
        for _ in range(header.numProbes):
            size, offset = struct.unpack("QQ", f.read(16))
            # NOTE(review): TraceSection declares (size, warpDiv, offset) but
            # only two values are stored on disk here -- confirm the format
            sections.append(TraceSection(size, offset))
        # parenthesized continuations (the original had orphan lines)
        gridSize = (header.gridDimX * header.gridDimY
                    * header.gridDimZ)
        blockSize = (header.blockDimX * header.blockDimY
                     * header.blockDimZ)
        records: List[List[block_sched]] = []
        for i in range(gridSize):
            records.append([])
            for j in range(blockSize):
                start, elapsed, cuid = struct.unpack(
                    "QII", f.read(16))
                records[i].append(
                    block_sched(start, elapsed, cuid))
    return header, sections, records
+# END OF GENERATED CODE
import sys

import numpy as np

header, sections, records = parse(sys.argv[1])
unique_cus = set()
for block in records:
    unique_cus.add(block[0].cuid)
# NOTE(review): the per-CU lists below are indexed by raw cuid values --
# this assumes cuids are contiguous in [0, len(unique_cus)); confirm
# against the device's SM numbering.
# Use a comprehension: `[[]] * n` would alias ONE list n times.
cu_timelines = [[] for _ in range(len(unique_cus))]
sched_times = [0.0] * len(unique_cus)
work_times = [0.0] * len(unique_cus)
for cur in records:
    sched_out = False
    for block in cu_timelines[cur[0].cuid]:
        if block.start + block.elapsed <= cur[0].start:
            # gap between the previous block retiring and this one starting
            # (the original's broken line continuation dropped the subtraction)
            sched_times[cur[0].cuid] += (cur[0].start
                                         - (block.start + block.elapsed))
            cu_timelines[cur[0].cuid].remove(block)
            cu_timelines[cur[0].cuid].append(cur[0])
            work_times[cur[0].cuid] += cur[0].elapsed
            sched_out = True
            break
    if not sched_out:
        cu_timelines[cur[0].cuid].append(cur[0])
        work_times[cur[0].cuid] += cur[0].elapsed
print(np.array(sched_times).mean(),
      np.array(work_times).mean())
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm
new file mode 100644
index 0000000..256a044
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/gcn.asm
@@ -0,0 +1 @@
+CALLBACK = "block_sched.py"
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py
new file mode 100644
index 0000000..a829336
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/examples/zero_persistent.py
@@ -0,0 +1,29 @@
import torch
import triton
import triton.language as tl


@triton.jit
def zero_persistent_kernel(output_ptr, numel,
        BLOCK_SIZE: tl.constexpr, NUM_SMS: tl.constexpr):
    """Persistent kernel: each program zeroes every NUM_SMS-th block."""
    start_pid = tl.program_id(axis=0)
    num_blocks = tl.cdiv(numel, BLOCK_SIZE)
    blocks_per_sm = num_blocks // NUM_SMS
    # the first (num_blocks % NUM_SMS) programs take one extra block
    if start_pid < num_blocks % NUM_SMS:
        blocks_per_sm += 1
    block_id = start_pid - NUM_SMS  # first += below brings it to start_pid
    for _ in range(blocks_per_sm):
        block_id += NUM_SMS
        offsets = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        mask = offsets < numel
        tl.store(output_ptr + offsets,
                 tl.zeros([BLOCK_SIZE], dtype=tl.float16), mask)


def zero_persistent(x: torch.Tensor):
    """Zero-fill `x` with one persistent program per SM."""
    numel = x.numel()
    NUM_SMS = torch.cuda.get_device_properties("cuda")\
        .multi_processor_count
    BLOCK_SIZE = 128
    grid = lambda META: (min(NUM_SMS,
        triton.cdiv(numel, META['BLOCK_SIZE'])),)
    zero_persistent_kernel[grid](
        x, numel, BLOCK_SIZE, NUM_SMS)


# FIX: dtype must be passed as a keyword -- torch.empty(*size, ...) treats
# extra positional args as dimensions, so the original raised a TypeError
t = torch.empty((4096, 4096), dtype=torch.float16, device="cuda")
zero_persistent(t)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py
new file mode 100644
index 0000000..a6f2479
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/__init__.py
@@ -0,0 +1,39 @@
+from typing import NamedTuple, Union, Literal
+
class TraceHeader(NamedTuple):
    """Fixed 32-byte trace-file header: launch geometry plus probe count."""
    gridDimX: int
    gridDimY: int
    gridDimZ: int
    blockDimX: int
    blockDimY: int
    blockDimZ: int
    sharedMemBytes: int  # dynamic shared memory bytes of the launch
    numProbes: int       # number of trace sections that follow the header
+
class TraceSection(NamedTuple):
    """Per-probe section entry of a trace file."""
    size: int     # total bytes of this section
    # NOTE(review): generated readers construct TraceSection with only
    # (size, offset) -- confirm whether warpDiv really belongs here
    warpDiv: int
    offset: int   # byte offset of the section within the file
+
def probe(pos: str, after: bool = False, level: str = "thread",
          size: int = 0, before: bool = False):
    """Neutrino probe decorator -- a compile-time marker, never executed.

    The decorated function is consumed as AST by the DSL compiler; the
    decorator arguments (tracepoint position, insertion side, level, size)
    are read from the decorator call itself.  `before` is appended (after
    the original parameters, so positional callers are unaffected) because
    the documented DSL examples use `@probe(..., before=True)`.
    """
    from functools import wraps
    # Prevent execution: the function only exists to be parsed as AST.
    def inner(func: callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            raise RuntimeError(f"{func.__name__} shall be jit other than run")
        return wrapper
    return inner
+
def Map(level: Literal["warp", "thread"], type: str, size: int, cap: Union[int, Literal["dynamic"]]):
    """Neutrino map decorator -- declares trace persistence, never executed.

    Like `probe`, the decorated class only exists to be read as AST by the
    compiler; instantiating it at runtime is always an error.
    """
    from functools import wraps

    def decorate(cls):
        @wraps(cls)
        def blocked(*args, **kwargs):
            raise RuntimeError(f"{cls.__name__} shall be jit other than run")
        return blocked

    return decorate
+
+# Following are internal definition
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py
new file mode 100644
index 0000000..8685e9f
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/build.py
@@ -0,0 +1,129 @@
+import os
+import sys
+import subprocess
+from pprint import pprint
+
# Bootstrap the toml dependency on first run.
try:
    import toml
except ImportError:  # narrow: a bare except would also hide SystemExit etc.
    # pip.main() was removed in pip 10 -- invoke pip through its CLI,
    # which is the supported way to install from a program.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "toml"])
    import toml
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+
def check_command(cmd: str) -> bool:
    """Return True iff `cmd` exists on PATH and exits successfully.

    The original let subprocess.CalledProcessError escape when the binary
    exists but exits non-zero (e.g. nvidia-smi on a GPU-less machine);
    treat that case as "stack not usable" and return False.
    """
    try:
        subprocess.run([cmd], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, text=True, check=True)
        return True
    except FileNotFoundError:
        return False
    except subprocess.CalledProcessError:
        return False
+
# Use rocm-smi or nvidia-smi to detect if installed
if check_command("rocm-smi"):
    NEUTRINO_MODE = "HIP"
    NEUTRINO_DRIVER_HEADER_NAME = "hip/hip_runtime_api.h"
    NEUTRINO_IMPL_SRC = "hip.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libamdhip64.so.6"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/opt/rocm/include/",  # AFAIK, add if new path is met
    ]
    # HIP headers require the platform macro when compiled with plain cc
    extra_flags = ["-D__HIP_PLATFORM_AMD__"]
elif check_command("nvidia-smi"):
    NEUTRINO_MODE = "CUDA"
    NEUTRINO_DRIVER_HEADER_NAME = "cuda.h"
    NEUTRINO_IMPL_SRC = "cuda.c"
    NEUTRINO_HOOK_DRIVER_LIB_NAME = "libcuda.so.1"
    NEUTRINO_DRIVER_HEADER_SEARCH_PATH = [
        "/usr/local/cuda/targets/x86_64-linux/include/",   # for x86
        "/usr/local/cuda/targets/aarch64-linux/include/",  # for ARM
        # add if missed
    ]
    extra_flags = []
else:
    raise RuntimeError("ONLY SUPPORT CUDA and HIP(AMD-ONLY)")
+
# Internal Configurations
SRC_DIR = os.path.join(CURDIR, "src")
BUILD_DIR = os.path.join(CURDIR, "build")
CC = "cc"  # NOTE don't use nvcc or hipcc, need gcc or clang
PY = sys.executable

# Locate the driver header. os.path.exists handles nested names such as
# "hip/hip_runtime_api.h", which the original os.listdir membership test
# could never match; it also removes the need for a bare `except`.
for dir_ in NEUTRINO_DRIVER_HEADER_SEARCH_PATH:
    if os.path.exists(os.path.join(dir_, NEUTRINO_DRIVER_HEADER_NAME)):
        break
else:
    # keep the last candidate (original fallback behavior), but warn
    print(f"[warn] {NEUTRINO_DRIVER_HEADER_NAME} not found in "
          f"{NEUTRINO_DRIVER_HEADER_SEARCH_PATH}", file=sys.stderr)
# NOTE this will be written in config.toml
NEUTRINO_DRIVER_HEADER_DIR = dir_
+
# NOTE Locate Driver Shared Library
# inspired by: https://github.com/triton-lang/triton/commit/58c54455ffa691be64f90f4e856501162373572c#diff-3d1f29795218f61553ab953426c15fa1e4162b224405b85529022293054da57aR25
# but we need to further locate the real driver library
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
locs = [line.split()[-1] for line in libs.splitlines() if NEUTRINO_HOOK_DRIVER_LIB_NAME in line]
env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
if env_ld_library_path and not locs:
    # fall back to LD_LIBRARY_PATH when ldconfig knows nothing
    locs = [os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME) for dir_ in env_ld_library_path.split(":")
            if os.path.exists(os.path.join(dir_, NEUTRINO_HOOK_DRIVER_LIB_NAME))]

# try to locate the pointed path
NEUTRINO_REAL_DRIVER_LIB_NAME = ""
NEUTRINO_REAL_DRIVER_LIB_DIR = ""
real_libs = []
for loc in locs:
    # tolerate non-symlink entries: os.readlink raises OSError on real files
    real_lib = os.readlink(loc) if os.path.islink(loc) else os.path.basename(loc)
    # NOTE fix lib32 and i386 bug
    if "lib32" not in loc and "lib32" not in real_lib and "i386" not in loc and "i386" not in real_lib:
        if not real_lib.startswith("/"):
            # relative symlink target: lives next to the link itself
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = real_lib
        else:
            # absolute target: the original keeps the link's own dir/name
            # NOTE(review): should this use the target's dir/name instead?
            NEUTRINO_REAL_DRIVER_LIB_DIR = os.path.dirname(loc)
            NEUTRINO_REAL_DRIVER_LIB_NAME = os.path.basename(loc)

print(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME, file=sys.stderr)
+
# NOTE call parse.py to generate the unhook API from the driver header
cmd = [PY, os.path.join(SRC_DIR, "parse.py"),
       os.path.join(NEUTRINO_DRIVER_HEADER_DIR, NEUTRINO_DRIVER_HEADER_NAME),
       os.path.join(NEUTRINO_REAL_DRIVER_LIB_DIR, NEUTRINO_REAL_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# make sure the output directory exists before compiling into it
os.makedirs(BUILD_DIR, exist_ok=True)

# NOTE compile cuda.c/hip.c with common.h
cmd = [CC, os.path.join(SRC_DIR, NEUTRINO_IMPL_SRC), "-fPIC", "-shared", "-ldl", "-lpthread", "-O3", *extra_flags,
       "-I", NEUTRINO_DRIVER_HEADER_DIR, "-o", os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME)]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE compile preload.c (the LD_PRELOAD injector)
cmd = [CC, os.path.join(SRC_DIR, "preload.c"), "-fPIC", "-shared", "-O3",
       "-o", os.path.join(BUILD_DIR, "preload.so")]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE create a symbolic link like libcuda.so -> libcuda.so.1
# TODO verify if this is need
cmd = ["ln", "-sf", NEUTRINO_HOOK_DRIVER_LIB_NAME,
       os.path.join(BUILD_DIR, NEUTRINO_HOOK_DRIVER_LIB_NAME[:NEUTRINO_HOOK_DRIVER_LIB_NAME.index("so") + 2])]
print(" ".join(cmd), file=sys.stderr)
subprocess.check_output(cmd)

# NOTE dump system configuration for CLI usage
config = {}
config["system"] = {
    "NEUTRINO_MODE" : NEUTRINO_MODE,
    "NEUTRINO_DRIVER_HEADER_NAME" : NEUTRINO_DRIVER_HEADER_NAME,
    "NEUTRINO_DRIVER_HEADER_DIR" : NEUTRINO_DRIVER_HEADER_DIR,
    "NEUTRINO_HOOK_DRIVER_LIB_NAME" : NEUTRINO_HOOK_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_NAME" : NEUTRINO_REAL_DRIVER_LIB_NAME,
    "NEUTRINO_REAL_DRIVER_LIB_DIR" : NEUTRINO_REAL_DRIVER_LIB_DIR,
}
# close the config file deterministically (the original leaked the handle)
with open(os.path.join(BUILD_DIR, "config.toml"), "w") as f:
    toml.dump(config, f)

print("Build Success, Configuration")
print("============================")
pprint(config)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py
new file mode 100644
index 0000000..6049a1d
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/cli.py
@@ -0,0 +1,165 @@
+"""CLI Entry for NEUTRINO: Fine-grained GPU Kernel Profiling via Programmable Probing"""
+
+import subprocess
+import os
+import sys
+import toml
+import argparse
+
+
+# Main Function, need a func to be referred by setup.py build system
def main():
    """CLI entry: compile/load the probe, assemble the hooked environment
    (LD_PRELOAD + LD_LIBRARY_PATH + NEUTRINO_* variables), and run the
    user command under it."""
    # NOTE READ CONFIG.TOML FOR DEFAULT SYSTEM CONFIGURATION
    # directory of this python file and other toolkits
    CURDIR = os.path.dirname(os.path.realpath(__file__))
    # directory of the neutrino internals
    NEUTRINO_BUILD_DIR: str = os.path.join(CURDIR, "build")
    NEUTRINO_PROBE_DIR: str = os.path.join(CURDIR, "probe")
    NEUTRINO_TOOLS_DIR: str = os.path.join(CURDIR, "tools")
    # load system configuration, generated in building
    config = toml.load(os.path.join(NEUTRINO_BUILD_DIR, "config.toml"))["system"]
    # default configurations, can be overwritten by CLI parameters
    NEUTRINO_HOOK_DRIVER_NAME: str = config["NEUTRINO_HOOK_DRIVER_LIB_NAME"]
    NEUTRINO_REAL_DRIVER_DIR: str = config["NEUTRINO_REAL_DRIVER_LIB_DIR"]
    NEUTRINO_REAL_DRIVER_NAME: str = config["NEUTRINO_REAL_DRIVER_LIB_NAME"]
    NEUTRINO_MODE: str = config["NEUTRINO_MODE"]
    # available built-in tools (tool name := file name without ".py")
    NEUTRINO_TOOLS = {tool[:-3]: tool for tool in os.listdir(NEUTRINO_TOOLS_DIR) if tool.endswith(".py")}

    parser = argparse.ArgumentParser(
        prog='neutrino', usage='%(prog)s [options] command',
        description=f"""NOTE: Probes must be given via -p (--probe) option. Built-in tools: {tuple(NEUTRINO_TOOLS.keys())}""",
        epilog="Examples: `neutrino -t gmem_bytes python test/zero_.py`. Open issue(s) in https://github.com/neutrino-gpu/neutrino if encountered problems",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-p', '--probe', required=True,
                        help="probe in form of path to the file")
    parser.add_argument('--tracedir', default="./trace",
                        help="parent folder of traces")
    parser.add_argument('--driver', default=os.path.join(NEUTRINO_REAL_DRIVER_DIR, NEUTRINO_REAL_DRIVER_NAME),
                        help='path to the real cuda/hip driver shared library')
    parser.add_argument("--python", default=sys.executable,
                        help='path to python executable used')
    parser.add_argument('--filter',
                        help='filter OUT buggy kernels by (part of) name, split by :')
    parser.add_argument('-k', '--kernel',
                        help='filter the kernel by (part of) name, split by :')
    parser.add_argument('--callback',
                        help='attach callback for trace analysis')
    parser.add_argument('--benchmark', action='store_true',
                        help="enable benchmark mode to evaluate overhead w.r.t. the original kernel")
    parser.add_argument('--memusage', action='store_true',
                        help="prevent the profiling and only measure the memory usage")
    # put command at the end of command
    parser.add_argument("command", nargs=argparse.REMAINDER)
    # parse the arguments
    args = parser.parse_args()

    # same as this executable
    NEUTRINO_PYTHON: str = args.python  # default to be this executable
    # directory to put the trace
    NEUTRINO_TRACEDIR: str = args.tracedir
    # filter of kernel
    NEUTRINO_FILTER: str = args.filter if args.filter is not None else ""
    NEUTRINO_KERNEL: str = args.kernel if args.kernel is not None else ""
    # Benchmark mode, will include an additional launch after the trace kernel
    # Used to measure the kernel-level slowdown of Neutrino, disabled by default
    NEUTRINO_BENCHMARK: str = str(int(args.benchmark))
    NEUTRINO_MEMUSAGE: str = str(int(args.memusage))
    # Path to the real driver
    NEUTRINO_REAL_DRIVER: str = args.driver
    # command to be executed
    command = args.command
    assert len(command) > 0, "Command must be specified"

    # Parse the PROBE
    NEUTRINO_PROBE_PATH: str = args.probe
    NEUTRINO_READING = None  # trace-reading code, generated below
    # NOTE endswith .py triggers the Tracing DSL
    if NEUTRINO_PROBE_PATH.endswith(".py"):
        from neutrino.language.compiler import compile
        with open(NEUTRINO_PROBE_PATH, "r") as f:
            source = f.read()
        NEUTRINO_PROBE = compile(NEUTRINO_MODE, source)
    elif NEUTRINO_PROBE_PATH.endswith(".toml"):
        NEUTRINO_PROBE = toml.load(NEUTRINO_PROBE_PATH)
    else:
        # No suffix := use built-in tools
        if NEUTRINO_PROBE_PATH not in NEUTRINO_TOOLS:
            print(f"[error] {NEUTRINO_PROBE_PATH} not in tools: {NEUTRINO_TOOLS}", file=sys.stderr)
            sys.exit(1)
        else:
            from neutrino.language.compiler import compile
            with open(os.path.join(NEUTRINO_TOOLS_DIR, NEUTRINO_TOOLS[NEUTRINO_PROBE_PATH]), "r") as f:
                source = f.read()
            NEUTRINO_PROBE = compile(NEUTRINO_MODE, source)

    # NOTE generate the trace reading code
    from neutrino.utils.trace_reading import gen_reading_code
    NEUTRINO_READING = gen_reading_code(NEUTRINO_PROBE)

    # NOTE check if dynamic is True, shall have a specific keyword in top-level of probe
    NEUTRINO_DYNAMIC = "dynamic" in NEUTRINO_PROBE and NEUTRINO_PROBE["dynamic"] is True

    # TODO change the callback to other places
    NEUTRINO_CALLBACK = NEUTRINO_PROBE["CALLBACK"] if "CALLBACK" in NEUTRINO_PROBE else None
    NEUTRINO_CALLBACK = args.callback if args.callback is not None else NEUTRINO_CALLBACK
    if NEUTRINO_CALLBACK:
        # search next to the probe file when the path is not directly valid
        if not os.path.exists(NEUTRINO_CALLBACK):
            searched = os.path.join(os.path.dirname(NEUTRINO_PROBE_PATH), NEUTRINO_CALLBACK)
            if os.path.exists(searched):
                NEUTRINO_CALLBACK = searched
            else:
                print(f"[warn] callback {NEUTRINO_CALLBACK} not found")
                NEUTRINO_CALLBACK = None

    # a copied environment variables
    env = os.environ.copy()
    # configure Neutrino related environment variables
    env["NEUTRINO_REAL_DRIVER"] = NEUTRINO_REAL_DRIVER
    env["NEUTRINO_DRIVER_NAME"] = NEUTRINO_HOOK_DRIVER_NAME
    env["NEUTRINO_HOOK_DRIVER"] = os.path.join(NEUTRINO_BUILD_DIR, NEUTRINO_HOOK_DRIVER_NAME)
    env["NEUTRINO_PYTHON"] = NEUTRINO_PYTHON
    env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_BUILD_DIR, "process.py")
    env["NEUTRINO_FILTER"] = NEUTRINO_FILTER
    env["NEUTRINO_KERNEL"] = NEUTRINO_KERNEL
    env["NEUTRINO_TRACEDIR"] = NEUTRINO_TRACEDIR
    env["NEUTRINO_PROBES"] = toml.dumps(NEUTRINO_PROBE)  # dump it to string
    # GNU LD_PRELOAD to overwrite dlopen, https://man7.org/linux/man-pages/man8/ld.so.8.html
    env["LD_PRELOAD"] = os.path.join(NEUTRINO_BUILD_DIR, "preload.so")
    # Add to the LD_LIBRARY_PATH, this would overwrite ldconfig
    if "LD_LIBRARY_PATH" in env:
        env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR + ":" + env["LD_LIBRARY_PATH"]
    else:
        env["LD_LIBRARY_PATH"] = NEUTRINO_BUILD_DIR
    # An Environmental Variable to enable the trace
    # NOTE some bugs here -> still working on
    env["NEUTRINO_ENABLE"] = "1"
    # An Environmental Variable to enable the benchmark mode
    env["NEUTRINO_BENCHMARK"] = NEUTRINO_BENCHMARK
    env["NEUTRINO_MEMUSAGE"] = NEUTRINO_MEMUSAGE
    # An Environmental Variables to enable the debug mode -> more messages
    # env["NEUTRINO_VERBOSE"] = "1"
    if NEUTRINO_DYNAMIC:
        env["NEUTRINO_DYNAMIC"] = "1"
    if NEUTRINO_READING:
        env["NEUTRINO_READING"] = NEUTRINO_READING
    if NEUTRINO_CALLBACK:
        env["NEUTRINO_CALLBACK"] = NEUTRINO_CALLBACK

    # FIX for Triton
    if NEUTRINO_MODE == "CUDA":
        env["TRITON_LIBCUDA_PATH"] = NEUTRINO_BUILD_DIR
        env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "cuda.py")
    elif NEUTRINO_MODE == "HIP":
        # NOTE There's a bug in Triton's impl here, for path we refer to the
        # directory for ld.so to search, instead of specific file name ...
        env["TRITON_LIBHIP_PATH"] = os.path.join(NEUTRINO_BUILD_DIR, "libamdhip64.so")
        env["NEUTRINO_PROBING_PY"] = os.path.join(NEUTRINO_PROBE_DIR, "hip.py")

    # start the program with new environment (non-empty: asserted above)
    proc = subprocess.Popen(command, env=env)
    proc.wait()


if __name__ == "__main__":
    main()
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py
new file mode 100644
index 0000000..873c5fc
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/common.py
@@ -0,0 +1,68 @@
+"""Neutrino Internal APIs, not for user import"""
+
+from typing import Optional, Literal, Union
+from dataclasses import dataclass
+
@dataclass
class Register:
    """A probe register: name, machine dtype, and optional initial value."""
    name: str
    dtype: Literal['u32', 'u64']
    init: Optional[int] = None  # initial value, None := uninitialized
+
+
@dataclass
class Probe:
    """A probe definition; `name` is the key in TOML."""
    name: str
    level: Literal["thread", "warp"]  # level of the probe
    pos: list[str]  # := tracepoint in the paper
    # At least one of `before` / `after` shall be given; Optional added
    # because the defaults are None (the original annotations omitted it).
    before: Optional[Union[list, str]] = None  # snippet inserted before
    after: Optional[Union[list, str]] = None   # snippet inserted after
+
+
@dataclass
class Map:
    """A trace map (persistence buffer) declaration."""
    name: str
    level: Literal["thread", "warp"]     # record granularity
    type: Literal["array"]               # only array maps exist so far
    size: int                            # record size -- TODO confirm units
    cap: Union[int, Literal["dynamic"]]  # capacity, or "dynamic"
    regs: list[Register]                 # registers saved into each record
+
+
def load(raw: dict) -> tuple[list[Probe], list[Map], int]:
    """Unserialize Neutrino probes in Python dict to (probes, maps, regs).

    Accepts both the short "pos" and the long "position" key for probes
    (the validation below allows either, but the original only read "pos"),
    and tolerates maps without a "regs" table as hand-written TOML has none.
    """
    assert "probe" in raw.keys() and "map" in raw.keys(), "At least a probe and a map"
    probes: list[Probe] = []
    maps: list[Map] = []
    for name, probe in raw["probe"].items():
        # first validate the required keys
        keys = probe.keys()
        assert "position" in keys or "pos" in keys, f"[error] {name} has no position (required)"
        assert "before" in keys or "after" in keys, f"[error] {name} is empty, one of before or after shall be given"
        assert "level" in keys and probe["level"] in ("warp", "thread"), f"[error] level must be given and one of 'warp', 'thread'"
        pos = probe["pos"] if "pos" in keys else probe["position"]
        probes.append(Probe(name=name,
                            level=probe["level"],
                            pos=pos.split(":"),
                            before=probe.get("before"),
                            after=probe.get("after")))
    for name, map_ in raw["map"].items():
        maps.append(Map(name=name,
                        level=map_["level"],
                        type=map_["type"],
                        size=map_["size"],
                        cap=map_["cap"],
                        regs=[Register(rname, val[0], init=val[1])
                              for rname, val in map_.get("regs", {}).items()]))
    # "regs" (probe register count) may be absent in hand-written probes
    return probes, maps, raw.get("regs", 0)
+
+
def dump(probes, maps, regs, callback = "") -> dict:
    """Serialize Neutrino probes to Python dict"""
    serialized = {"regs": regs, "probe": {}, "map": {}}
    for p in probes:
        serialized["probe"][p.name] = {"level": p.level, "pos": p.pos,
                                       "before": p.before, "after": p.after}
    for m in maps:
        serialized["map"][m.name] = {"level": m.level, "type": m.type,
                                     "size": m.size, "cap": m.cap,
                                     "regs": {r.name: [r.dtype, r.init] for r in m.regs}}
    # only attach the callback entry when one was actually given
    if len(callback) > 0:
        serialized["CALLBACK"] = callback
    return serialized
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py
new file mode 100644
index 0000000..342b036
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/__init__.py
@@ -0,0 +1,35 @@
+"""Neutrino Trace Language Primitive"""
+from functools import wraps
+from typing import TypeAlias
+
+# NOTE Neutrino Language's type system is incomplete and tiny because they're
+# placeholders for compilers instead of functional code for Python.
+u32: TypeAlias = int
+u64: TypeAlias = int
+reg: TypeAlias = int
+
+TYPES = ["u32", "u64"]
+FUNCS = ["smid", "time", "clock", "save"]
+
+def smid() -> u32: ...
+
+def time() -> u64: ...
+
+def clock() -> u64: ...
+
+def save(regs: list[reg], dtype) -> None: ...
+
+# @_disable_execution_
+# def tid() -> None: ...
+
+# @_disable_execution_
+# def pid() -> None: ...
+
+# Following are helpers for parsing register operands
+out: reg = ...
+in1: reg = ...
+in2: reg = ...
+in3: reg = ...
+in4: reg = ...
+addr: reg = ...
+bytes: reg = ...
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py
new file mode 100644
index 0000000..516e802
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/compiler.py
@@ -0,0 +1,49 @@
+"""CLI Entry of Language Submodule"""
+from neutrino.common import Probe, dump
+from neutrino.language.frontend import parse
+
def compile(mode: str, source: str) -> dict:
    """Compile the Tracing DSL into Assembly Probes.

    Returns the serialized probe dict (TOML-ready); the original annotation
    claimed tuple[str, str], which was wrong.
    """
    regs, probes, maps, callback = parse(source)
    if mode == "CUDA":
        from neutrino.language.ptx import gencode
        probes = gencode(probes)
    elif mode == "HIP":
        from neutrino.language.gcn import gencode
        probes = gencode(probes)

    # NOTE Merge probes of the same level and pos
    merged_probes: dict[tuple, Probe] = {}
    for probe in probes:
        # probe.pos may be a list (see common.Probe) -- make it hashable
        pos_key = tuple(probe.pos) if isinstance(probe.pos, list) else probe.pos
        key = (probe.level, pos_key)
        if key not in merged_probes:
            merged_probes[key] = probe
        else:  # merge: concatenate names and snippets, keep None when both empty
            merged_probes[key].name += "_" + probe.name
            merged_probes[key].before = (
                (merged_probes[key].before or "") + (probe.before or "")
                if merged_probes[key].before or probe.before
                else None
            )
            merged_probes[key].after = (
                (merged_probes[key].after or "") + (probe.after or "")
                if merged_probes[key].after or probe.after
                else None
            )
    probes = list(merged_probes.values())

    dumped = dump(probes, maps, regs, callback)

    # a single dynamic-capacity map makes the whole probe set dynamic
    for map_ in maps:
        if map_.cap == "dynamic":
            dumped["dynamic"] = True

    return dumped
+
+if __name__ == "__main__": # A small test case
+ import sys
+ import toml
+ mode, source = sys.argv[1], sys.argv[2]
+ source = open(source, "r").read()
+ asm_probes = compile(mode, source)
+ print(toml.dumps(asm_probes))
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py
new file mode 100644
index 0000000..2aa3736
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/frontend.py
@@ -0,0 +1,251 @@
+"""Parse and flatten Python Tracing DSL"""
+from neutrino.language import TYPES # neutrino/language/__init__.py
+import ast
+from typing import Optional
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map
+
# Whitelist of AST node types the DSL accepts; anything else raises
# NotImplementedError in NeutrinoVisitor.generic_visit
allowed_nodes = {
    ast.Import, # Imported Stuff
    ast.Module, # the greatest start
    ast.Name, # Name of Variable
    ast.Assign, # Assign Value
    ast.AugAssign, # +=
    ast.UnaryOp, # Unary Op, only negative
    ast.BinOp, # Binary Op, +-*/
    ast.Call, # Call function
    ast.Attribute, # Access Attribute of Namespace
    ast.Constant, # Constant Value
    ast.Expr, # Single Expression
}

# mapping of Python binary operators to IR opcode mnemonics
# NOTE(review): not referenced by NeutrinoVisitor in this file (it branches
# with isinstance instead) — confirm whether these tables are still needed
binary_ops = {
    ast.Add: "add",
    ast.Sub: "sub",
    ast.Mult: "mul",
    ast.Div: "div"
}

# mapping of Python unary operators to IR opcode mnemonics (see note above on binary_ops)
unary_ops = {
    ast.USub: "neg"
}
+
class NeutrinoVisitor(ast.NodeVisitor):
    """Flattens a probe body (Python AST) into a linear register-based IR.

    Each IR instruction is a list ``[opcode, dst, src...]``.  Virtual
    registers are named ``NR0, NR1, ...``; registers declared at module
    level are mapped onto the lowest numbers at construction time.
    """

    def __init__(self, nl_name: str, regs: list[str], maps: list[str]):
        """
        Args:
            nl_name: alias under which ``neutrino.language`` was imported (e.g. "nl")
            regs: names of probe registers declared at DSL module level
            maps: names of declared maps, used to validate ``save`` calls
        """
        super().__init__()
        self.nl_name = nl_name
        self.reg_counter = -1 # make it R0
        self.ir: list[tuple] = []
        # pre-assign NR names to the declared registers, in declaration order
        self.reg_map: dict[str, str] = {reg: self.fresh_name() for reg in regs}
        self.maps = maps
        # initialize and visit tree

    def fresh_name(self):
        """Allocate the next virtual register name (NR0, NR1, ...)."""
        self.reg_counter += 1
        return f"NR{self.reg_counter}"

    def visit_Assign(self, node): # Lowered to mov
        """Lower an assignment; KeyError if the target register is undeclared."""
        # we shall check if the target has a known name
        name = self.reg_map[node.targets[0].id]
        if isinstance(node.value, ast.Attribute):
            self.ir.append(["mov", name, self.visit(node.value)])
        else:
            # the RHS produced a temporary: rewrite the IR in place so the
            # temporary is renamed to the assignment target, then reclaim
            # the register number it occupied
            new_name = self.visit(node.value) # this is the temporary name
            for inst in self.ir:
                for idx in range(len(inst)):
                    if inst[idx] == new_name:
                        inst[idx] = name
            self.reg_counter -= 1

    def process_operand(self, operand) -> str:
        """Resolve an operand AST node to a register name, constant value,
        or attribute keyword; ValueError for unsupported operand kinds."""
        if isinstance(operand, ast.Name):
            return self.reg_map[operand.id]
        elif isinstance(operand, ast.Constant):
            return operand.value
        elif isinstance(operand, (ast.Attribute, ast.Call, ast.BinOp, ast.UnaryOp)):
            return self.visit(operand)
        else:
            raise ValueError

    def visit_BinOp(self, node): # Lowered to add/sub
        """Lower a binary op into a fresh destination register; returns its name."""
        lhs = self.process_operand(node.left)
        rhs = self.process_operand(node.right)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", new_name, lhs, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", new_name, lhs, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", new_name, lhs, rhs])
        else:
            raise NotImplementedError()
        # register the temporary under its own name so visit_Assign can rename it
        self.reg_map[new_name] = new_name
        return new_name

    def visit_AugAssign(self, node):
        """Lower ``x op= y`` into ``op x, x, y`` on the existing register."""
        rhs = self.process_operand(node.value)
        name = self.reg_map[node.target.id]
        if isinstance(node.op, ast.Add):
            self.ir.append(["add", name, name, rhs])
        elif isinstance(node.op, ast.Sub):
            self.ir.append(["sub", name, name, rhs])
        elif isinstance(node.op, ast.Mult):
            self.ir.append(["mul", name, name, rhs])
        elif isinstance(node.op, ast.Div):
            self.ir.append(["div", name, name, rhs])
        elif isinstance(node.op, ast.LShift):
            self.ir.append(["lsh", name, name, rhs])
        else:
            raise NotImplementedError()
        return name

    def visit_UnaryOp(self, node):
        """Lower unary negation into a fresh register; returns its name."""
        value = self.process_operand(node.operand)
        new_name = self.fresh_name()
        if isinstance(node.op, ast.USub):
            self.ir.append(["neg", new_name, value])
        else:
            raise NotImplementedError()
        # NOTE(review): unlike visit_BinOp, the temp is not added to reg_map
        # here — confirm whether that is intentional
        return new_name

    def visit_Call(self, node):
        """Lower intrinsic calls: cuid()/time()/clock() produce a fresh
        register and return its name; Map.save(...) emits a SAVE instruction
        (and returns None)."""
        func_name = self.visit(node.func)
        if func_name == "cuid":
            new_name = self.fresh_name()
            self.ir.append(["cuid", new_name])
            self.reg_map[new_name] = new_name
            return new_name
        elif func_name == "time":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["time", new_name])
            return new_name
        elif func_name == "clock":
            new_name = self.fresh_name()
            self.reg_map[new_name] = new_name
            self.ir.append(["clock", new_name])
            return new_name
        elif func_name == "save":
            # save is a method on a declared map: block_sched.save(...)
            map_name = node.func.value.id
            if map_name not in self.maps:
                raise ValueError(f"Map {map_name} not found, known maps: {self.maps}")
            regs = []
            for arg in node.args:
                if isinstance(arg, ast.Name):
                    regs.append(self.reg_map[arg.id])
                elif isinstance(arg, ast.Attribute):
                    regs.append(self.visit_Attribute(arg))
                else:
                    regs.append(self.reg_map[self.visit(arg)])
            self.ir.append(["SAVE", map_name] + regs)
        else:
            raise NotImplementedError()

    def visit_Name(self, node):
        """Return the raw identifier; callers map it through reg_map as needed."""
        return node.id

    def visit_Attribute(self, node):
        """Resolve ``nl.<attr>`` / ``<map>.<attr>``; operand keywords
        (bytes/addr/out/in1..in3) are upper-cased for the codegen backends."""
        if node.value.id == self.nl_name or node.value.id in self.maps:
            if node.attr in ("bytes", "addr", "out", "in1", "in2", "in3"):
                return node.attr.upper()
            return node.attr
        else:
            raise ValueError(f"can only refer to neutrino.language semantic but got {node.value.id}")

    def visit_Constant(self, node):
        # NOTE(review): returns the AST node itself rather than node.value —
        # confirm downstream consumers expect the node here
        return node

    def generic_visit(self, node):
        """Reject any construct outside the DSL whitelist (allowed_nodes)."""
        if type(node) not in allowed_nodes:
            raise NotImplementedError(f"{type(node).__name__} (lineno: {node.lineno})")
        super().generic_visit(node)
+
+
def parse(code: str) -> tuple[int, list[Probe], list[Map], str]:
    """Parse Tracing-DSL source into its compiled components.

    Args:
        code: DSL source text (a restricted subset of Python).

    Returns:
        A 4-tuple of:
        * total number of probe registers (declared + max temporaries used) —
          the previous annotation claimed ``list[Register]``, which was wrong;
        * the parsed ``Probe`` objects (IR attached on ``before``/``after``);
        * the parsed ``Map`` objects;
        * the CALLBACK script name ("" when absent).

    Raises:
        ValueError: on malformed CALLBACK / Map / probe declarations.
    """
    tree = ast.parse(code)
    nl_name: str = None  # alias under which neutrino.language was imported
    regs: list[Register] = []
    num_regs: int = 0
    probes: list[Probe] = []
    callback: str = ""  # analysis callback script, "" if not declared
    maps: list[Map] = []

    for node in tree.body:
        if type(node) is ast.Import and node.names[0].name == "neutrino.language":
            nl_name = node.names[0].asname
        elif type(node) is ast.Assign and node.targets[0].id == "CALLBACK":
            if isinstance(node.value, ast.Constant):
                callback = node.value.value
            else:
                raise ValueError("CALLBACK must be a string constant")
        elif type(node) is ast.AnnAssign and node.annotation:
            # top-level `name: nl.u64 = init` declares a probe register
            if node.annotation.value.id == nl_name and node.annotation.attr in TYPES:
                regs.append(Register(node.target.id, node.annotation.attr, node.value.value))
        elif type(node) is ast.ClassDef and node.decorator_list:
            name = node.name  # class name doubles as the map name
            decorator = node.decorator_list[0]
            if decorator.func.id == "Map":
                level, type_, size, cap, contents = None, None, 0, 1, []
                for keyword in decorator.keywords:
                    if keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "type": type_ = keyword.value.value
                    elif keyword.arg == "size": size = keyword.value.value
                    elif keyword.arg == "cap": cap = keyword.value.value
                if size % 8 != 0:
                    raise ValueError("size must be multiple of 8 to avoid misaligned address")
                if not level or not type_:
                    raise ValueError("level and type must be specified")
                if not isinstance(cap, int) and cap != "dynamic":
                    raise ValueError("cap must be an integer or 'dynamic'")
                # collect the map's fields (renamed from `node`, which used to
                # shadow the outer loop variable)
                for member in node.body:
                    if type(member) is ast.AnnAssign and member.annotation:
                        if member.annotation.value.id == nl_name and member.annotation.attr in TYPES:
                            contents.append(Register(member.target.id, member.annotation.attr, None))
                    else:
                        raise ValueError(f"Map {name} must only contain AnnAssign nodes")
                # sort wider dtypes first (e.g. u64 before u32) for alignment
                ordered = sorted(contents, key=lambda reg: reg.dtype, reverse=True)
                if ordered != contents:
                    print("[warn] map contents reordered")
                maps.append(Map(name=name, level=level, type=type_, size=size, cap=cap, regs=ordered))
        elif type(node) is ast.FunctionDef and node.decorator_list:
            name = node.name  # function name doubles as the probe name
            decorator = node.decorator_list[0]
            if decorator.func.id == "probe":
                pos, level, before = None, None, False
                for keyword in decorator.keywords:
                    if keyword.arg == "pos": pos = keyword.value.value
                    elif keyword.arg == "level": level = keyword.value.value
                    elif keyword.arg == "before": before = keyword.value.value
                if not pos or not level: raise ValueError("position must be specified")
                # flatten the probe body into IR, treating it as standalone code
                visitor = NeutrinoVisitor(nl_name=nl_name, regs=[reg.name for reg in regs],
                                          maps=[m.name for m in maps])
                visitor.visit(ast.Module(body=node.body))
                probe = Probe(name=name, pos=pos, level=level)
                if before:
                    probe.before = visitor.ir
                else:
                    probe.after = visitor.ir
                probes.append(probe)
                num_regs = max(num_regs, visitor.reg_counter)

    return num_regs + len(regs), probes, maps, callback
+
+# A Simple Test Case, not really used in production
if __name__ == "__main__":
    import sys

    # use a context manager so the source file handle is closed (was leaked)
    with open(sys.argv[1], "r") as fp:
        code = fp.read()

    regs, probes, maps, callback = parse(code)

    print(regs)
    print(probes)
    print(maps)
    print(callback)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py
new file mode 100644
index 0000000..21ef908
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/gcn.py
@@ -0,0 +1,12 @@
+"""Generate the AMD GCN Assembly, a x86-like asm
+
+NOTE Currently only targets CDNA branch of GCN, covering MI100/200/300/325
+This is because AMD's Assembly diverge into CDNA/RDNA in 2020, before that
+there's only one architecture named GCN (so as the name of GCNAsm).
+
+CDNA and RDNA shares the same syntax inherited from GCNAsm, but has slight
+difference in instruction set, for example, CDNA use `S_MEMTIME S[0:1]` to
+read the clock in 64bit but RDNA use `S_GETREG S0, SHADER_CYCLES` in 32bit
+
+We plan to support CDNA arch first and then port to RDNA arch later.
+"""
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py
new file mode 100644
index 0000000..051f194
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/language/ptx.py
@@ -0,0 +1,75 @@
+"""Generate the CUDA PTX Assembly, a C-style asm"""
+
+from neutrino.common import Register, Probe, Map
+
def filter_keyword(reg: "str | int") -> "str | int":
    """Format an IR operand for PTX output.

    Integer immediates and reserved operand keywords (ADDR, BYTES, OUT,
    IN1..IN4) pass through untouched; any other name is a virtual register
    and receives the PTX ``%`` prefix.  (The previous ``-> str`` annotation
    was wrong: integers are returned as-is.)
    """
    if isinstance(reg, int):  # integer immediate: emit verbatim
        return reg
    if reg in {"ADDR", "BYTES", "OUT", "IN1", "IN2", "IN3", "IN4"}:
        return reg  # reserved keyword, resolved later by the probing engine
    return "%" + reg
+
def cvt_inst(inst: list[str]) -> str:
    """Translate one IR instruction ``[opcode, dst, src...]`` into PTX text.

    All ALU operations are emitted in their ``.u64`` form.  ``SAVE`` is kept
    as the neutrino pseudo-instruction (resolved later outside this backend).
    Raises NotImplementedError for unknown opcodes.
    """
    match inst[0]:
        # ALU Instructions
        case "add":
            return f"add.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "sub":
            return f"sub.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "mul":
            return f"mul.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "div":
            return f"div.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "mod":
            # modulo lowers to PTX's rem instruction
            return f"rem.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "lsh":
            return f"shl.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        case "rsh":
            return f"shr.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])}, {filter_keyword(inst[3])};"
        # Memory Instructions
        case "SAVE":
            contents = (filter_keyword(reg) for reg in inst[2:])
            contents = ", ".join(contents)
            # NOTE(review): no trailing ';' here, unlike the other opcodes —
            # confirm the downstream SAVE expansion accepts/appends it
            return f"SAVE [ {inst[1]} ] {{ { contents } }}" # just return everything
        # Other Instructions
        case "mov":
            return f"mov.u64 {filter_keyword(inst[1])}, {filter_keyword(inst[2])};"
        case "clock":
            # read the 64-bit per-SM cycle counter
            return f"mov.u64 {filter_keyword(inst[1])}, %clock64;"
        case "time":
            # read the 64-bit global nanosecond timer
            return f"mov.u64 {filter_keyword(inst[1])}, %globaltimer;"
        case "cuid":
            # %smid is 32-bit: read it into a scratch b32 then widen to u64
            return f"""{{
    .reg .b32 %tmp;
    mov.u32 %tmp, %smid;
    cvt.u64.u32 {filter_keyword(inst[1])}, %tmp;
    }}"""
        case _:
            raise NotImplementedError(f"{inst} not yet supported")
+
def gencode(probes: list[Probe]) -> list[Probe]:
    """Translate each probe's IR (lists of instruction tuples) into PTX text.

    Mutates the probes in place (``before``/``after`` become newline-joined
    strings) and returns the same list.

    Bug fix: a probe may carry BOTH a ``before`` and an ``after`` snippet
    (e.g. after merging in the compiler); the previous ``elif`` left
    ``after`` untranslated whenever ``before`` was present.
    """
    # only the instruction payloads change, i.e. the before/after parts
    for probe in probes:
        if probe.before is not None:
            probe.before = "\n".join(cvt_inst(inst) for inst in probe.before)
        if probe.after is not None:  # independent `if`, not `elif`
            probe.after = "\n".join(cvt_inst(inst) for inst in probe.after)
    return probes
+
if __name__ == "__main__":
    # manual smoke test: lower a pair of kernel-level warp probes to PTX
    sample_probes = [
        Probe(name='thread_start', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R0']]),
        Probe(name='thread_end', level='warp', pos='kernel', size=0, before=None, after=[['clock', 'R2'], ['sub', 'R1', 'R2', 'R0'], ['cuid', 'R3'], ['SAVE', 'block_sched', 'R0', 'R1', 'R3']]),
    ]
    print(gencode(sample_probes))
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py
new file mode 100644
index 0000000..4c385f2
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/__init__.py
@@ -0,0 +1,50 @@
+"""Neutrino Probing Engine Protocol
+
+NOTE just a protocol for developers, don't import / export"""
+
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map
+
+__all__ = ["Ref", "load_probes", "TRACE_READING_CODE_PY"]
+
@dataclass
class Ref:
    """Reference for replacement: marks where a probe snippet must be spliced."""
    line: str # Original line
    probe: Probe # Probe for matching (annotation was `str`, but cuda.py passes Probe objects)
    before_after: bool # True if before and False if after -> to distinguish which snippet is used
+
+
@dataclass
class KernelParam:
    """A kernel parameter: its data type string and its name."""
    dtype: str # parameter data type — NOTE(review): usage not visible in this chunk; presumably a PTX .param type, confirm
    name: str # parameter name
+
+
+# NOTE following is just protocol, please implement yours, developers can
+# extend other functions for their need, just keep following implemented
+
def get_arch() -> str:
    """get architecture for assembler

    Protocol stub: each platform backend (e.g. cuda.py) provides its own.
    """
    ...
+
def dump(workdir: str, name: str, suffix: str) -> str:
    """call objdump to extract assembly from binary

    Protocol stub: returns the extracted assembly text.
    """
    ...
+
def prune(ptx: str, entry_name: str):
    """Prune Assembly to specific entry_name

    Protocol stub: narrows a multi-entry assembly dump to one kernel entry.
    """
    ...
+
def probing(asm: str, probes: list[Probe]):
    """Probe the probes into asm

    Protocol stub: splices the probe snippets into the assembly text.
    """
    ...
+
def assemble(workdir: str, name: str):
    """call assembler to turn assembly to machine code

    Protocol stub.
    """
    ...
+
def write_kernel_info(name: str, params, probe_mem_sizes: list[int],
                      workdir: str, analyze_hook: str = "", file_name: str = "kernel.info"):
    """write kernel info for hook driver to read back

    Protocol stub: persists kernel metadata (params, probe memory sizes,
    optional analysis hook) into *file_name* under *workdir*.
    """
    ...
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py
new file mode 100644
index 0000000..9f4c1c7
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/cuda.py
@@ -0,0 +1,786 @@
+"""Neutrino Probing Engine, CUDA Implementation"""
+
+import os
+import sys
+import shutil
+import subprocess
+import traceback # usef for print backtrace to log file instead of stdout
+import toml # to load probes from envariables
+from neutrino.common import Register, Probe, Map, load
+from neutrino.probe import Ref, KernelParam
+
# NOTE(review): import-time side effects — this module is invoked as a
# standalone tool with argv[1] pointing at the working directory
workdir = sys.argv[1] # directory contains original.bin
log = open(os.path.join(workdir, "process.log"), 'w') # shared process log, kept open for module lifetime

# a macro like terms
SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 }
+
+# TODO move it to global variable or configurable
def get_arch() -> str:
    """Return the compute arch of the first GPU for the assembler, e.g. "sm_89".

    Queries ``nvidia-smi --query-gpu=compute_cap`` and rewrites the reported
    ``major.minor`` (e.g. ``8.9``) as ``sm_<major><minor>``.  Only the first
    line (first GPU) of the output is used.
    """
    # NOTE auto-detection may fail on some systems (no nvidia-smi in PATH);
    # hard-code the returned arch here if needed
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
        stdout=subprocess.PIPE,
        text=True)
    # sm_version like `8.9`
    sm_version = result.stdout.split("\n")[0].strip()
    major, minor = sm_version.split(".")
    return f"sm_{major}{minor}"
+
def dump(workdir: str, name: str = "original", suffix: str = ".bin") -> str:
    """Extract PTX from cuda binaries (cubin or fatbin) via cuobjdump

    NOTE accept three kind of binary:
    1. fatbin @see https://docs.nvidia.com/cuda/nvfatbin/index.html
    2. cubin @see https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html
    3. PTX text file - no need to further process, just rename it to .ptx

    Side effects: writes the extracted text to <workdir>/<name>.ptx and
    records which extraction path succeeded in the module log.
    Raises ValueError if no PTX can be found by any method.
    """
    bin_path = os.path.join(workdir, name) + suffix
    # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
    result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
    out = result.stdout  # NOTE(review): unused until reassigned below
    if "ASCII text" in result.stdout: # raw PTX file, just read it all
        shutil.copyfile(bin_path, os.path.join(workdir, name) + ".ptx")
        print("[objdump] bin is ptx", file=log)
        with open(os.path.join(workdir, name) + ".ptx", "r") as outf:
            return outf.read()
    # then try cuobjdump -ptx flag
    result = subprocess.run(
        ['cuobjdump', '-ptx', bin_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True)
    out = result.stdout
    if len(result.stderr) > 0:
        print(result.stderr, file=log)
    if out.find(".version") != -1:
        start = out.index(".version") # ptx valid part starts with .version
        with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
            outf.write(out[start:])
        print("[objdump] via cuobjdump -ptx", file=log)
        return out[start:]
    else:
        # finally try cuobjdump -elf to dump elf content and check .nv_debug_ptx_txt
        result = subprocess.run(['cuobjdump', '-elf', bin_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if len(result.stderr) > 0:
            print(result.stderr, file=log)
        sections = result.stdout.split(".section ") # don't remove the space
        for section in sections:
            if section.startswith(".nv_debug_ptx_txt"): # PTX Info
                # write to the original.ptx
                start = section.index(".version")
                with open(os.path.join(workdir, name) + ".ptx", "w") as outf:
                    outf.write(section[start:])
                print("[objdump] via cuobjdump -elf", file=log)
                return section[start:]
        # if still not found
        raise ValueError("PTX Not Found in CUBIN")
+
def _find_matching_close(ptx: str, pos: int) -> int:
    """Return the index just past the '}' closing the '{' opened before *pos*.

    Scans forward balancing braces.  Both find() results are clamped to
    len(ptx) when not found — the previous inline versions left the initial
    next-open-brace at -1, so `-1 < next_ket` wrongly took the open branch
    and could mis-parse or loop.  Unterminated bodies stop at end of text.
    """
    length = len(ptx)
    depth = 1  # one '{' already open
    while depth > 0 and pos < length:
        next_bra = ptx.find("{", pos)
        next_ket = ptx.find("}", pos)
        if next_bra == -1:
            next_bra = length
        if next_ket == -1:
            next_ket = length
        if next_bra < next_ket:      # another nested body opens first
            pos = next_bra + 1
            depth += 1
        elif next_ket < next_bra:    # current body (or a nested one) closes
            pos = next_ket + 1
            depth -= 1
        else:                        # both exhausted: malformed/unterminated
            break
    return pos

def prune(ptx: str, entry_name: str) -> tuple[str, str, str, str]:
    """a minimum parser to truncate the ptx for specific entry

    Use this function to locate a specific entry with entry_name, as a single
    objdumped PTX usually has > 1 entry (try cuobjdump -ptx libcublas.so).

    Returns (global_section, func_section, entry_section, entry_name) where
    entry_name may have been shortened by the fuzzy match below.

    NOTE verified on PTX from NVCC GCC Backend and LLVM PTX Backend

    Raises:
        ValueError: if entry_name (even fuzzily truncated) is not found.
    """
    length = len(ptx)
    # first try to find entry_name; .visible .entry and .entry both
    # correspond to CUDA __global__
    entry_loc = ptx.find(f".visible .entry {entry_name}")
    if entry_loc == -1:
        entry_loc = ptx.find(f".entry {entry_name}")  # try raw .entry instead
    if entry_loc == -1:
        # NOTE fix PyTorch problem: mangled names may differ in the last few
        # characters, so retry with progressively truncated names
        for i in range(1, 30):
            entry_loc = ptx.find(f".entry {entry_name[:-i]}")
            if entry_loc != -1:
                entry_name = entry_name[:-i]
                break
        else:
            raise ValueError(f"{entry_name} not found")

    # parse the first global section - shall be included for global info;
    # global info is complicated and not used later, just keep it verbatim
    start_visible_entry = ptx.find(".visible .entry")
    if start_visible_entry == -1:
        start_visible_entry = length
    start_entry = ptx.find(".entry")
    if start_entry == -1:
        start_entry = length
    start = min(start_visible_entry, start_entry)
    # includes helpers like __assertfail and definitions such as gmems
    global_section = ptx[:start]

    # parse the .func bodies (CUDA __device__), which entries may call;
    # very little code arrives here, so overhead is negligible
    func_sections = []
    start_func = start
    while True:
        start_func = ptx.find(".func", start_func)
        if start_func == -1:
            break
        body_open = ptx.find("{", start_func) + 1
        end = _find_matching_close(ptx, body_open)
        func_sections.append(ptx[start_func:end])
        start_func = end
    func_section = "\n".join(func_sections)

    # given entry_loc, capture the entry's whole brace-balanced body
    body_open = ptx.find("{", entry_loc) + 1
    end = _find_matching_close(ptx, body_open)
    entry_section = ptx[entry_loc:end]

    return global_section, func_section, entry_section, entry_name
+
+"""
+NOTE: templates for thread-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any thread-constant probes
+
+Most registers below is duplicate and will be optimized by PTXAS
+TODO Optimize calculation for 1D/2D Indexing (many kernel don't use 3D Indexing
+"""
+COMMON_BUFFER_CALC = """// begin buffer calculation
+.reg .b32 %buf<20>; // b32 reg to record access, will be optimized by ptxas
+mov.u32 %buf2, %tid.x; // threadIdx.x
+mov.u32 %buf3, %tid.y; // threadIdx.y
+mov.u32 %buf4, %tid.z; // threadIdx.z
+mov.u32 %buf5, %ntid.x; // blockDim.x
+mov.u32 %buf6, %ntid.y; // blockDim.y
+mov.u32 %buf7, %ntid.z; // blockDim.z
+mov.u32 %buf8, %ctaid.x; // blockIdx.x
+mov.u32 %buf9, %ctaid.y; // blockIdx.y
+mov.u32 %buf10, %ctaid.z; // blockIdx.z
+mov.u32 %buf11, %nctaid.x; // gridDim.x
+mov.u32 %buf12, %nctaid.y; // gridDim.y
+mad.lo.s32 %buf13, %buf6, %buf4, %buf3; // blockDim.y * threadIdx.z + threadIdx.y
+mad.lo.s32 %buf15, %buf13, %buf5, %buf2; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x
+mad.lo.s32 %buf16, %buf12, %buf10, %buf9; // gridDim.y * blockIdx.z + blockIdx.y
+mad.lo.s32 %buf17, %buf16, %buf11, %buf8; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x
+mul.lo.s32 %buf18, %buf5, %buf6; // blockDim.x * blockDim.y
+mul.lo.s32 %buf19, %buf18, %buf7; // blockSize = blockDim.x * blockDim.y * blockDim.z
+mad.lo.s32 %buf1, %buf17, %buf19, %buf15; // buf_idx = block_idx * blockSize + thread_idx
+// end buffer calculation"""
+
+"""
+NOTE templates for warp-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any warp-constant probes
+
+Most registers below is duplicate and will be optimized by PTXAS
+"""
+WARP_BUFFER_CALC = """// begin buffer calculation
+.reg .b32 %warpbuf<21>; // b32 reg to record access, will be optimized by ptxas
+.reg .pred %leader; // predicate register
+.reg .pred %joint_pred; // used to store AND result of %leader and instruction operand
+mov.u32 %warpbuf2, %laneid; // read lane id
+setp.eq.u32 %leader, %warpbuf2, 0; // check if thread is warp leader
+@%leader mov.u32 %warpbuf3, %nwarpid; // warpDim := number of warp in current group
+@%leader mov.u32 %warpbuf4, %tid.x; // threadIdx.x
+@%leader mov.u32 %warpbuf5, %tid.y; // threadIdx.y
+@%leader mov.u32 %warpbuf6, %tid.z; // threadIdx.z
+@%leader mov.u32 %warpbuf7, %ntid.x; // blockDim.x
+@%leader mov.u32 %warpbuf8, %ntid.y; // blockDim.y
+@%leader mov.u32 %warpbuf18, %ntid.z; // blockDim.z
+@%leader mov.u32 %warpbuf9, %ctaid.x; // blockIdx.x
+@%leader mov.u32 %warpbuf10, %ctaid.y; // blockIdx.y
+@%leader mov.u32 %warpbuf11, %ctaid.z; // blockIdx.z
+@%leader mov.u32 %warpbuf12, %nctaid.x; // gridDim.x
+@%leader mov.u32 %warpbuf13, %nctaid.y; // gridDim.y
+@%leader mad.lo.s32 %warpbuf14, %warpbuf8, %warpbuf6, %warpbuf5; // blockDim.y * threadIdx.z + threadIdx.y
+@%leader mad.lo.s32 %warpbuf15, %warpbuf14, %warpbuf7, %warpbuf4; // thread_idx = (blockDim.y * threadIdx.z + threadIdx.y) * blockDim.x + threadIdx.x
+@%leader div.s32 %warpbuf15, %warpbuf15, 32; // get persistent warpid instead of dynamic %warpid
+@%leader mad.lo.s32 %warpbuf16, %warpbuf13, %warpbuf11, %warpbuf10; // gridDim.y * blockIdx.z + blockIdx.y
+@%leader mad.lo.s32 %warpbuf17, %warpbuf16, %warpbuf12, %warpbuf9; // block_idx = (gridDim.y * blockIdx.z + blockIdx.y) * gridDim.x + blockIdx.x
+@%leader mul.lo.s32 %warpbuf19, %warpbuf7, %warpbuf8;
+@%leader mul.lo.s32 %warpbuf20, %warpbuf19, %warpbuf18;
+@%leader div.s32 %warpbuf20, %warpbuf20, 32;
+@%leader mad.lo.s32 %warpbuf1, %warpbuf17, %warpbuf20, %warpbuf15; // buf_idx = block_idx * warpSize + warpIdx
+// end buffer calculation"""
+
# NOTE buffer location for thread-local buffers, every probe has independent this part
THREAD_PROBE_BUFFER = """// begin {name} map
.reg .b64 %map_{name}<5>; // register group defn
mul.wide.s32 %map_{name}4, %buf1, {no_bytes}; // get buffer location, no_bytes is per thread
ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} map"""

# NOTE buffer of the dynamic stuffs
# same as THREAD_PROBE_BUFFER, but the per-thread byte count is read from a
# kernel parameter (bytes_{name}) instead of being baked in at probe time
THREAD_PROBE_DYNAMIC_BUFFER = """// begin {name} dynamic map
.reg .b64 %map_{name}<5>; // register group defn
.reg .b32 %cnt_{name}; // The dynamic count of buffer size
ld.param.u32 %cnt_{name}, [bytes_{name}]; // load sizes from .param state spaces
mul.wide.s32 %map_{name}4, %buf1, %cnt_{name}; // get buffer location, no_bytes is per thread
ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} dynamic map"""

# NOTE buffer location for warp-local buffers, every probe has independent this part
# (all instructions predicated on %leader declared in WARP_BUFFER_CALC)
WARP_PROBE_BUFFER = """// begin {name} map
.reg .b64 %map_{name}<5>; // register group defn
@%leader mul.wide.s32 %map_{name}4, %warpbuf1, {no_bytes}; // get buffer location, no_bytes is per thread
@%leader ld.param.u64 %map_{name}3, [param_{name}]; // load address from .param state space
@%leader cvta.to.global.u64 %map_{name}2, %map_{name}3; // convert address to .global state space
@%leader add.s64 %map_{name}1, %map_{name}2, %map_{name}4; // offset to get final thread-specific address
// end {name} map"""

# NOTE for every probe with datamodel not none
# only support .u64 and recommend use 16 bytes alignment, minimum is 8 bytes
PROBE_PARAM = ".param .u64 param_{name}"
COUNT_PARAM = ".param .u32 bytes_{name}"
+
# NOTE This is a special probe applied if dynamic = True, to be filled with count_inst and count_size
# (TOML text with {count_inst}/{count_size} placeholders; the doubled {{ }}
# braces are escapes so the literal braces survive str.format)
COUNT_PROBE = """regs = 1
[map.Count]
level = "thread"
type = "array"
size = 8
cap = 1

[map.Count.regs]
count = [ "u64", "None", ]

[probe.Save]
pos = "kernel"
level = "thread"
before = "mov.u64 %NR0, 0;"
after = "SAVE [ Count ] {{ %NR0 }};"

[probe.Count]
pos = "{count_inst}"
level = "thread"
before = "add.u64 %NR0, %NR0, {count_size};"
"""
+
+def probing(asm: str, probes: list[Probe], maps: list[Map], regs: int) -> tuple[str, list[int]]:
+ """Process the probes, the core function of probing engine"""
+
+ # NOTE parse interesting locations
+ # A mapping from location to probes, a probe can hook at multiple location
+ positions: dict[str, list[Probe]] = dict()
+ kernel_start_probes: list[Probe] = []
+ # NOTE turn kernel:end into ret:start for better matching
+ for probe in probes:
+ # different position split by ;, and inside split by : for start/end
+ for position in probe.pos:
+ if position == "kernel": # turn into listening instructions
+ if probe.after is not None:
+ if "ret;" in positions:
+ positions["ret;"].append(probe)
+ else:
+ positions["ret;"] = [probe, ]
+ if probe.before is not None:
+ kernel_start_probes.append(probe)
+ else:
+ if position in positions:
+ positions[position].append(probe)
+ else:
+ positions[position] = [probe, ]
+
+ # NOTE parse PTX Assembly
+ ptx_lines = asm.split("\n") # let's do it line by line
+ # first extract basic kernel signature
+ entry_found: bool = False # line of .entry or .visible .entry
+ entry_last_line : int = 0 # last line of entry, marked by ()
+ param_end_line : int = 0 # last line of param declaration, for probe params
+ body_start_line : int = 0 # first line of body
+ idx = 0
+ while idx < len(ptx_lines):
+ line = ptx_lines[idx]
+ if not entry_found and ".entry" in line: # entry not yet found
+ entry_found = True
+ if entry_found: # now entry is found
+ # first check if the entry has been closed
+ if ")" in line and entry_last_line == 0:
+ entry_last_line = idx
+ # if entry is closed, time for body!, another if as ) { can in one line
+ if body_start_line == 0 and "{" in line and entry_last_line >= 0:
+ body_start_line = idx
+ # if not yet reach the entry, then line with .param is param declaration
+ if ".param" in line and entry_last_line == 0:
+ param_end_line = idx
+ # here pattern matching positions TODO optimize performance here
+ else:
+ for position, probes in positions.items():
+ if position in line: # BUG might mismatch parameter with confused naming
+ # NOTE we got a match, then every probe will insert snippet before or after the line
+ # this might cause idx fluctuatting if we use idx to process it
+ line_idx = idx # a copy to fix the insertion position
+ for probe in probes:
+ # specially handle ret;, we need to place it before ret or it won't be executed
+ if position == "ret;" and probe.after is not None:
+ ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ line_idx += 1
+ else:
+ if probe.before is not None:
+ ptx_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True))
+ idx += 1
+ line_idx += 1
+ if probe.after is not None:
+ ptx_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ idx += 1
+
+ # Now add the probes to PTX Assembly
+ offset: int = 0 # adding every line need to offset 1 to make it correct
+ # First let's add parameters
+ ptx_lines[param_end_line] = ptx_lines[param_end_line] + "," # add , to indicate more param
+ # NOTE parameter layouts: Parameters are pointers to buffer, or buffer size
+ # We arange buffer pointers linearly in advance (u64), and later size (u32)
+ params_added: list[str] = []
+ count_params: list[str] = [] # NOTE used for dynamic counts only
+
+ # NOTE save the map_sizes so Hook Driver has a way to allocate the map memory
+ # we must make sure this is aligned with the order of parameter or will be illegal access
+ map_sizes: list[tuple[str, int]] = [] #
+
+ levels: set[str] = set()
+ for map_ in maps:
+ if map_.cap != "dynamic":
+ map_sizes.append((map_.level, map_.cap * map_.size))
+ params_added.append(PROBE_PARAM.format(name=map_.name))
+ levels.add(map_.level)
+ else:
+ map_sizes.append((map_.level, -1))
+ params_added.append(PROBE_PARAM.format(name=map_.name))
+ count_params.append(COUNT_PARAM.format(name=map_.name))
+ levels.add(map_.level)
+
+ # else just ignore
+ params_added = params_added + count_params # formulate the layout
+ ptx_lines.insert(param_end_line + 1, ",\n".join(params_added))
+ offset += 1 # in total one line is added
+ # Now add the probe with kernel:start -> this shall not dump anything I think
+ ptx_lines.insert(body_start_line + offset + 1, f".reg .u64 %NR<{regs + 1}>;")
+ offset += 1
+ for probe in kernel_start_probes:
+ # NOTE kernel:start probe has no helpers and have no predicate
+ ptx_lines.insert(body_start_line + offset + 1, probe.before) # None is checked before
+ offset += 1
+ # Now add the common buffer calculation
+ if "thread" in levels:
+ ptx_lines.insert(body_start_line + offset + 1, COMMON_BUFFER_CALC)
+ offset += 1
+ if "warp" in levels:
+ ptx_lines.insert(body_start_line + offset + 1, WARP_BUFFER_CALC)
+ offset += 1
+ # Now add the individual buffer calculation
+ # NOTE add the handle of u32 registers
+ for map_ in maps:
+ num_u32 = 0
+ for reg in map_.regs:
+ if reg.dtype == "u32":
+ num_u32 += 1
+ if map_.level == "thread":
+ if map_.cap != "dynamic":
+ buffer_asm = THREAD_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size))
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ else:
+ buffer_asm = THREAD_PROBE_DYNAMIC_BUFFER.format(name=map_.name)
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ elif map_.level == "warp":
+ if map_.cap != "dynamic":
+ buffer_asm = WARP_PROBE_BUFFER.format(name=map_.name, no_bytes=str(map_.cap * map_.size))
+ if num_u32 > 0:
+ buffer_asm = buffer_asm + f"\n.reg .u32 %{map_.name}_u32_<{num_u32 + 1}>;"
+ ptx_lines.insert(body_start_line + offset + 1, buffer_asm)
+ offset += 1
+ else:
+ raise NotImplementedError()
+ else:
+ raise NotImplementedError()
+
+ name_to_map: dict[str, Map] = {m.name: m for m in maps}
+ # Now add the instruction listenings
+ for idx in range(len(ptx_lines)):
+ # ignore most of line that is a string!
+ if type(ptx_lines[idx]) == Ref: # NOTE isinstance is slow?
+ line: str = ptx_lines[idx].line
+ probe: Probe = ptx_lines[idx].probe
+ before_after: str = ptx_lines[idx].before_after
+ # parse instruction operands, operands are separated by space fundamentally
+ tmp = line[:line.index(";")].split(",")
+ operands: list[str] = []
+ # NOTE handling vectorized operands with { and }
+ merges = []
+ merging: bool = False
+ for operand in tmp:
+ if "{" in operand and not "}" in operand:
+ merging = True
+ merges.append(operand)
+ elif "}" in operand and not "{" in operand:
+ merges.append(operand) # FIX, now operand is the last one and shall be included
+ operands.append(",".join(merges).strip("{} ")) # we don't want {} remains
+ merges = [] # flush merges
+ merging = False # reset status
+ else:
+ operands.append(operand) if not merging else merges.append(operand)
+ # first operand also have pred, inst and the real first operand
+ remaining = operands[0].strip() if len(operands) > 0 else print(line, tmp, operands, merges)
+ # handle predicate -> used in final insertion
+ if "@" in remaining:
+ pred = remaining[:remaining.index(" ") + 1] # include the space!
+ remaining = remaining[remaining.index(" ") + 1:].strip()
+ else:
+ pred = ""
+ # TODO assert matching instruction
+ mem_bytes: str = None
+ out: str = None
+ if remaining.find(" ") != -1:
+ inst = remaining[:remaining.index(" ")]
+ # NOTE a helper to calculate bytes, ld and st's bytes are inferred not from operand
+ # but the instruction body (likewise ld.global.v2.u64)
+ if "ld" in inst or "st" in inst:
+ vec = 1
+ if "v2" in inst or "x2" in inst:
+ vec = 2
+ elif "v4" in inst or "x4" in inst:
+ vec = 4
+ # most dtypes are u32, no worries
+ dtypes = ["u32", "u64", "b16", "u16", "u8", "f32", "f64", "b128", "s32", "s64", "s16", "s8", "b32", "b64", "b8"]
+ for dtype in dtypes:
+ if dtype in inst:
+ mem_bytes = str(vec * int(dtype[1:]) // 8) # 8 "= size"
+ break
+ out = remaining[remaining.index(" "):].strip()
+ if "[" in out:
+ # NOTE handle [ addr ] used to indicate the memory address
+ out = out[out.index("[") + 1 : out.index("]")] if out is not None and "[" in out else out # fix
+ in1: str = operands[1] if len(operands) >= 2 else None
+ # NOTE handle [ addr ] used to indicate the memory address
+ in1 = in1[in1.index("[") + 1 : in1.index("]")] if in1 is not None and "[" in in1 else in1
+ in2 = operands[2] if len(operands) >= 3 else None
+ in3 = operands[3] if len(operands) >= 4 else None
+ # TODO handle some weird syntax like + 0 used meaninglessly to locate correct places
+ # Currently only a minimal solution
+ if out is not None and "+" in out: out = out[:out.find("+")]
+ if in1 is not None and "+" in in1: in1 = in1[:in1.find("+")]
+ # print(line, out, in1, in2, sep=" / ")
+ # now handles operand helpers by directly replacing the value
+ snippet = probe.before if before_after else probe.after
+ snippet = snippet.replace("OUT", out) if "OUT" in snippet else snippet
+ snippet = snippet.replace("IN1", in1) if "IN1" in snippet else snippet
+ snippet = snippet.replace("IN2", in2) if "IN2" in snippet else snippet
+ snippet = snippet.replace("IN3", in3) if "IN3" in snippet else snippet
+ # NOTE add a new helper named ADDR referencing gmem address
+ if "ADDR" in snippet:
+ if "ld" in operands[0] or "cp.async" in operands[0]:
+ snippet = snippet.replace("ADDR", in1)
+ elif "st" in operands[0]: # st has
+ snippet = snippet.replace("ADDR", out)
+ if mem_bytes is not None:
+ snippet = snippet.replace("BYTES", mem_bytes) if "BYTES" in snippet else snippet
+ # now handles STORE helpers
+ snippet_lines = snippet.split("\n")
+ # NOTE special arrangements for warp datamodel
+ org_pred = pred
+ if probe.level == "warp":
+ if pred == "":
+ pred = "@%leader " # apply filter that only leader works
+ else:
+ pred = "@%joint_pred " # will be updated %leader AND pred
+
+ for snippet_line_idx in range(len(snippet_lines)):
+ snippet_line: str = snippet_lines[snippet_line_idx]
+ if "SAVE" in snippet_line: # only one save, at the begin of line
+ # SAVE [ block_sched ] { R0, R1, R3 }
+ map_ = name_to_map[snippet_line[snippet_line.index("[") + 1: snippet_line.index("]")].strip()]
+ # dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8]
+ items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",")
+ assert len(items) == len(map_.regs), f"{map_.name}.save not follow definition"
+ cvt_lines, u64s, u32s = [], [], []
+ u32_idx = 1 # NOTE must be 1 due to some syntax issue I think
+ for item, reg in zip(items, map_.regs):
+ if reg.dtype == "u64":
+ u64s.append(item)
+ elif reg.dtype == "u32": # apply conversion
+ cvt_lines.append(f"{pred}cvt.u32.u64 %{map_.name}_u32_{u32_idx}, {item};\n")
+ u32s.append(f"%{map_.name}_u32_{u32_idx}")
+ u32_idx += 1
+ else:
+ raise NotImplementedError
+ assert len(u32s) % 2 == 0, "Must save 2n u32 registers for memory alignment, please promote to u64"
+ save_lines = []
+ for item_idx in range(len(u64s) // 2):
+ save_lines.append(f"{pred}st.global.v2.u64 [%map_{map_.name}1], {{ {u64s[item_idx * 2]}, {u64s[item_idx * 2 + 1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;")
+ if len(u64s) % 2 != 0: # odd number -> one item left!
+ save_lines.append(f"{pred}st.global.u64 [%map_{map_.name}1], {u64s[-1]};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;")
+ for item_idx in range(len(u32s) // 4):
+ save_lines.append(f"{pred}st.global.v4.u32 [%map_{map_.name}1], {{ {u32s[item_idx * 4]}, {u32s[item_idx * 4 + 1]}, {u32s[item_idx * 4 + 2]}, {u32s[item_idx * 4 + 3]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 16;")
+ if len(u32s) % 4 != 0: # two items left...
+ save_lines.append(f"{pred}st.global.v2.u32 [%map_{map_.name}1], {{ {u32s[-2]}, {u32s[-1]} }};\n{pred}add.s64 %map_{map_.name}1, %map_{map_.name}1, 8;")
+ snippet_lines[snippet_line_idx] = "\n".join(cvt_lines) + "\n".join(save_lines)
+ else:
+ # or just add the pred!
+ # NOTE handling warp that having double buffer
+ if ".reg" not in snippet_line and snippet_line.strip() not in ("{", "}"):
+ snippet_lines[snippet_line_idx] = pred + snippet_line
+ else:
+ snippet_lines[snippet_line_idx] = snippet_line
+ if probe.level == "warp" and org_pred != "":
+ snippet_lines.insert(0, f"and.pred %tmp, %leader, {org_pred[1:]}; // joint prediction") # ignore the '@' signal at first
+ snippet = "\n".join(snippet_lines)
+ # finally replace the Ref with snippet to finish the probing!
+ ptx_lines[idx] = snippet
+
+ # Finally finished.1
+ return "\n".join(ptx_lines)
+
def assemble(workdir: str, name: str) -> None:
    """Compile <workdir>/<name>.ptx into <workdir>/<name>.bin via ptxas.

    NOTE: not strictly required for execution, because the CUDA Driver
    (cuModuleLoad) can JIT-load PTX directly — but `ptxas --verbose`
    output (register/smem usage, errors) is valuable for debugging.
    """
    base = os.path.join(workdir, name)
    command = ["ptxas", f"-arch={get_arch()}", "-m64", "--verbose",
               base + ".ptx", "-o", base + ".bin"]
    print(" ".join(command), file=log)
    # check=True raises CalledProcessError on a non-zero exit status
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True
    )
    # forward assembler diagnostics (stderr first, then stdout) to process.log
    for stream in (result.stderr, result.stdout):
        if stream:
            print(stream.decode("utf-8"), file=log)
+
def parse_params(ptx: str) -> tuple[list[KernelParam], str]:
    """Parse kernel function parameters and the kernel name from PTX.

    @see https://docs.nvidia.com/cuda/parallel-thread-execution/#kernel-function-parameters

    NOTE needed because cuLaunchKernel receives void** as kernelParams and the
    number of valid parameters cannot be inferred from void** alone (the
    NVIDIA driver performs similar parsing for the same reason).
    """
    lparen = ptx.find("(")
    name_begin = ptx.rfind(" ", 0, lparen)  # kernel name sits right before '('
    rparen = ptx.find(")", lparen)
    params: list[KernelParam] = []
    for raw_line in ptx[lparen + 1:rparen].split("\n"):
        if ".param" not in raw_line:
            continue
        # tokens look like ['.param', '.u64', ..., '<name>']
        tokens = raw_line.strip(" ,").split(" ")
        dtype = tokens[1][1:]  # drop the leading '.' of .s32 / .u64 / ...
        params.append(KernelParam(dtype, tokens[-1]))
    # +1 skips the space separating the return spec from the kernel name
    return params, ptx[name_begin + 1:lparen]
+
+
def write_kernel_info(name: str, params: list[KernelParam], map_sizes: list[tuple[str, int]],
                      workdir: str, file_name: str = "kernel.info"):
    """Write kernel metadata to <workdir>/<file_name> for the hook driver.

    File layout, one item per line:
      1. kernel (mangled) name
      2. number of kernel parameters — needed to parse the void** kernelParams
      3. number of maps backed by device memory
      4+. one "<datamodel_id>,<size_in_bytes>" line per map, in parameter
          order (must stay aligned with the probed PTX parameter layout;
          size -1 marks a dynamically sized map)

    BUGFIX: the `map_sizes` annotation was `list[int]`, but every caller
    passes `(level, size)` tuples and the loop below unpacks them.
    """
    # TODO add support for vectorized items
    with open(os.path.join(workdir, file_name), "w") as f:
        print(name, file=f)
        print(len(params), file=f)
        print(len(map_sizes), file=f)
        for probe_type, size in map_sizes:
            # probe_type is a datamodel level ("thread"/"warp") mapped to an id
            print(f"{SUPPORTED_DATAMODEL[probe_type]},{size}", file=f)
+
# ENTRY for this tool
if __name__ == "__main__":
    # no argparse as the CLI is straightforward
    workdir = sys.argv[1]  # directory containing original.bin
    # the kernel name may carry non-UTF8 bytes from the driver; drop them
    kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore')

    probes: dict
    if len(sys.argv) > 3:  # NOTE to facilitate debugging, not used in production
        probe_path = sys.argv[3]
        probes = toml.load(probe_path)
        print(probes)
    else:  # the path in production: probes travel via environment variable
        probe_envvar = os.environ.get("NEUTRINO_PROBES")
        if probe_envvar is None:
            raise ValueError("Can not read probes from environment variable 'NEUTRINO_PROBES'")
        probes = toml.loads(probe_envvar)

    # parse the environment variable for filtered-out kernels, used for
    # 1. buggy kernels that crash the system -> many GPU errors are unrecoverable
    # 2. uninteresting kernels such as vectorized_elementwise in PyTorch
    filter_out = os.environ.get("NEUTRINO_FILTER", "")
    filter_out = filter_out.split(":") if len(filter_out) > 0 else None
    print(filter_out, file=log)

    filter_in = os.environ.get("NEUTRINO_KERNEL", "")
    filter_in = filter_in.split(":") if len(filter_in) > 0 else None
    print(filter_in, file=log)

    # NOTE check if some map is declared dynamic; if so we need an extra
    # counting pass with different arrangements.
    # BUGFIX: bool(os.environ.get("NEUTRINO_DYNAMIC", 0)) was truthy for ANY
    # non-empty string, including "0" — parse falsy spellings explicitly.
    dynamic = os.environ.get("NEUTRINO_DYNAMIC", "") not in ("", "0", "false", "False")

    try:
        # first objdump binary to ptx
        ptx = dump(workdir)
        # then truncate ptx for entry_name
        global_section, func_section, entry_section, _ = prune(ptx, kernel_name)
        # split and process ptx lines and write kernel info
        params, kernel_name = parse_params(entry_section)

        # basic logging
        print(kernel_name, file=log)
        if filter_in:
            # keep only kernels whose name contains one of the requested patterns
            if not any(pattern in kernel_name for pattern in filter_in):
                print(f"{kernel_name} is not in {filter_in}", file=log)
                exit(1)
        if filter_out:
            for pattern in filter_out:
                if pattern != "" and pattern in kernel_name:
                    print(f"{kernel_name} filtered out from {filter_out}", file=log)
                    exit(1)

        # write pruned ptx to file
        pruned_ptx = global_section + "\n" + func_section + "\n" + entry_section
        with open(os.path.join(workdir, "pruned.ptx"), "w") as f:
            f.write(pruned_ptx)

        # convert probes from Python dict to data structures
        probes, maps, regs = load(probes)

        # NOTE generate the counting kernel for dynamically sized maps
        if dynamic:
            # locate the (first) map whose cap is "dynamic"
            count_inst = []
            count_size = 0
            count_map = ""
            for map_ in maps:
                if map_.cap == "dynamic":
                    count_size = map_.size
                    count_map = map_.name
                    break
            for probe in probes:
                # NOTE there might be a bug that before and after can only save once
                # NOTE(review): `probe.pos` here but `probe.position` elsewhere — confirm attribute name
                if probe.before is not None and "SAVE" in probe.before and count_map in probe.before:
                    count_inst = count_inst + probe.pos
                if probe.after is not None and "SAVE" in probe.after and count_map in probe.after:
                    count_inst = count_inst + probe.pos
            assert len(count_inst) > 0
            count_inst = ":".join(count_inst)
            count_probe = COUNT_PROBE.format(count_inst=count_inst, count_size=count_size)
            count_probe, count_map, count_reg = load(toml.loads(count_probe))
            count_ptx = probing(entry_section, count_probe, count_map, count_reg)
            count_ptx = global_section + "\n" + func_section + "\n" + count_ptx
            with open(os.path.join(workdir, "countd.ptx"), "w") as f:
                f.write(count_ptx)

        # sizes the hook driver must allocate, aligned with parameter order
        map_sizes: list[tuple[str, int]] = []
        for map_ in maps:
            if isinstance(map_.cap, int):
                map_sizes.append((map_.level, map_.size * map_.cap))
            elif map_.cap == "dynamic":
                map_sizes.append((map_.level, -1))  # -1 := size only known at runtime
            else:
                raise NotImplementedError

        # process ptx lines
        probed_ptx = probing(entry_section, probes, maps, regs)

        # merge global and func sections back
        probed_ptx = global_section + "\n" + func_section + "\n" + probed_ptx

        # write ptx to file
        with open(os.path.join(workdir, "probed.ptx"), "w") as f:
            f.write(probed_ptx)

        write_kernel_info(kernel_name, params, map_sizes, workdir)

        # compile ptx to binary, we want both probed and pruned
        assemble(workdir, "probed")
        assemble(workdir, "pruned")
        if dynamic:
            assemble(workdir, "countd")

    except Exception as e:
        # top-level boundary: log the full traceback and signal failure
        traceback.print_exc(file=log)
        exit(1)
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py
new file mode 100644
index 0000000..39bd02c
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/probe/hip.py
@@ -0,0 +1,486 @@
+"""Neutrino Probing Engine, AMD ROCm HIP Implementation
+
+HIP's AMDGCN (.amdgcn) has only one source: LLVM AMDGPU Backend
+CITE https://llvm.org/docs/AMDGPUUsage.html
+
+GCNAsm is similar to x86 assembly, kernel mainly has two parts (two .text):
+1. GCNAsm Code in form of .section .text
+2. AMD HSA Configuration or said `.amdgpu_metadata`
+
+NOTE At the moment, we only support part of syntax because there's no official
+ documentation on syntax / semantics of GCN Assembly (only instructions).
+
+Important GCNAsm syntax for Neutrino developers:
+1. s[0:1] holds pointer to kernargs, use s_load_dword to ld.param
+2. v0 (32bit) holds 00 + threadIdx.z(30-20)+threadIDx.y(20-10)+threadIdx.x(10-0)
+3. blockIdx.xyz is the next 3 registers after first three, gridDim.xyz is ttmp8/9/10
+4. It seems blockDim can not be retrieved easily from special registers, only via
+
+It's worth noticed that v0 and s[0:1] can be changed by developers, i.e., their
+value (threadIdx.xyz) is only available at kernel begins.
+
+NOTE Why not fully support?
+Because we can not find AMD GPUs for testing or debugging (we are not AMD).
+There's nearly no cloud providers for AMD GPUs (only MI300x on runpods.io).
+
+And part of AMD's ISA is ridiculous, for example, until CDNA3, add two u64
+is finally supported on VGPRs, still not supported on SGPRs. Why? Tell me why?
+"""
+
+from typing import List, Tuple, Optional, Dict, Set
+import os
+import sys
+import shutil
+import subprocess
+import traceback # usef for print backtrace to log file instead of stdout
+import toml # to load probes from envariables
+import yaml # AMD GCN ASM use YAML as METADATA Storage
+from dataclasses import dataclass
+from neutrino.common import Register, Probe, Map, load
+from neutrino.probe import Ref, KernelParam
+
# NOTE(review): this runs at import time — importing the module without CLI
# args raises IndexError; fine for a CLI-only tool, but confirm nothing imports it
workdir = sys.argv[1] # directory contains original.bin
# shared process log; never closed explicitly (short-lived CLI, OS reclaims it)
log = open(os.path.join(workdir, "process.log"), 'w')

# a macro like terms: datamodel level -> numeric id written into kernel.info
SUPPORTED_DATAMODEL = { "thread": 0, "warp": 1 }

# NOTE applicable to CDNA GPUs but might not be applicable to GDNA GPUs
# TODO change to amdgpu_metadata['amdhsa.kernels'][0]['.wavefront_size']
WARP_SIZE = 64
+
@dataclass
class KernelParam: # NOTE GCNASM has a different definition than the PTX engine
    # value kind from amdgpu_metadata '.args' (e.g. 'global_buffer', 'by_value')
    value_kind: str
    # argument size in bytes, from '.size' in the metadata
    size: int
+
# NOTE it's risky but safe as this is a CLI tool invoked for specific kernel
# module-level mutable state: populated by prune(), read by parse_params()/probing()
amdgpu_metadata: Optional[Dict] = None
+# TODO finalize rocm-smi toolchain
+def get_arch() -> str:
+ """At the moment, we extract target arch from the assembly, but not sure
+ if this may leads to misleading arch for codegen, will see"""
+ ...
+
+# TODO finalize llvm-objdump toolchain
+def extract(workdir: str, name: str = "original", suffix: str = ".bin") -> str:
+ bin_path = os.path.join(workdir, name) + suffix
+ # first check if it's already a NULL-Terminated PTX (i.e., ASCII Text)
+ result = subprocess.run(['file', bin_path], stdout=subprocess.PIPE, text=True)
+ out = result.stdout
+ if "ASCII text" in result.stdout: # raw PTX file, just read it all
+ shutil.copyfile(bin_path, os.path.join(workdir, name) + ".asm")
+ print("[decompile] bin is gcnasm", file=log)
+ with open(os.path.join(workdir, name) + ".asm", "r") as outf:
+ return outf.read()
+
+# TODO add prune support
+def prune(asm: str, entry_name: str) -> Tuple[str, str]:
+ """A Minimum parser to truncate the gcn asm for specific entry
+
+ Use this function to locate a specific entry with entry_name as
+ single .asm / .s usually have > 1 entry!
+ """
+ # First find the two single line of .text
+ lines = asm.split("\n")
+ sections = []
+ target = None # assembler target like .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
+ for idx in range(len(lines)):
+ if ".text" in lines[idx]:
+ sections.append(idx) # record the sections
+ elif ".amdgcn_target" in lines[idx]:
+ target = lines[idx]
+ # reorganize sections
+ kernels = []
+ for idx in range(len(sections) - 2): # last section holds gcnasm
+ if "@function" in "\n".join(lines[sections[idx] : sections[idx + 1]]):
+ kernels.append(sections[idx])
+ kernels.append(sections[-1])
+ # Now locate the entry
+ # TODO add rough matching!!!
+ entry_section = None
+ for idx in range(len(kernels) - 2):
+ temp = "\n".join(lines[kernels[idx] : kernels[idx + 1]])
+ if entry_name in temp:
+ entry_section = temp
+ assert entry_section is not None, "Fail to find"
+ # adding target if not found
+ if target not in entry_section:
+ entry_section = entry_section.split("\n")
+ entry_section = entry_section[0] + target + "\n".join(entry_section[1:])
+ # fix the metadata section
+ last_section = "\n".join(lines[sections[-1]:])
+ assert ".amdgpu_metadata" in last_section
+ metadata = last_section[last_section.index(".amdgpu_metadata") + 16: last_section.index(".end_amdgpu_metadata") - 1] # BUG -1 is a fix
+ global amdgpu_metadata
+ amdgpu_metadata = yaml.safe_load(metadata)
+ for kernelmeta in amdgpu_metadata['amdhsa.kernels']:
+ if kernelmeta['.name'] == entry_name:
+ amdgpu_metadata['amdhsa.kernels'] = [kernelmeta, ] # only want this one
+ break
+ return entry_section, last_section
+
def parse_params() -> Tuple[List[KernelParam], str]:
    """Read the kernel name and argument list from the parsed amdgpu_metadata.

    Must be called after prune() has populated the global `amdgpu_metadata`
    (and narrowed 'amdhsa.kernels' to the single target kernel).
    """
    global amdgpu_metadata
    kernel_meta = amdgpu_metadata['amdhsa.kernels'][0]
    params = [KernelParam(arg['.value_kind'], arg['.size'])
              for arg in kernel_meta['.args']]
    return params, kernel_meta['.name']
+
+"""
+NOTE: templates for thread-constant datamodel buffer calculation
+These part shall be placed ONCE at the beginning of every kernel function definition
+if there's any thread-constant probes
+
+Most registers below is duplicate and will be optimized by AMD Assembler
+"""
+
+THREAD_BUFFER_COMMON = """;;# begin buffer calculation
+V_MOV_B32 v{thread_buff}, v0 ;;# v0 holds threadIdx.x, don't know what's threadIdx.y, threadIdx.z
+;;# end buffer calculation"""
+
+WARP_BUFFER_COMMON = """;;# begin buffer calculation
+V_LSHRREV_B32_E32 v{warp_buff}, 6, v0;; # shift 6 bits := // 64
+;;# end buffer calculation"""
+
+# NOTE buffer location for thread-local buffers, every probe has independent this part
+THREAD_BUFFER = """;;# begin {name} buffer
+S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
+V_MAD_I64_I32 v[{param_addr}], v{thread_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
+;;# end {name} buffer"""
+
+# BUG it shall be possible to move everything into sgpr, but I don't know how to locate
+# warpIdx in solely SGPR, please help me
+WARP_BUFFER = """;;# begin {name} buffer
+S_LOAD_DWORDX2 s[{param_reg}], s[0:1], {param_offset};;# load buffer address into 64bit register (2x32)
+V_MAD_I64_I32 v[{param_addr}], v{warp_buff}, {no_bytes}, s[{param_reg}];;# calculate the address
+;;# end {name} buffer"""
+
+def probing(asm: str, probes: List[Probe]) -> Tuple[str, List[int], str]:
+ """Probing the Assembly, the core of probing engine
+
+ NOTE we assume probe is parsed and (security checked)"""
+
+ # NOTE parse interesting locations
+ # A mapping from location to probes, a probe can hook at multiple location
+ positions: Dict[str, List[Probe]] = dict()
+ kernel_start_probes: List[Probe] = []
+ # NOTE turn kernel:end into ret:start for better matching
+ print(probes)
+ for probe in probes:
+ # different position split by ;, and inside split by : for start/end
+ for position in probe.position:
+ if position == "kernel": # turn into listening instructions
+ if probe.after is not None:
+ if "s_endpgm" in positions: # AMD use s_endpgm to terminate
+ positions["s_endpgm"].append(probe)
+ else:
+ positions["s_endpgm"] = [probe, ]
+ if probe.before is not None:
+ kernel_start_probes.append(probe)
+ else:
+ if position in positions:
+ positions[position].append(probe)
+ else:
+ positions[position] = [probe, ]
+
+ # NOTE parse GCN Assembly
+ gcn_lines = asm.split("\n") # let's do it line by line
+ # first extract basic kernel signature
+ body_start_line : int = 0 # first line of body
+ idx = 0
+ # NOTE specially handle kernel start probe
+
+ while idx < len(gcn_lines):
+ line = gcn_lines[idx]
+ # First try to find ; %bb.0: NOTE just the behavior of hipcc not standard syntax
+ # but we don't know what's the standard syntax, there's no documentation about this...
+ # maybe this is the reason why AMD product is hard to use?
+ if "%bb.0" in line:
+ body_start_line = idx
+ # BUG move it to the real beginning before loading and saving ?
+ for probe in kernel_start_probes:
+ gcn_lines.insert(idx + 1, Ref(line=line, probe=probe, before_after=True)) # place after
+ idx += 1
+ # here pattern matching positions TODO optimize performance here
+ else:
+ for position, probes in positions.items():
+ if position in line: # BUG might mismatch parameter with confused naming
+ # NOTE we got a match, then every probe will insert snippet before or after the line
+ # this might cause idx fluctuatting if we use idx to process it
+ line_idx = idx # a copy to fix the insertion position
+ for probe in probes:
+ # specially handle ret;, we need to place it before ret or it won't be executed
+ if position == "s_endpgm" and probe.after is not None:
+ gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ line_idx += 1
+ else:
+ if probe.before is not None:
+ gcn_lines.insert(line_idx, Ref(line=line, probe=probe, before_after=True))
+ idx += 1
+ line_idx += 1
+ if probe.after is not None:
+ gcn_lines.insert(line_idx + 1, Ref(line=line, probe=probe, before_after=False))
+ idx += 1
+ idx += 1
+
+ # work with register spaces, NOTE AMD GCN Asm don't have declartion syntax
+ # for registers, just a flatten v[0:1], we need to manage them manually
+ # GCN Asm has two register spaces:
+ # 1. VGPR (v0), holding thread-spcific values
+ # 2. SGPR (s0), holding warp-specifc values
+ # TODO we can optimize warp probes to SGPR only, avoiding VGPR usage
+
+ # Now add the probes to PTX Assembly
+ offset: int = 0 # adding every line need to offset 1 to make it correct
+ probe_mem_sizes: List[Tuple[str, int]] = [] #
+ # TODO parse these from meta
+ global amdgpu_metadata
+ # NOTE here the sgpr number is wrong, there'll be 6 more, I don't know why, I can only
+ # record it at the moment.
+ # BUG SGPR number from metadata doesn't match the actual usage. Always
+ # 6 more is used. I don't know why but let's keep it.
+ sgpr_all = amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count']
+ sgpr = 0
+ for idx in range(len(gcn_lines)):
+ if type(gcn_lines[idx]) is str and ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
+ sgpr = int(gcn_lines[idx].strip().split()[1])
+ sgpr_diff = sgpr_all - sgpr
+ vgpr = amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] # used for new stuff
+ param_off = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size']
+ param_align = amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_align']
+ param_off = ((param_off + param_align - 1) // param_align ) * param_align # round up
+ params = []
+ thread_buff_vgpr, warp_buff_vgpr = None, None # conform Python scope
+
+ processed: Set[str] = set() # a set to avoid repeated process same probe that leads to error
+ datamodels: Set[str] = set()
+ for probe in probes:
+ if probe.name not in processed and probe.datamodel is not None:
+ probe_mem_sizes.append((probe.datamodel, int(probe.cap) * int(probe.no_bytes)))
+ processed.add(probe.name)
+ datamodels.add(probe.datamodel)
+
+ if "thread" in datamodels:
+ thread_buff_vgpr = f"{vgpr}"
+ gcn_lines.insert(body_start_line + offset + 1, THREAD_BUFFER_COMMON.format(thread_buff=thread_buff_vgpr))
+ offset += 1
+ vgpr += 1
+ if "warp" in datamodels:
+ warp_buff_vgpr = f"{vgpr}"
+ gcn_lines.insert(body_start_line + offset + 1, WARP_BUFFER_COMMON.format(warp_buff=warp_buff_vgpr))
+ offset += 1
+ vgpr += 1
+
+ # Now add the individual buffer calculation
+ processed = set()
+ for probe in probes:
+ if probe.name not in processed:
+ if probe.datamodel == "thread":
+ no_bytes = str(int(probe.cap) * int(probe.no_bytes))
+ gcn_lines.insert(body_start_line + offset + 1,
+ THREAD_BUFFER.format(name=probe.name, no_bytes=no_bytes,
+ param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
+ thread_buff=thread_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
+ probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
+ offset += 1
+ sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
+ vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
+ params.append({'.address_space': 'global', '.size': 8,
+ '.offset': param_off, '.value_kind': 'global_buffer'})
+ param_off += 8 # only pass in pointers so 8bytes := 64bits
+ elif probe.datamodel == "warp":
+ no_bytes = str(int(probe.cap) * int(probe.no_bytes))
+ gcn_lines.insert(body_start_line + offset + 1,
+ WARP_BUFFER.format(name=probe.name, no_bytes=no_bytes,
+ param_offset=param_off, param_reg=f"{sgpr}:{sgpr+1}",
+ warp_buff=warp_buff_vgpr, param_addr=f"{vgpr}:{vgpr+1}"))
+ probe.param_addr = f"{vgpr}:{vgpr+1}" # NOTE record the address
+ offset += 1
+ sgpr += 2 # 2x32bit registers to hold 8bytes, specific to warp
+ vgpr += 2 # 2x32bit registers to hold 8bytes, specific to thread
+ params.append({'.address_space': 'global', '.size': 8,
+ '.offset': param_off, '.value_kind': 'global_buffer'})
+ param_off += 8 # only pass in pointers so 8bytes := 64bits
+ for reg in probe.registers:
+ if probe.registers[reg] == "b32":
+ if probe.datamodel == "warp":
+ probe.registers[reg] = f"s{sgpr}"
+ sgpr += 1
+ elif probe.datamodel == "thread":
+ probe.registers[reg] = f"v{vgpr}"
+ vgpr += 1
+ elif probe.registers[reg] == "b64":
+ if probe.datamodel == "warp":
+ probe.registers[reg] = f"s[{sgpr}:{sgpr+1}]"
+ sgpr += 2
+ elif probe.datamodel == "thread":
+ probe.registers[reg] = f"v[{vgpr}:{vgpr+1}]"
+ vgpr += 2
+ processed.add(probe.name)
+ # all rest is treated as no saving
+
+ # Now add the instruction listening
+ for idx in range(len(gcn_lines)):
+ # ignore most of line that is a string!
+ if type(gcn_lines[idx]) == Ref: # NOTE isinstance is slow?
+ line: str = gcn_lines[idx].line.strip()
+ probe: Probe = gcn_lines[idx].probe
+ before_after: str = gcn_lines[idx].before_after
+ # parse instruction operands, operands are separated by comma
+ if ";" in line: line = line[:line.find(";")]
+ tmp = line.split(",")
+ operands: List[str] = []
+ inst, op1 = tmp[0].split(" ")[0], tmp[0].split(" ")[-1] #
+ operands.append(op1)
+ for t in tmp[1:]:
+ operands.append(t.strip().split(" ")[0])
+ snippet = probe.before if before_after else probe.after
+ if "OUT" in snippet: snippet = snippet.replace("OUT", operands[0])
+ if "IN1" in snippet: snippet = snippet.replace("IN1", operands[1])
+ if "IN2" in snippet: snippet = snippet.replace("IN2", operands[2])
+ if "IN3" in snippet: snippet = snippet.replace("IN3", operands[3])
+
+ # Adding support for SAVE.u64 statement
+ # NOTE for reading the probe afterwards
+ snippet_lines = snippet.split("\n")
+ for snippet_line_idx in range(len(snippet_lines)):
+ snippet_line: str = snippet_lines[snippet_line_idx]
+ if "SAVE" in snippet_line: # only one save, at the begin of line
+ save_lines = [] # start a new string
+ items = snippet_line[snippet_line.index("{") + 1:snippet_line.index("}")].split(",")
+ dtype = snippet_line[snippet_line.find("SAVE") + 5: snippet_line.find("SAVE") + 8]
+ if dtype == "u64":
+ for item_idx in range(len(items)):
+ item_val = probe.registers[items[item_idx].strip()]
+ save_lines.append(f"\tGLOBAL_STORE_DWORDX2 v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 8")
+ elif dtype == "u32":
+ for item_idx in range(len(items)):
+ item_val = probe.registers[items[item_idx].strip()]
+ save_lines.append(f"\tGLOBAL_STORE_DWORD v[{probe.param_addr}], {item_val} \n\tV_LSHL_ADD_U64 v[{probe.param_addr}], 0, 4")
+ else:
+ raise ValueError("Only Support Saving u32 / u64")
+ snippet_lines[snippet_line_idx] = "\n".join(save_lines)
+ snippet = "\n".join(snippet_lines)
+ for reg in probe.registers:
+ if reg in snippet:
+ snippet = snippet.replace(reg, probe.registers[reg])
+ # Finally replace the line
+ gcn_lines[idx] = snippet
+
+ # NOTE we need to modify the number of registers used in metasection
+    # 1. Modify the kernarg_size .amdhsa_kernarg_size 28
+ # 2. Modify the SGPRs used .amdhsa_next_free_sgpr 12
+ # 3. Modify the VGPRs used .amdhsa_next_free_vgpr 9
+ # Something might need .amdhsa_user_sgpr_count 2
+ for idx in range(len(gcn_lines)):
+ if ".amdhsa_kernarg_size" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_kernarg_size {param_off}"
+ elif ".amdhsa_next_free_sgpr" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_next_free_sgpr {sgpr}"
+ elif ".amdhsa_next_free_vgpr" in gcn_lines[idx]:
+ gcn_lines[idx] = f"\t.amdhsa_next_free_vgpr {vgpr}"
+
+ # NOTE also modify the amdgpu_metadata, after all, becomes
+ amdgpu_metadata["amdhsa.kernels"][0]['.sgpr_count'] = sgpr + sgpr_diff
+ amdgpu_metadata['amdhsa.kernels'][0]['.vgpr_count'] = vgpr
+ amdgpu_metadata["amdhsa.kernels"][0]['.kernarg_segment_size'] = param_off
+ amdgpu_metadata["amdhsa.kernels"][0]['.args'] += params
+ # Finally finished, we might need to finalize the metadata
+ return "\n".join(gcn_lines), probe_mem_sizes
+
+ # NOTE also add new parameters
+
+
+def assemble(workdir: str, name: str) -> None:
+ """Assemble the GCN Asm (probed.asm) into Machine Code (probed.bin)
+ NOTE AMD assembler command is part of Clang LLVM like
+ clang -cc1as -triple amdgcn-amd-amdhsa -filetype obj -target-cpu gfx942
+ -mrelocation-model pic -v -mllvm -amdgpu-early-inline-all=true -mllvm
+ -amdgpu-function-calls=false -o probed.bin probed.asm
+ """
+ # TODO need to locate the clang of ROCm, unlike like ptxas of unique name
+ asm_path = os.path.join(workdir, name) + ".asm"
+ bin_path = os.path.join(workdir, name) + ".bin" # target binary
+ command = ["clang", '-cc1as', '-triple', 'amdgcn-amd-amdhsa', '-filetype=obj',
+ f"-target-cpu={get_arch()}", '-mrelocation-model=pic', '--verbose',
+ '-mllvm', '-amdgpu-early-inline-all=true',
+ '-mllvm', '-amdgpu-function-calls=falsep',
+ asm_path, '-o', bin_path]
+ print(" ".join(command), file=log)
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=True
+ )
+ # print debug and verbose information to the process.log
+ if len(result.stderr) > 0:
+ print(result.stderr.decode("utf-8"), file=log)
+ if len(result.stdout) > 0:
+ print(result.stdout.decode("utf-8"), file=log)
+
+# ENTRY for this tool
+if __name__ == "__main__":
+ # no argparse as the CLI is straightforward
+ workdir = sys.argv[1] # directory contains original.bin
+ kernel_name = sys.argv[2].encode('utf-8', 'ignore').decode('utf-8', 'ignore') # for possible case with multiple entry in one binary
+
+ if len(sys.argv) > 3: # NOTE to facilitate debugging, not used in production
+ probe_path = sys.argv[3]
+ probe_toml = toml.load(probe_path)
+ else: # the pass in production
+ # parse the environment variable to read the probes
+ probe_envvar = os.environ.get("NEUTRINO_PROBES")
+ if probe_envvar is None:
+ raise ValueError("Can not read probes from envaraible 'NEUTRINO_PROBES'")
+ # load it via toml
+ probe_toml = toml.loads(probe_envvar)
+
+ # filter out, probes are nested dict in TOML via [name]
+ probes: Dict[str, dict] = dict()
+ analyze_hook = probe_toml["analyze_hook"] if "analyze_hook" in probe_toml else ""
+ for key, value in probe_toml.items():
+ if isinstance(value, dict):
+ probes[key] =value
+
+ probes = safe_load_probes(probes)
+ # apply a
+
+ try:
+ # first decompile binary to ptx
+ asm = extract(workdir)
+ # then truncate ptx for entry_name
+ entry_section, meta_section = prune(asm, kernel_name)
+
+ # split and process ptx lines and write kernel info
+ params, kernel_name = parse_params()
+
+ # basic logging
+ print(kernel_name, file=log)
+
+ # write pruned gcnasm to file
+ meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]
+ pruned_ptx = entry_section + "\n" + meta_section
+ with open(os.path.join(workdir, "pruned.asm"), "w") as f:
+ f.write(pruned_ptx)
+
+ probed_asm, probe_mem_sizes = probing(entry_section, probes)
+
+ # NOTE we need to update the meta_section we updated
+ # TODO split into multiple lines
+ meta_section = meta_section[: meta_section.index(".amdgpu_metadata") + 16] + yaml.safe_dump(amdgpu_metadata) + meta_section[meta_section.index(".end_amdgpu_metadata") - 1:]
+
+ # merge global and func back
+ probed_asm = probed_asm + "\n" + meta_section
+
+ # write probed gcnasm to file
+ with open(os.path.join(workdir, "probed.asm"), "w") as f:
+ f.write(probed_asm)
+
+ except Exception as e:
+ traceback.print_exc(file=log)
+ exit(1)
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
new file mode 100644
index 0000000..cac4710
--- /dev/null
+++ b/benchmarks/arteval_bench/data/benchmark/osdi25_neutrino/neutrino/neutrino/src/common.h
@@ -0,0 +1,611 @@
+/**
+ * Common Definition of Neutrino Hooked Driver
+ *
+ * @note Keep common.h only Linux/GNU dependencies, no other platform-specifics
+ */
+#include