From 3a7971d0c63a15259d6fa05d885b606de62a9f48 Mon Sep 17 00:00:00 2001 From: learning-chip Date: Tue, 19 May 2026 05:09:57 +0800 Subject: [PATCH] minimum example for msprof simulator on A5 kernel --- examples/jit_cpp/a5_abs/.gitignore | 3 + examples/jit_cpp/a5_abs/README.md | 100 +++++++++++++++++++ examples/jit_cpp/a5_abs/jit_util_a5_abs.py | 108 +++++++++++++++++++++ examples/jit_cpp/a5_abs/run_abs.py | 46 +++++++++ examples/jit_cpp/a5_abs/run_msprof.sh | 19 ++++ 5 files changed, 276 insertions(+) create mode 100644 examples/jit_cpp/a5_abs/.gitignore create mode 100644 examples/jit_cpp/a5_abs/README.md create mode 100644 examples/jit_cpp/a5_abs/jit_util_a5_abs.py create mode 100644 examples/jit_cpp/a5_abs/run_abs.py create mode 100755 examples/jit_cpp/a5_abs/run_msprof.sh diff --git a/examples/jit_cpp/a5_abs/.gitignore b/examples/jit_cpp/a5_abs/.gitignore new file mode 100644 index 00000000..1770c396 --- /dev/null +++ b/examples/jit_cpp/a5_abs/.gitignore @@ -0,0 +1,3 @@ +*.so +OPPROF_*/ +kernel_meta/ diff --git a/examples/jit_cpp/a5_abs/README.md b/examples/jit_cpp/a5_abs/README.md new file mode 100644 index 00000000..40db98f4 --- /dev/null +++ b/examples/jit_cpp/a5_abs/README.md @@ -0,0 +1,100 @@ +# a5_abs — JIT `vabs_fp16` with torch_npu + msprof CA simulator + +Refactors the [a5 `main_abs`](../../a5/main_abs.cpp) flow: JIT-compile [`kernel_abs.cpp`](../../../csrc/kernel/kernel_abs.cpp) with `bisheng`, launch `call_vabs_fp16` via **ctypes** on **torch_npu** tensors, and profile under the Ascend950 CA model with **msprof op simulator**. + +Numeric correctness is **not** validated in `run_abs.py` when using the CA simulator — `msprof op simulator` models hardware pipeline behavior only. A successful run compiles the kernel, launches it on NPU tensors, and produces profiler output under `OPPROF_*`. + +## Layout + +| File | Role | +|------|------| +| `jit_util_a5_abs.py` | JIT compile `kernel_abs.cpp` → `libkernel_abs_jit.so`, ctypes wrapper | +| `run_abs.py` | Build input, launch kernel, synchronize | +| `run_msprof.sh` | Wrapper: env + `msprof op simulator` | + +## Prerequisites + +- Docker image `agent_npu_cann_950:9.0.0` (or equivalent CANN 9.0 + torch_npu 2.9) +- Host checkout of `pto-kernels` mounted into the container + +## Reproduce (CA simulator) + +From the host, start the container (mount the repo parent so `pto-kernels` is visible): + +```bash +cd /path/to/parent-of-pto-kernels +docker run -it --rm \ + --privileged \ + --network=host \ + --ipc=host \ + -v "$(pwd)":/workspace \ + -w /workspace \ + --name torch_npu \ + agent_npu_cann_950:9.0.0 \ + /bin/bash +``` + +Inside the container: + +```bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh +cd /workspace/pto-kernels/examples/jit_cpp/a5_abs + +./run_msprof.sh +``` + +Or manually (same as [`ca_model.md`](../../../../npu_kernels/950_setup/ca_model.md)): + +```bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/simulator/Ascend950PR_9599/lib:$LD_LIBRARY_PATH +ulimit -n 65535 + +# msprof splits on spaces; use a script file, not python -c. +msprof op simulator --soc-version=Ascend950PR_9599 \ + python ./run_abs.py +``` + +### Expected success signals + +1. Console: `generated .../libkernel_abs_jit.so`, then `vabs_fp16 kernel launch completed.` +2. msprof log: `Profiling on kernel: vabs_fp16_mix_aic` (or similar) and core duration table +3. New directory `OPPROF__*` in this folder with parsed simulator results + +### Compile flags (Ascend950 / A5) + +JIT build uses Ascend950-oriented flags (aligned with tilelang-ascend PTO `A5` path): + +- `--cce-aicore-arch=dav-c310` +- `-DREGISTER_BASE` +- `-std=gnu++17` (required for A5 PTO headers; `-std=c++20` fails on dav-c310) +- AICore stack LLVM options (`-cce-aicore-stack-size`, etc.) + +Do not use `--npu-arch=dav-2201` here; that targets an older arch profile. + +## Run without msprof (real device) + +On hardware with a working NPU runtime, you can smoke-test launch only: + +```bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh +cd /workspace/pto-kernels/examples/jit_cpp/a5_abs +python ./run_abs.py +``` + +For numeric checks on device, use the packaged op (`tests/test_abs.py` / `pto_abs`) or the legacy ACL sample: + +```bash +cd /workspace/pto-kernels +make run_abs_a5 # cannsim record + examples/a5/main_abs.cpp +``` + +## Relation to legacy a5 sample + +| Legacy (`make run_abs_a5`) | This example | +|----------------------------|--------------| +| `g++` + `libkernel_abs.so` + ACL host buffers | `bisheng` JIT + torch tensors + ctypes | +| `cannsim record --soc=Ascend950` | `msprof op simulator --soc-version=Ascend950PR_9599` | +| `examples/a5/main_abs.cpp` | `run_abs.py` + `jit_util_a5_abs.py` | + +Kernel source is shared: `csrc/kernel/kernel_abs.cpp` (`call_vabs_fp16`, shape `8×128`, `blockDim=8`). diff --git a/examples/jit_cpp/a5_abs/jit_util_a5_abs.py b/examples/jit_cpp/a5_abs/jit_util_a5_abs.py new file mode 100644 index 00000000..f98a46cc --- /dev/null +++ b/examples/jit_cpp/a5_abs/jit_util_a5_abs.py @@ -0,0 +1,108 @@ +import ctypes +import os +import subprocess + +import torch + +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +KERNEL_SRC = os.path.join(REPO_ROOT, "csrc", "kernel", "kernel_abs.cpp") +KERNEL_INC = os.path.join(REPO_ROOT, "csrc", "kernel") + +ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME", "") +PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME) + +DEFAULT_BLOCK_DIM = 8 + + +def _lib_path() -> str: + return os.path.join(os.path.dirname(__file__), "libkernel_abs_jit.so") + + +def compile_cpp(verbose: bool = False, timeout: int = 120) -> str: + lib_path = _lib_path() + + flags = [ + "bisheng", + "--cce-aicore-arch=dav-c310", + "-DREGISTER_BASE", + "-O2", + "-std=gnu++17", + "-xcce", + "-fPIC", + "--shared", + "-mllvm", + "-cce-aicore-stack-size=0x8000", + "-mllvm", + "-cce-aicore-function-stack-size=0x8000", + "-Wno-ignored-attributes", + f"-I{KERNEL_INC}", + f"-I{PTO_LIB_PATH}/include", + f"-I{ASCEND_TOOLKIT_HOME}/include", + f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc", + f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime", + f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling", + ] + + command = [*flags, KERNEL_SRC, "-o", lib_path] + if verbose: + print("compile command:", " ".join(command)) + + try: + result = subprocess.run( + command, + timeout=timeout, + check=True, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + output = e.stdout or "" + if e.stderr: + output += e.stderr + raise RuntimeError( + f"Compile failed with exit code {e.returncode}:\n{output}" + ) from e + + if verbose and result.stdout: + print(result.stdout) + if verbose: + print(f"generated {lib_path}") + return lib_path + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path: str): + lib_path = os.path.abspath(lib_path) + lib = ctypes.CDLL(lib_path) + + lib.call_vabs_fp16.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # z + ctypes.c_uint32, # num_elements + ] + lib.call_vabs_fp16.restype = None + + def vabs_fp16(x, z, num_elements, block_dim=DEFAULT_BLOCK_DIM, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ # noqa: SLF001 + lib.call_vabs_fp16( + block_dim, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(z), + num_elements, + ) + + return vabs_fp16 + + +def jit_compile(verbose: bool = False, force_recompile: bool = False): + lib_path = _lib_path() + if force_recompile or not os.path.isfile(lib_path): + compile_cpp(verbose=verbose) + return load_lib(lib_path) diff --git a/examples/jit_cpp/a5_abs/run_abs.py b/examples/jit_cpp/a5_abs/run_abs.py new file mode 100644 index 00000000..c023b7ad --- /dev/null +++ b/examples/jit_cpp/a5_abs/run_abs.py @@ -0,0 +1,46 @@ +"""Run vabs_fp16 via JIT-compiled kernel + torch_npu tensors (a5 example).""" + +import numpy as np +import torch +import torch_npu # noqa: F401 + +from jit_util_a5_abs import DEFAULT_BLOCK_DIM, jit_compile + +# Matches examples/a5/main_abs.cpp +VABS_SHAPE = (8, 128) +VABS_NUM_ELEMENTS = VABS_SHAPE[0] * VABS_SHAPE[1] +DEVICE = "npu:0" + + +def make_input_x(): + """Same data as scripts/data_gen_abs.py (seed=42).""" + rng = np.random.default_rng(seed=42) + return rng.uniform(-100, 100, VABS_SHAPE).astype(np.float16) + + +def main(): + torch.npu.config.allow_internal_format = False + torch_npu.npu.set_compile_mode(jit_compile=False) + torch.npu.set_device(DEVICE) + + x_np = make_input_x() + # Allocate on CPU then copy to NPU (same pattern as tests/test_abs.py). + x = torch.from_numpy(x_np) + x = x.npu() + z = torch.empty_like(x) + + print(f"[vabs] shape={VABS_SHAPE}, blockDim={DEFAULT_BLOCK_DIM}") + print("Compiling kernel_abs.cpp ...") + vabs = jit_compile(verbose=True) + + vabs(x, z, VABS_NUM_ELEMENTS, block_dim=DEFAULT_BLOCK_DIM) + torch.npu.synchronize() + + # msprof CA simulator models pipeline timing only, not numeric results. + print("Input X (first 16):", x.flatten()[:16].cpu()) + print("Output Z (first 16):", z.flatten()[:16].cpu()) + print("vabs_fp16 kernel launch completed.") + + +if __name__ == "__main__": + main() diff --git a/examples/jit_cpp/a5_abs/run_msprof.sh b/examples/jit_cpp/a5_abs/run_msprof.sh new file mode 100755 index 00000000..667afea1 --- /dev/null +++ b/examples/jit_cpp/a5_abs/run_msprof.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Run a5 abs JIT example under msprof CA simulator (Ascend950PR_9599). +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +if [[ -f /usr/local/Ascend/ascend-toolkit/set_env.sh ]]; then + # shellcheck source=/dev/null + source /usr/local/Ascend/ascend-toolkit/set_env.sh +fi + +export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/tools/simulator/Ascend950PR_9599/lib:${LD_LIBRARY_PATH:-}" +ulimit -n 65535 + +# msprof splits on spaces; use a script file, not python -c. +exec msprof op simulator --soc-version=Ascend950PR_9599 \ + --output="msprof_res" --kernel-name="vabs_fp16_mix_aic" --launch-count=10 \ + python ./run_abs.py