huawei-csl · learning-chip · May 18, 2026 · learning-chip · May 18, 2026 · learning-chip
diff --git a/examples/jit_cpp/a5_abs/.gitignore b/examples/jit_cpp/a5_abs/.gitignore
@@ -0,0 +1,3 @@
+*.so
+OPPROF_*/
+kernel_meta/
diff --git a/examples/jit_cpp/a5_abs/README.md b/examples/jit_cpp/a5_abs/README.md
@@ -0,0 +1,100 @@
+# a5_abs — JIT `vabs_fp16` with torch_npu + msprof CA simulator
+
+Refactors the [a5 `main_abs`](../../a5/main_abs.cpp) flow: JIT-compile [`kernel_abs.cpp`](../../../csrc/kernel/kernel_abs.cpp) with `bisheng`, launch `call_vabs_fp16` via **ctypes** on **torch_npu** tensors, and profile under the Ascend950 CA model with **msprof op simulator**.
+
+Numeric correctness is **not** validated in `run_abs.py` when using the CA simulator — `msprof op simulator` models hardware pipeline behavior only. A successful run compiles the kernel, launches it on NPU tensors, and produces profiler output under `OPPROF_*`.
+
+## Layout
+
+| File | Role |
+|------|------|
+| `jit_util_a5_abs.py` | JIT compile `kernel_abs.cpp` → `libkernel_abs_jit.so`, ctypes wrapper |
+| `run_abs.py` | Build input, launch kernel, synchronize |
+| `run_msprof.sh` | Wrapper: env + `msprof op simulator` |
+
+## Prerequisites
+
+- Docker image `agent_npu_cann_950:9.0.0` (or equivalent CANN 9.0 + torch_npu 2.9)
+- Host checkout of `pto-kernels` mounted into the container
+
+## Reproduce (CA simulator)
+
+From the host, start the container (mount the repo parent so `pto-kernels` is visible):
+
+```bash
+cd /path/to/parent-of-pto-kernels
+docker run -it --rm \
+  --privileged \
+  --network=host \
+  --ipc=host \
+  -v "$(pwd)":/workspace \
+  -w /workspace \
+  --name torch_npu \
+  agent_npu_cann_950:9.0.0 \
+  /bin/bash
+```
+
+Inside the container:
+
+```bash
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+cd /workspace/pto-kernels/examples/jit_cpp/a5_abs
+
+./run_msprof.sh
+```
+
+Or manually (same as [`ca_model.md`](../../../../npu_kernels/950_setup/ca_model.md)):
+
+```bash
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/simulator/Ascend950PR_9599/lib:$LD_LIBRARY_PATH
+ulimit -n 65535
+
+# msprof splits on spaces; use a script file, not python -c.
+msprof op simulator --soc-version=Ascend950PR_9599 \
+  python ./run_abs.py
+```
+
+### Expected success signals
+
+1. Console: `generated .../libkernel_abs_jit.so`, then `vabs_fp16 kernel launch completed.`
+2. msprof log: `Profiling on kernel: vabs_fp16_mix_aic` (or similar) and core duration table
+3. New directory `OPPROF_<timestamp>_*` in this folder with parsed simulator results
+
+### Compile flags (Ascend950 / A5)
+
+JIT build uses Ascend950-oriented flags (aligned with tilelang-ascend PTO `A5` path):
+
+- `--cce-aicore-arch=dav-c310`
+- `-DREGISTER_BASE`
+- `-std=gnu++17` (required for A5 PTO headers; `-std=c++20` fails on dav-c310)
+- AICore stack LLVM options (`-cce-aicore-stack-size`, etc.)
+
+Do not use `--npu-arch=dav-2201` here; that targets an older arch profile.
+
+## Run without msprof (real device)
+
+On hardware with a working NPU runtime, you can smoke-test launch only:
+
+```bash
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+cd /workspace/pto-kernels/examples/jit_cpp/a5_abs
+python ./run_abs.py
+```
+
+For numeric checks on device, use the packaged op (`tests/test_abs.py` / `pto_abs`) or the legacy ACL sample:
+
+```bash
+cd /workspace/pto-kernels
+make run_abs_a5   # cannsim record + examples/a5/main_abs.cpp
+```
+
+## Relation to legacy a5 sample
+
+| Legacy (`make run_abs_a5`) | This example |
+|----------------------------|--------------|
+| `g++` + `libkernel_abs.so` + ACL host buffers | `bisheng` JIT + torch tensors + ctypes |
+| `cannsim record --soc=Ascend950` | `msprof op simulator --soc-version=Ascend950PR_9599` |
+| `examples/a5/main_abs.cpp` | `run_abs.py` + `jit_util_a5_abs.py` |
+
+Kernel source is shared: `csrc/kernel/kernel_abs.cpp` (`call_vabs_fp16`, shape `8×128`, `blockDim=8`).
diff --git a/examples/jit_cpp/a5_abs/jit_util_a5_abs.py b/examples/jit_cpp/a5_abs/jit_util_a5_abs.py
@@ -0,0 +1,108 @@
+import ctypes
+import os
+import subprocess
+
+import torch
+
+REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+KERNEL_SRC = os.path.join(REPO_ROOT, "csrc", "kernel", "kernel_abs.cpp")
+KERNEL_INC = os.path.join(REPO_ROOT, "csrc", "kernel")
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME", "")
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+
+DEFAULT_BLOCK_DIM = 8
+
+
+def _lib_path() -> str:
+    return os.path.join(os.path.dirname(__file__), "libkernel_abs_jit.so")
+
+
+def compile_cpp(verbose: bool = False, timeout: int = 120) -> str:
+    lib_path = _lib_path()
+
+    flags = [
+        "bisheng",
+        "--cce-aicore-arch=dav-c310",
+        "-DREGISTER_BASE",
+        "-O2",
+        "-std=gnu++17",
+        "-xcce",
+        "-fPIC",
+        "--shared",
+        "-mllvm",
+        "-cce-aicore-stack-size=0x8000",
+        "-mllvm",
+        "-cce-aicore-function-stack-size=0x8000",
+        "-Wno-ignored-attributes",
+        f"-I{KERNEL_INC}",
+        f"-I{PTO_LIB_PATH}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+
+    command = [*flags, KERNEL_SRC, "-o", lib_path]
+    if verbose:
+        print("compile command:", " ".join(command))
+
+    try:
+        result = subprocess.run(
+            command,
+            timeout=timeout,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        output = e.stdout or ""
+        if e.stderr:
+            output += e.stderr
+        raise RuntimeError(
+            f"Compile failed with exit code {e.returncode}:\n{output}"
+        ) from e
+
+    if verbose and result.stdout:
+        print(result.stdout)
+    if verbose:
+        print(f"generated {lib_path}")
+    return lib_path
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path: str):
+    lib_path = os.path.abspath(lib_path)
+    lib = ctypes.CDLL(lib_path)
+
+    lib.call_vabs_fp16.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x
+        ctypes.c_void_p,  # z
+        ctypes.c_uint32,  # num_elements
+    ]
+    lib.call_vabs_fp16.restype = None
+
+    def vabs_fp16(x, z, num_elements, block_dim=DEFAULT_BLOCK_DIM, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_  # noqa: SLF001
+        lib.call_vabs_fp16(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            torch_to_ctypes(z),
+            num_elements,
+        )
+
+    return vabs_fp16
+
+
+def jit_compile(verbose: bool = False, force_recompile: bool = False):
+    lib_path = _lib_path()
+    if force_recompile or not os.path.isfile(lib_path):
+        compile_cpp(verbose=verbose)
+    return load_lib(lib_path)
diff --git a/examples/jit_cpp/a5_abs/run_abs.py b/examples/jit_cpp/a5_abs/run_abs.py
@@ -0,0 +1,46 @@
+"""Run vabs_fp16 via JIT-compiled kernel + torch_npu tensors (a5 example)."""
+
+import numpy as np
+import torch
+import torch_npu  # noqa: F401
+
+from jit_util_a5_abs import DEFAULT_BLOCK_DIM, jit_compile
+
+# Matches examples/a5/main_abs.cpp
+VABS_SHAPE = (8, 128)
+VABS_NUM_ELEMENTS = VABS_SHAPE[0] * VABS_SHAPE[1]
+DEVICE = "npu:0"
+
+
+def make_input_x():
+    """Same data as scripts/data_gen_abs.py (seed=42)."""
+    rng = np.random.default_rng(seed=42)
+    return rng.uniform(-100, 100, VABS_SHAPE).astype(np.float16)
+
+
+def main():
+    torch.npu.config.allow_internal_format = False
+    torch_npu.npu.set_compile_mode(jit_compile=False)
+    torch.npu.set_device(DEVICE)
+
+    x_np = make_input_x()
+    # Allocate on CPU then copy to NPU (same pattern as tests/test_abs.py).
+    x = torch.from_numpy(x_np)
+    x = x.npu()
+    z = torch.empty_like(x)
+
+    print(f"[vabs] shape={VABS_SHAPE}, blockDim={DEFAULT_BLOCK_DIM}")
+    print("Compiling kernel_abs.cpp ...")
+    vabs = jit_compile(verbose=True)
+
+    vabs(x, z, VABS_NUM_ELEMENTS, block_dim=DEFAULT_BLOCK_DIM)
+    torch.npu.synchronize()
+
+    # msprof CA simulator models pipeline timing only, not numeric results.
+    print("Input X (first 16):", x.flatten()[:16].cpu())
+    print("Output Z (first 16):", z.flatten()[:16].cpu())
+    print("vabs_fp16 kernel launch completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/jit_cpp/a5_abs/run_msprof.sh b/examples/jit_cpp/a5_abs/run_msprof.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Run a5 abs JIT example under msprof CA simulator (Ascend950PR_9599).
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+if [[ -f /usr/local/Ascend/ascend-toolkit/set_env.sh ]]; then
+  # shellcheck source=/dev/null
+  source /usr/local/Ascend/ascend-toolkit/set_env.sh
+fi
+
+export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/tools/simulator/Ascend950PR_9599/lib:${LD_LIBRARY_PATH:-}"
+ulimit -n 65535
+
+# msprof splits on spaces; use a script file, not python -c.
+exec msprof op simulator --soc-version=Ascend950PR_9599 \
+  --output="msprof_res" --kernel-name="vabs_fp16_mix_aic" --launch-count=10 \
+  python ./run_abs.py