huawei-csl · MirkoDeVita98 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
diff --git a/examples/agent/fast_hadamard/.gitignore b/examples/agent/fast_hadamard/.gitignore
@@ -0,0 +1,9 @@
+hadamard_no_sync.pto
+hadamard_manual_sync.pto
+hadamard_auto_sync.cpp
+hadamard_manual_sync.cpp
+hadamard_auto_sync.pto
+hadamard_auto_sync_lib.so
+hadamard_manual_sync_lib.so
+
+perf_data*
diff --git a/examples/agent/fast_hadamard/README.md b/examples/agent/fast_hadamard/README.md
@@ -0,0 +1,8 @@
+Usage:
+
+```bash
+bash ./compile.sh  # generate PTO/CPP and build both auto/manual sync libs
+python ./run_hadamard.py  # test auto-sync lib (default)
+python ./run_hadamard.py --manual-sync  # test manual-sync lib
+python ./plot_perf.py  # optional visualization
+```
diff --git a/examples/agent/fast_hadamard/_bench_wrapper.py b/examples/agent/fast_hadamard/_bench_wrapper.py
@@ -0,0 +1,69 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads hadamard_auto_sync_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+import math
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 32    # rows per tensor; forwarded as the kernel's `batch` argument
+N         = 8192  # elements per row; forwarded as the kernel's `n` argument
+BLOCK_DIM = 24    # forwarded as the kernel's `blockDim` argument
+WARMUP    = 5     # untimed launches issued before measurement
+ITERS     = 20    # timed launches averaged into the reported latency
+
+LOG2_N = int(math.log2(N))  # int() truncates, so this is exact only when N is a power of two
+
+
+def torch_to_ctypes(t):  # wrap a tensor's raw data pointer as a ctypes void*
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./hadamard_auto_sync_lib.so")  # "./" path: resolved against the current working directory
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x (in-place)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n
+    ctypes.c_uint32,  # log2_n
+]
+lib.call_kernel.restype = None
+
+
+def run(x):  # launch the kernel once on tensor x, using the current NPU stream
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,  # ctypes-convertible stream handle
+        torch_to_ctypes(x),
+        BATCH,
+        N,
+        LOG2_N,
+    )
+
+
+# Allocate one tensor per launch (warmup + timed) to avoid cache reuse
+xs = [torch.randn(BATCH, N, device=device, dtype=torch.float16) for _ in range(WARMUP + ITERS)]
+
+for i in range(WARMUP):
+    run(xs[i])
+torch.npu.synchronize()  # let warmup finish before any timed launch is issued
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i])  # timed launches use the tensors after the warmup ones
+    ends[i].record()
+torch.npu.synchronize()  # wait for all recorded events before reading elapsed times
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS  # mean over the timed launches
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/fast_hadamard/caller.cpp b/examples/agent/fast_hadamard/caller.cpp
@@ -0,0 +1,28 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "hadamard_auto_sync.cpp"  // default kernel source; override with -DKERNEL_CPP=...
+#endif
+#include KERNEL_CPP
+
+#ifndef KERNEL_FN
+#define KERNEL_FN fast_hadamard_autosync  // default kernel entry point; override with -DKERNEL_FN=...
+#endif
+
+#ifndef NUM_CORES
+#define NUM_CORES 24  // block count used when the caller passes blockDim == 0
+#endif
+
+extern "C" void call_kernel(  // C linkage: exported under the unmangled name call_kernel
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,  // reinterpreted as half* before being handed to the kernel
+    uint32_t batch,
+    uint32_t n,
+    uint32_t log2_n)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;  // 0 selects the NUM_CORES default
+    KERNEL_FN<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n),
+        static_cast<int32_t>(log2_n));
+}
diff --git a/examples/agent/fast_hadamard/compile.sh b/examples/agent/fast_hadamard/compile.sh
@@ -0,0 +1,46 @@
+set -e
+
+rm -f \
+    hadamard_auto_sync.pto hadamard_manual_sync.pto \
+    hadamard_auto_sync.cpp hadamard_manual_sync.cpp \
+    hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so
+
+# Auto-sync path: rely on ptoas synchronization insertion.
+python ./hadamard_builder.py > ./hadamard_auto_sync.pto
+ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp
+
+# Manual-sync path: explicit record/wait events from builder.
+python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto
+ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    ./caller.cpp \
+    -o ./hadamard_auto_sync_lib.so
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \
+    -DKERNEL_FN=fast_hadamard_manualsync \
+    ./caller.cpp \
+    -o ./hadamard_manual_sync_lib.so