Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/agent/fast_hadamard/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
hadamard_no_sync.pto
hadamard_manual_sync.pto
hadamard_auto_sync.cpp
hadamard_manual_sync.cpp
hadamard_auto_sync.pto
hadamard_auto_sync_lib.so
hadamard_manual_sync_lib.so

perf_data*
8 changes: 8 additions & 0 deletions examples/agent/fast_hadamard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Usage:

```bash
bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs
python ./run_hadamard.py # test auto-sync lib (default)
python ./run_hadamard.py --manual-sync # test manual-sync lib
python ./plot_perf.py  # optional: visualize the results
```
69 changes: 69 additions & 0 deletions examples/agent/fast_hadamard/_bench_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Single-config benchmark wrapper for the agentic optimizer.
Loads hadamard_auto_sync_lib.so and prints: latency_ms=<number>
"""
import ctypes
import math

import torch
import torch_npu # noqa: F401

from ptodsl.test_util import get_test_device

# Representative shape — change to target a different operating point.
# N must be a power of two, because the kernel is handed log2(N) as a
# separate integer argument alongside N itself.
BATCH = 32      # first dimension of each input tensor
N = 8192        # second dimension of each input tensor
BLOCK_DIM = 24  # blockDim value passed to call_kernel
WARMUP = 5      # untimed launches before measurement starts
ITERS = 20      # timed launches averaged into the reported latency

# Reject shapes the kernel cannot describe instead of silently truncating:
# int(math.log2(N)) would round a non-power-of-two N down and hand the
# kernel a log2_n that does not match n.
if N <= 0 or N & (N - 1):
    raise ValueError(f"N must be a positive power of two, got {N}")

# Exact integer log2 for a power of two (no floating-point round trip).
LOG2_N = N.bit_length() - 1


def torch_to_ctypes(t):
    """Wrap the address returned by ``t.data_ptr()`` in a ``ctypes.c_void_p``.

    The result can be passed to a ctypes foreign function that expects a raw
    pointer argument.
    """
    address = t.data_ptr()
    return ctypes.c_void_p(address)


# Make the test device the current NPU device.
device = get_test_device()
torch.npu.set_device(device)

# Load the auto-sync kernel library from the working directory and declare
# the C signature of its entry point so ctypes converts each argument to the
# right width.
lib = ctypes.CDLL("./hadamard_auto_sync_lib.so")
lib.call_kernel.restype = None
lib.call_kernel.argtypes = [
    ctypes.c_uint32,  # blockDim
    ctypes.c_void_p,  # stream
    ctypes.c_void_p,  # x (in-place)
    ctypes.c_uint32,  # batch
    ctypes.c_uint32,  # n
    ctypes.c_uint32,  # log2_n
]


def run(x):
    """Launch the kernel once on ``x`` using the current NPU stream.

    The launch uses the module-level BLOCK_DIM, BATCH, N and LOG2_N values;
    ``x`` is passed to the library as a raw pointer.
    """
    stream_handle = torch.npu.current_stream()._as_parameter_
    data_pointer = torch_to_ctypes(x)
    lib.call_kernel(BLOCK_DIM, stream_handle, data_pointer, BATCH, N, LOG2_N)


# One fresh tensor per launch (warmup and timed alike) to avoid cache reuse.
xs = [
    torch.randn(BATCH, N, device=device, dtype=torch.float16)
    for _ in range(WARMUP + ITERS)
]

# Untimed warmup launches, drained before measurement begins.
for warm_x in xs[:WARMUP]:
    run(warm_x)
torch.npu.synchronize()

# Bracket every timed launch with its own pair of events, then wait for all
# of them to complete before reading the timings.
starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
for start_evt, end_evt, timed_x in zip(starts, ends, xs[WARMUP:]):
    start_evt.record()
    run(timed_x)
    end_evt.record()
torch.npu.synchronize()

# Report the mean per-launch latency in the format the optimizer parses.
elapsed = [begin.elapsed_time(finish) for begin, finish in zip(starts, ends)]
ms = sum(elapsed) / ITERS
print(f"latency_ms={ms:.4f}")
28 changes: 28 additions & 0 deletions examples/agent/fast_hadamard/caller.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Generated kernel source that is textually included into this translation
// unit. Override with -DKERNEL_CPP="\"<file>.cpp\"" on the compiler command
// line; defaults to the auto-sync kernel.
#ifndef KERNEL_CPP
#define KERNEL_CPP "hadamard_auto_sync.cpp"
#endif
#include KERNEL_CPP

// Name of the kernel entry point launched by call_kernel. Override with
// -DKERNEL_FN=<symbol> to match the kernel source selected above.
#ifndef KERNEL_FN
#define KERNEL_FN fast_hadamard_autosync
#endif

// Block count used by call_kernel when the caller passes blockDim == 0.
#ifndef NUM_CORES
#define NUM_CORES 24
#endif

extern "C" void call_kernel(
uint32_t blockDim,
void *stream,
uint8_t *x,
uint32_t batch,
uint32_t n,
uint32_t log2_n)
{
uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
KERNEL_FN<<<launch_blocks, nullptr, stream>>>(
reinterpret_cast<half *>(x),
static_cast<int32_t>(batch),
static_cast<int32_t>(n),
static_cast<int32_t>(log2_n));
}
46 changes: 46 additions & 0 deletions examples/agent/fast_hadamard/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Generate the PTO and C++ kernel sources for both synchronization modes and
# build one shared library per mode:
#   hadamard_auto_sync_lib.so    - sync inserted by ptoas
#   hadamard_manual_sync_lib.so  - explicit sync emitted by the builder
set -euo pipefail

# Fail early with a clear message; an unset variable would otherwise turn the
# include path below into "-I/include".
: "${ASCEND_TOOLKIT_HOME:?ASCEND_TOOLKIT_HOME must be set to the Ascend toolkit root}"

# build_lib OUTPUT [EXTRA_FLAGS...]
# Compile caller.cpp into the shared library OUTPUT. Any extra flags (for
# example -DKERNEL_CPP / -DKERNEL_FN overrides) are placed before the source.
build_lib() {
    local output=$1
    shift
    bisheng \
        -I"${ASCEND_TOOLKIT_HOME}/include" \
        -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
        -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
        -xcce -Xhost-start -Xhost-end \
        -mllvm -cce-aicore-stack-size=0x8000 \
        -mllvm -cce-aicore-function-stack-size=0x8000 \
        -mllvm -cce-aicore-record-overflow=true \
        -mllvm -cce-aicore-addr-transform \
        -mllvm -cce-aicore-dcci-insert-for-scalar=false \
        --npu-arch=dav-2201 -DMEMORY_BASE \
        -std=gnu++17 \
        "$@" \
        ./caller.cpp \
        -o "${output}"
}

# Start from a clean slate so a failed step cannot leave stale artifacts.
rm -f \
    hadamard_auto_sync.pto hadamard_manual_sync.pto \
    hadamard_auto_sync.cpp hadamard_manual_sync.cpp \
    hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so

# Auto-sync path: rely on ptoas synchronization insertion.
python ./hadamard_builder.py > ./hadamard_auto_sync.pto
ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp

# Manual-sync path: explicit record/wait events from builder.
python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto
ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp

# caller.cpp defaults to the auto-sync kernel; the manual-sync build overrides
# both the included source and the kernel entry point.
build_lib ./hadamard_auto_sync_lib.so
build_lib ./hadamard_manual_sync_lib.so \
    -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \
    -DKERNEL_FN=fast_hadamard_manualsync
Loading