From d5bfce57607685c15ea106ba9e2a67792cf28598 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Mon, 9 Mar 2026 13:55:44 +0000 Subject: [PATCH 1/6] first implementation of KernelAgent for pto-dsl with optimized geglu --- .../geglu_dynamic_multicore/_bench_wrapper.py | 71 +++ .../geglu_dynamic_multicore_opt/.gitignore | 3 + .../aot/geglu_dynamic_multicore_opt/README.md | 7 + .../_bench_wrapper.py | 71 +++ .../bench_geglu.py | 128 ++++++ .../geglu_dynamic_multicore_opt/caller.cpp | 26 ++ .../geglu_dynamic_multicore_opt/compile.sh | 22 + .../geglu_builder.py | 179 ++++++++ .../geglu_dynamic_multicore_opt/run_geglu.py | 121 ++++++ openenv/README.md | 119 +++++ openenv/__init__.py | 3 + openenv/agent_search.py | 407 ++++++++++++++++++ openenv/geglu_config.toml | 37 ++ openenv/hadamard_config.toml | 54 +++ openenv/kernel_opt_env.py | 221 ++++++++++ 15 files changed, 1469 insertions(+) create mode 100644 examples/aot/geglu_dynamic_multicore/_bench_wrapper.py create mode 100644 examples/aot/geglu_dynamic_multicore_opt/.gitignore create mode 100644 examples/aot/geglu_dynamic_multicore_opt/README.md create mode 100644 examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py create mode 100644 examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py create mode 100644 examples/aot/geglu_dynamic_multicore_opt/caller.cpp create mode 100755 examples/aot/geglu_dynamic_multicore_opt/compile.sh create mode 100644 examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py create mode 100644 examples/aot/geglu_dynamic_multicore_opt/run_geglu.py create mode 100644 openenv/README.md create mode 100644 openenv/__init__.py create mode 100644 openenv/agent_search.py create mode 100644 openenv/geglu_config.toml create mode 100644 openenv/hadamard_config.toml create mode 100644 openenv/kernel_opt_env.py diff --git a/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py b/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py new file mode 100644 index 00000000..dec6bfaf --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py @@ -0,0 +1,71 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads geglu_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape — change to target a different operating point +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./geglu_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # a + ctypes.c_void_p, # b + ctypes.c_void_p, # c (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(a, b, c): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +# Separate tensors per iteration to reduce cache reuse +as_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)] +bs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype) for _ in range(WARMUP + ITERS)] +c = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(as_[i], bs_[i], c) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(as_[WARMUP + i], bs_[WARMUP + i], c) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/aot/geglu_dynamic_multicore_opt/.gitignore b/examples/aot/geglu_dynamic_multicore_opt/.gitignore new file mode 100644 index 00000000..99be97c5 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/.gitignore @@ -0,0 +1,3 @@ +geglu.pto +geglu.cpp +geglu_lib.so diff --git a/examples/aot/geglu_dynamic_multicore_opt/README.md b/examples/aot/geglu_dynamic_multicore_opt/README.md new file mode 100644 index 00000000..374bb9cc --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/README.md @@ -0,0 +1,7 @@ +Usage: + +```bash +bash ./compile.sh +python ./run_geglu.py +python ./bench_geglu.py +``` diff --git a/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py b/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py new file mode 100644 index 00000000..dec6bfaf --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py @@ -0,0 +1,71 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads geglu_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape — change to target a different operating point +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./geglu_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # a + ctypes.c_void_p, # b + ctypes.c_void_p, # c (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(a, b, c): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +# Separate tensors per iteration to reduce cache reuse +as_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)] +bs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype) for _ in range(WARMUP + ITERS)] +c = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(as_[i], bs_[i], c) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(as_[WARMUP + i], bs_[WARMUP + i], c) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py b/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py new file mode 100644 index 00000000..2c36e5a7 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py @@ -0,0 +1,128 @@ +import argparse +import ctypes + +import torch +import torch.nn.functional as F +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # a + ctypes.c_void_p, # b + ctypes.c_void_p, # c (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def geglu_func(a, b, c, batch, n_cols, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + batch, + n_cols, + ) + + return geglu_func + + +def bench_geglu( + geglu_func, a, b, c, kernel_name="geglu_func", warmup_iters=5, benchmark_iters=50 +): + batch, n_cols = a.shape + # reads a and b, writes c + io_bytes = a.numel() * a.element_size() * 3 + # Overwrite a large buffer between launches to reduce L2 cache reuse. + cache = torch.empty((256 * 1024 * 1024,), dtype=torch.int8, device=a.device) + + def time_op(fn): + for _ in range(warmup_iters): + fn() + torch.npu.synchronize() + + mixed_start = torch.npu.Event(enable_timing=True) + mixed_end = torch.npu.Event(enable_timing=True) + cache_start = torch.npu.Event(enable_timing=True) + cache_end = torch.npu.Event(enable_timing=True) + + mixed_start.record() + for _ in range(benchmark_iters): + cache.zero_() + fn() + mixed_end.record() + torch.npu.synchronize() + + cache_start.record() + for _ in range(benchmark_iters): + cache.zero_() + cache_end.record() + torch.npu.synchronize() + + mixed_total_ms = mixed_start.elapsed_time(mixed_end) + cache_total_ms = cache_start.elapsed_time(cache_end) + kernel_total_ms = max(mixed_total_ms - cache_total_ms, 0.0) + return kernel_total_ms / benchmark_iters + + custom_ms = time_op(lambda: geglu_func(a, b, c, batch, n_cols)) + torch_ms = time_op(lambda: torch.mul(F.gelu(a, approximate="tanh"), b)) + + custom_bw_gbs = (io_bytes / (custom_ms / 1e3)) / 1e9 + torch_bw_gbs = (io_bytes / (torch_ms / 1e3)) / 1e9 + + print( + f"{kernel_name}: {custom_ms:.3f} ms, " + f"effective bandwidth: {custom_bw_gbs:.3f} GB/s " + f"(IO={io_bytes / 1e6:.2f} MB)" + ) + print( + f"torch gelu*b: {torch_ms:.3f} ms, " + f"effective bandwidth: {torch_bw_gbs:.3f} GB/s " + f"(IO={io_bytes / 1e6:.2f} MB)" + ) + + +def run_bench(lib_path, block_dim=24, batch=1024, n_cols=8192): + device = get_test_device() + torch.npu.set_device(device) + + geglu_func = load_lib(lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + a = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4) + b = torch.randn(batch, n_cols, device=device, dtype=dtype) + c = torch.empty(batch, n_cols, device=device, dtype=dtype) + + geglu_func(a, b, c, batch, n_cols) + torch.npu.synchronize() + + a_f32 = a.float() + ref = (0.5 * a_f32 * (1.0 + torch.tanh(a_f32))).to(dtype) * b + torch.testing.assert_close(c, ref, rtol=1e-2, atol=1e-2) + + bench_geglu(geglu_func, a, b, c, kernel_name=f"geglu ({lib_path})") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--lib", default="./geglu_lib.so") + parser.add_argument("--block-dim", type=int, default=24) + parser.add_argument("--batch", type=int, default=1024) + parser.add_argument("--n-cols", type=int, default=8192) + args = parser.parse_args() + run_bench(args.lib, block_dim=args.block_dim, batch=args.batch, n_cols=args.n_cols) diff --git a/examples/aot/geglu_dynamic_multicore_opt/caller.cpp b/examples/aot/geglu_dynamic_multicore_opt/caller.cpp new file mode 100644 index 00000000..85351fd4 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/caller.cpp @@ -0,0 +1,26 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "geglu.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *a, + uint8_t *b, + uint8_t *c, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(c), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/aot/geglu_dynamic_multicore_opt/compile.sh b/examples/aot/geglu_dynamic_multicore_opt/compile.sh new file mode 100755 index 00000000..9da3faa7 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/compile.sh @@ -0,0 +1,22 @@ +set -e + +rm -f geglu.pto geglu.cpp geglu_lib.so + +python ./geglu_builder.py > ./geglu.pto +ptoas --enable-insert-sync ./geglu.pto -o ./geglu.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"geglu.cpp\"" \ + ./caller.cpp \ + -o ./geglu_lib.so diff --git a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py b/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py new file mode 100644 index 00000000..4dae7890 --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py @@ -0,0 +1,179 @@ +from ptodsl import to_ir_module +import ptodsl.language as pto + +const = pto.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_geglu(fn_name="geglu_fp16"): + """ + Build a dynamic-batch GEGLU kernel in PTO DSL (optimized). + + Computes c = gelu_approx(a) * b, where: + gelu_approx(a) = a * sigmoid(2a) + sigmoid(x) = exp(x) / (exp(x) + 1) + + Optimizations vs baseline: + 1. Sigmoid reformulation: gelu(a) = a * sigmoid(2a) uses only + 1 exp + 1 div per row instead of 2 exp + 2 div. + 2. Hoisted ones: exp(a-a)=1.0 computed once before the loop. + + UB tile budget (fp16, 5 tiles x 32 KB = 160 KB < 192 KB): + tb_a : input row a + tb_b : input row b + tb_ones : constant 1.0 (computed once before loop) + tb_tmp1 : intermediate / final output + tb_tmp2 : intermediate + + Kernel args: + a_ptr : fp16[batch * n_cols] -- gating input + b_ptr : fp16[batch * n_cols] -- linear input + c_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + a_ptr: "ptr_type", + b_ptr: "ptr_type", + c_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = pto.index_cast(batch_i32) + n_cols = pto.index_cast(n_cols_i32) + + with pto.vector_section(): + # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. + + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = pto.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = pto.index_cast(num_blocks * sub_bnum) # number of vector cores + + # Distribute rows across cores (row-level parallelism). + rows_per_core = pto.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = pto.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_a = pto.as_tensor( + tensor_type, ptr=a_ptr, shape=[total_elems], strides=[c1] + ) + tv_b = pto.as_tensor( + tensor_type, ptr=b_ptr, shape=[total_elems], strides=[c1] + ) + tv_c = pto.as_tensor( + tensor_type, ptr=c_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + # Allocate 5 UB tiles (160 KB total, well under 192 KB UB). + tb_a = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_b = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) + + # Hoist ones computation: load first row, compute exp(a-a)=1 + sv_a_first = pto.slice_view( + subtensor_type, + source=tv_a, + offsets=[row_start * n_cols], + sizes=[n_cols], + ) + pto.load(sv_a_first, tb_a) + pto.sub(tb_a, tb_a, tb_tmp1) # tmp1 = 0 + pto.exp(tb_tmp1, tb_ones) # ones = 1.0 + + for row_i in pto.for_range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_a = pto.slice_view( + subtensor_type, + source=tv_a, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_b = pto.slice_view( + subtensor_type, + source=tv_b, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_c = pto.slice_view( + subtensor_type, + source=tv_c, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_a, tb_a) + pto.load(sv_b, tb_b) + + # gelu(a) = a * sigmoid(2a) + # sigmoid(2a) = exp(2a) / (exp(2a) + 1) + pto.add(tb_a, tb_a, tb_tmp1) # tmp1 = 2a + pto.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(2a) + pto.add(tb_tmp1, tb_ones, tb_tmp2) # tmp2 = exp(2a) + 1 + pto.div(tb_tmp1, tb_tmp2, tb_tmp1) # tmp1 = sigmoid(2a) + + # GEGLU: c = a * sigmoid(2a) * b + pto.mul(tb_a, tb_tmp1, tb_tmp1) # tmp1 = gelu(a) + pto.mul(tb_tmp1, tb_b, tb_tmp1) # tmp1 = c + pto.store(tb_tmp1, sv_c) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--fn-name", + default="geglu_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_geglu(fn_name=args.fn_name)) diff --git a/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py b/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py new file mode 100644 index 00000000..a180206c --- /dev/null +++ b/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py @@ -0,0 +1,121 @@ +import argparse +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # a + ctypes.c_void_p, # b + ctypes.c_void_p, # c (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def geglu_func(a, b, c, batch, n_cols, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(a), + torch_to_ctypes(b), + torch_to_ctypes(c), + batch, + n_cols, + ) + + return geglu_func + + +def geglu_ref(a, b): + """Reference GEGLU matching the PTO kernel. + + Computes c = gelu_approx(a) * b, where: + gelu_approx(a) = 0.5 * a * (1 + tanh(a)) + tanh(a) = (exp(2a) - 1) / (exp(2a) + 1) + + Note: This is a simplified tanh-based GELU (without the polynomial + inner argument used in the full approximation). It matches what the + PTO kernel computes using only tile-tile operations. + """ + a_f32 = a.float() + gelu_a = 0.5 * a_f32 * (1.0 + torch.tanh(a_f32)) + return gelu_a.to(a.dtype) * b + + +def test_geglu(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + geglu = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 4, 22, 65] + n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n_cols in n_cols_list: + # Use small range to stay within fp16 exp range (avoid overflow). + a = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4) + b = torch.randn(batch, n_cols, device=device, dtype=dtype) + c = torch.empty(batch, n_cols, device=device, dtype=dtype) + + y_ref = geglu_ref(a, b) + geglu(a, b, c, batch, n_cols) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(c, y_ref, rtol=1e-2, atol=1e-2) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n_cols, status, detail)) + + print(f"\ndetailed summary for {lib_path}:") + for batch, n_cols, status, detail in results: + msg = f" batch={batch}, n_cols={n_cols}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./geglu_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_geglu(args.lib, block_dim=args.block_dim) diff --git a/openenv/README.md b/openenv/README.md new file mode 100644 index 00000000..b684a9bf --- /dev/null +++ b/openenv/README.md @@ -0,0 +1,119 @@ +# openenv — Agentic Kernel Optimizer + +Uses Claude Opus with OpenEnv to iteratively edit, build, test, and benchmark PTO-DSL kernels until a faster-than-baseline implementation is found. + +## Quick start + +```bash +ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/hadamard_config.toml +``` + +The agent will: +1. Copy the kernel directory to a `_opt/` working directory (originals untouched) +2. Preload source files and reference docs into its context +3. Iteratively edit → build → test → benchmark in a loop +4. Stop when it achieves a speedup > 1.0x or exhausts its turn budget + +The optimized result is left in the `_opt/` directory. + +## CLI options + +``` +python openenv/agent_search.py + --config PATH Path to a kernel TOML config (default: openenv/hadamard_config.toml) + --max-turns N Max agent turns (default: 30) + --max-tokens N Max tokens per API call (default: 8192; overrides config) +``` + +## Config file reference + +Each kernel needs its own `.toml` config. See `hadamard_config.toml` or `geglu_config.toml` for complete examples. + +```toml +# --- Required --- + +# Path to the kernel source directory (relative to repo root, or absolute) +kernel_dir = "examples/aot/my_kernel" + +# Shell commands run from inside the working directory +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_my_kernel.py"] +bench_cmd = ["python", "_bench_wrapper.py"] # must print "latency_ms=" + +# Files snapshotted as baseline; restored on reset/revert +baseline_files = ["my_kernel_builder.py"] + +# --- Agent context --- + +# Primary file the agent is told to look at first +main_file = "my_kernel_builder.py" + +# Human-readable kernel name used in prompts +kernel_name = "my_kernel" + +# Free-form domain notes injected into the system prompt +tuning_notes = """ +- Key constraints, valid parameter ranges, hardware limits, etc. +""" + +# Files the agent must NOT modify (enforced via system prompt) +readonly_files = ["run_my_kernel.py", "caller.cpp"] + +# Files embedded verbatim in the system prompt at startup (saves read_file turns) +preload_files = ["my_kernel_builder.py", "compile.sh", "caller.cpp"] + +# Remote docs fetched at startup and embedded in the system prompt. +# GitHub blob URLs are auto-converted to raw.githubusercontent.com. +[[context_urls]] +url = "https://github.com/org/repo/blob/main/some_reference.td" +label = "Human-readable label shown in prompt" + +# --- Optional --- + +# Working directory for edits. Defaults to "{kernel_dir}_opt". +# work_dir = "examples/aot/my_kernel_opt" + +# Max tokens per API call (also settable via --max-tokens CLI flag) +# max_tokens = 8192 +``` + +## Adding a new kernel + +1. Make sure the kernel directory has: + - A build script (e.g. `compile.sh`) + - A test script (e.g. `run_my_kernel.py`) that exits 0 on pass + - A bench wrapper `_bench_wrapper.py` that prints `latency_ms=` to stdout + +2. Create a config file (copy `hadamard_config.toml` and adjust paths/notes) + +3. Run: + ```bash + ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/my_kernel_config.toml + ``` + +### Writing `_bench_wrapper.py` + +The bench wrapper is a plain Python script executed as a subprocess. It must print exactly one line of the form: + +``` +latency_ms=0.1234 +``` + +See `examples/aot/fast_hadamard/_bench_wrapper.py` or `examples/aot/geglu_dynamic_multicore/_bench_wrapper.py` for complete examples. Typical structure: + +```python +# load lib, allocate tensors +# warmup loop +# timed loop with per-iteration events +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") +``` + +## Files + +| File | Purpose | +|---|---| +| `agent_search.py` | Main entry point — agentic loop, config loading, tool executor | +| `kernel_opt_env.py` | `KernelSearchEnv` — edit/build/test/benchmark environment | +| `hadamard_config.toml` | Config for `examples/aot/fast_hadamard` | +| `geglu_config.toml` | Config for `examples/aot/geglu_dynamic_multicore` | diff --git a/openenv/__init__.py b/openenv/__init__.py new file mode 100644 index 00000000..634f885e --- /dev/null +++ b/openenv/__init__.py @@ -0,0 +1,3 @@ +from .kernel_opt_env import KernelAction, KernelObservation, KernelSearchEnv + +__all__ = ["KernelAction", "KernelObservation", "KernelSearchEnv"] diff --git a/openenv/agent_search.py b/openenv/agent_search.py new file mode 100644 index 00000000..e3e2d05e --- /dev/null +++ b/openenv/agent_search.py @@ -0,0 +1,407 @@ +""" +Agentic kernel optimizer. + +Uses Claude Opus with tool use to iteratively edit, build, test, and benchmark +PTO-DSL kernels until a faster-than-baseline implementation is found. + +Usage: + ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/kernel_config.toml + +The agent is given five tools that map 1-to-1 onto KernelSearchEnv actions: + read_file – inspect any file in the kernel directory + edit_file – apply a targeted text replacement to a source file + build – compile the current source to a shared library + run_tests – check correctness of the compiled kernel + benchmark – measure latency and compute speedup vs baseline + +Claude decides what to change, builds, verifies, and benchmarks in a loop. +""" + +import argparse +import shutil +import sys +import time +import tomllib +import urllib.request +from pathlib import Path + +import anthropic + +ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(ROOT)) + +from openenv import KernelSearchEnv, KernelAction + +# --------------------------------------------------------------------------- +# Config loading +# --------------------------------------------------------------------------- + +def load_config(config_path: str) -> dict: + with open(config_path, "rb") as f: + return tomllib.load(f) + + +def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]: + kernel_dir = Path(cfg["kernel_dir"]) + if not kernel_dir.is_absolute(): + kernel_dir = ROOT / kernel_dir + + # Resolve working directory (copy of originals — never touch kernel_dir itself) + if "work_dir" in cfg: + work_dir = Path(cfg["work_dir"]) + if not work_dir.is_absolute(): + work_dir = ROOT / work_dir + else: + work_dir = kernel_dir.parent / (kernel_dir.name + "_opt") + + if work_dir.exists(): + shutil.rmtree(work_dir) + shutil.copytree(kernel_dir, work_dir) + print(f"Working directory: {work_dir} (originals in {kernel_dir} are untouched)") + + baseline_files = { + name: (kernel_dir / name).read_text(encoding="utf-8") + for name in cfg["baseline_files"] + } + + def resolve_cmd(cmd: list[str]) -> list[str]: + # Replace bare "python" with the current interpreter + return [sys.executable if c == "python" else c for c in cmd] + + env = KernelSearchEnv( + repo_path=str(work_dir), + build_cmd=resolve_cmd(cfg["build_cmd"]), + test_cmd=resolve_cmd(cfg["test_cmd"]), + bench_cmd=resolve_cmd(cfg["bench_cmd"]), + baseline_files=baseline_files, + ) + return env, work_dir + + +# --------------------------------------------------------------------------- +# Parse args and initialise globals +# --------------------------------------------------------------------------- + +_parser = argparse.ArgumentParser(description="Agentic kernel optimizer") +_parser.add_argument( + "--config", default=str(Path(__file__).parent / "kernel_config.toml"), + help="Path to kernel TOML config (default: openenv/kernel_config.toml)", +) +_parser.add_argument("--max-turns", type=int, default=30) +_parser.add_argument("--max-tokens", type=int, default=None, + help="Max tokens per API call (overrides config; default: 8192)") +_args = _parser.parse_args() + +CFG = load_config(_args.config) +EXAMPLE_DIR: Path +env: KernelSearchEnv +env, EXAMPLE_DIR = build_env(CFG) + +# --------------------------------------------------------------------------- +# Tool definitions +# --------------------------------------------------------------------------- +TOOLS = [ + { + "name": "read_file", + "description": ( + "Read a source file from the kernel directory. " + "Use this first to understand the current implementation before proposing changes." + ), + "input_schema": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path relative to the kernel directory, e.g. 'add_builder.py'", + } + }, + "required": ["path"], + }, + }, + { + "name": "edit_file", + "description": ( + "Replace a unique snippet of text in a source file with new text. " + "'old' must match exactly (including whitespace). " + "Make one focused change at a time." + ), + "input_schema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "Relative file path"}, + "old": {"type": "string", "description": "Exact text to find and replace"}, + "new": {"type": "string", "description": "Replacement text"}, + }, + "required": ["path", "old", "new"], + }, + }, + { + "name": "build", + "description": "Compile the current source files into a shared library (.so). Must be called after every edit before testing or benchmarking.", + "input_schema": {"type": "object", "properties": {}, "required": []}, + }, + { + "name": "run_tests", + "description": "Run the correctness test suite. Returns pass/fail and any error output. Always call this after build before benchmark.", + "input_schema": {"type": "object", "properties": {}, "required": []}, + }, + { + "name": "benchmark", + "description": "Measure the kernel latency and compute speedup vs the baseline. Returns latency_ms and speedup_vs_baseline.", + "input_schema": {"type": "object", "properties": {}, "required": []}, + }, +] + +# --------------------------------------------------------------------------- +# Tool executor (bridges Claude tool calls → KernelSearchEnv) +# --------------------------------------------------------------------------- +def execute_tool(name: str, tool_input: dict) -> str: + if name == "read_file": + path = EXAMPLE_DIR / tool_input["path"] + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError: + return f"Error: file not found: {tool_input['path']}" + + if name == "edit_file": + for key in ("path", "old", "new"): + if key not in tool_input: + return f"Error: edit_file requires '{key}' parameter." + obs = env.step(KernelAction("edit", { + "path": tool_input["path"], + "old": tool_input["old"], + "new": tool_input["new"], + })) + return obs.summary + + if name == "build": + obs = env.step(KernelAction("build", {})) + return obs.summary + + if name == "run_tests": + obs = env.step(KernelAction("test", {})) + status = "PASS" if obs.passed_tests else "FAIL" + return f"{status}\n{obs.summary}" + + if name == "benchmark": + obs = env.step(KernelAction("benchmark", {})) + if obs.latency_ms is None: + return f"Benchmark failed: {obs.summary}" + return ( + f"latency_ms={obs.latency_ms:.4f} " + f"speedup={obs.speedup_vs_baseline:.4f}x vs baseline ({env.baseline_ms:.4f} ms)\n" + f"Best so far: {env.best_ms:.4f} ms ({env.best_speedup:.4f}x)" + ) + + return f"Unknown tool: {name}" + +# --------------------------------------------------------------------------- +# System prompt (built from config) +# --------------------------------------------------------------------------- +import textwrap as _textwrap + + +def _github_to_raw(url: str) -> str: + """Convert a github.com blob URL to raw.githubusercontent.com.""" + if "github.com" in url and "/blob/" in url: + url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/") + return url + + +def _fetch_url(url: str, max_chars: int = 40_000) -> str: + url = _github_to_raw(url) + try: + with urllib.request.urlopen(url, timeout=15) as resp: + content = resp.read().decode("utf-8", errors="replace") + if len(content) > max_chars: + content = content[:max_chars] + f"\n... (truncated at {max_chars} chars)" + return content + except Exception as exc: + return f"(fetch failed: {exc})" + + +def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: + name = cfg.get("kernel_name", "kernel") + main_file = cfg.get("main_file", "builder.py") + tuning = cfg.get("tuning_notes", "").strip() + readonly = cfg.get("readonly_files", []) + + readonly_rule = ( + f"- Never modify these files (test harness): {', '.join(readonly)}." + if readonly else "" + ) + + files = sorted(f.name for f in kernel_dir.iterdir() if f.is_file()) + file_listing = ", ".join(files) + + # Embed preload_files verbatim so the agent doesn't waste turns reading them + preload_section = "" + for fname in cfg.get("preload_files", []): + fpath = kernel_dir / fname + try: + content = fpath.read_text(encoding="utf-8") + preload_section += f"\n### {fname}\n```\n{content}\n```\n" + except FileNotFoundError: + preload_section += f"\n### {fname}\n(file not found)\n" + + if preload_section: + preload_section = "\n## Pre-loaded files (do NOT re-read these)\n" + preload_section + + # Fetch remote reference docs and embed them + url_section = "" + for entry in cfg.get("context_urls", []): + if isinstance(entry, dict): + url, label = entry["url"], entry.get("label", entry["url"]) + else: + url, label = entry, entry + print(f" Fetching context: {label} …") + content = _fetch_url(url) + url_section += f"\n### {label}\n```\n{content}\n```\n" + if url_section: + url_section = "\n## Reference documentation (read-only, do not edit)\n" + url_section + + return _textwrap.dedent(f"""\ + You are an expert NPU kernel engineer specialising in Ascend/PTO-DSL performance tuning. + + Your task is to find a version of the {name} kernel that is FASTER than the baseline. + + The kernel directory contains exactly these files: {file_listing} + Do NOT attempt to read any other filenames — they do not exist. + {preload_section}{url_section} + ## Your workflow + 1. Propose one targeted change based on the pre-loaded files above + 2. edit_file(...) + 3. build() — check for compile errors + 4. run_tests() — ensure correctness + 5. benchmark() — measure speedup + + Repeat until speedup > 1.0 (faster than baseline) or you have exhausted ideas. + + ## Key tuning notes + {tuning} + + ## Rules + {readonly_rule} + - One edit at a time; build and verify after each change. + - If a build or test fails, revert the change with another edit_file call + and try a different approach. + - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked. + """) + +SYSTEM = _build_system_prompt(CFG, EXAMPLE_DIR) + +# --------------------------------------------------------------------------- +# Agentic loop +# --------------------------------------------------------------------------- +def run_agent(max_turns: int = 30) -> None: + # Establish baseline + print("Establishing baseline …") + obs = env.reset() + print(f"Baseline: {obs.latency_ms:.4f} ms\n") + + client = anthropic.Anthropic() + preloaded = CFG.get("preload_files", []) + if preloaded: + start_hint = f"The source files are already in your context above — start proposing a change directly." + else: + main_file = CFG.get("main_file", "builder.py") + start_hint = f"Start by reading {main_file}." + + messages: list[dict] = [ + { + "role": "user", + "content": ( + f"The baseline kernel latency is {obs.latency_ms:.4f} ms. " + f"Please find a faster implementation. {start_hint}" + ), + } + ] + + turn = 0 + while turn < max_turns: + turn += 1 + print(f"\n{'─'*60}") + print(f"Turn {turn}/{max_turns}") + print(f"{'─'*60}") + + max_tokens = _args.max_tokens or CFG.get("max_tokens", 8192) + for attempt in range(5): + try: + with client.messages.stream( + model="claude-opus-4-6", + max_tokens=max_tokens, + thinking={"type": "adaptive"}, + system=SYSTEM, + tools=TOOLS, + messages=messages, + ) as stream: + response = stream.get_final_message() + break + except anthropic.RateLimitError as e: + wait = 60 * (attempt + 1) + print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})") + time.sleep(wait) + else: + raise RuntimeError("Rate limit retries exhausted") + + # Print any text Claude produced + for block in response.content: + if block.type == "text" and block.text.strip(): + print(f"\n[Claude] {block.text.strip()}") + + # Append assistant turn — strip trailing thinking blocks (API rejects them as final block) + content = list(response.content) + while content and getattr(content[-1], "type", None) == "thinking": + content.pop() + if content: + messages.append({"role": "assistant", "content": content}) + + # Done if no tool calls + if response.stop_reason == "end_turn": + print("\n[Agent] Claude finished.") + break + + # Execute tool calls + tool_results = [] + for block in response.content: + if block.type != "tool_use": + continue + + print(f"\n → {block.name}({_fmt_input(block.input)})") + result = execute_tool(block.name, block.input) + print(f" ← {result[:300]}" + ("…" if len(result) > 300 else "")) + + tool_results.append({ + "type": "tool_result", + "tool_use_id": block.id, + "content": result, + }) + + if tool_results: + messages.append({"role": "user", "content": tool_results}) + + # Early exit if we already have a confirmed speedup + if env.best_speedup is not None and env.best_speedup > 1.0: + print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.") + break + + # Final summary + print(f"\n{'='*60}") + print("SEARCH COMPLETE") + print(f"{'='*60}") + print(f" Baseline: {env.baseline_ms:.4f} ms") + print(f" Best found: {env.best_ms:.4f} ms") + print(f" Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else " No improvement found.") + + +def _fmt_input(inp: dict) -> str: + """Compact one-line representation of tool input for logging.""" + parts = [] + for k, v in inp.items(): + s = str(v).replace("\n", "\\n") + parts.append(f"{k}={s[:60]!r}" if len(s) > 60 else f"{k}={s!r}") + return ", ".join(parts) + + +if __name__ == "__main__": + run_agent(max_turns=_args.max_turns) diff --git a/openenv/geglu_config.toml b/openenv/geglu_config.toml new file mode 100644 index 00000000..125ce6cb --- /dev/null +++ b/openenv/geglu_config.toml @@ -0,0 +1,37 @@ +# Kernel optimizer configuration — GEGLU dynamic multicore +# Usage: python openenv/agent_search.py --config openenv/geglu_config.toml + +kernel_dir = "examples/aot/geglu_dynamic_multicore" + +# work_dir = "examples/aot/geglu_dynamic_multicore_opt" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_geglu.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +baseline_files = ["geglu_builder.py"] +main_file = "geglu_builder.py" +kernel_name = "geglu" + +tuning_notes = """ +- Computes c = gelu_approx(a) * b (in-place tanh-GELU gate, fp16). +- ELEMENTS_PER_TILE = 16384 (32KB UB / sizeof(fp16)). n_cols must be <= this. +- Currently uses 5 UB tiles (160 KB / 192 KB). One extra tile is available. +- Constants 1.0 and 2.0 are derived via exp(a-a)=1 (no scalar-tile broadcast). +- Rows are distributed across vector cores; rows_per_core = ceil(batch / num_cores). +- Benchmark config: batch=1024, n_cols=8192, block_dim=24 (see _bench_wrapper.py). +- Key tuning ideas: reduce ops per row (reuse intermediates), pipeline loads with + compute using double-buffering, or explore different tile sizes. +""" + +readonly_files = ["run_geglu.py", "caller.cpp"] + +preload_files = ["geglu_builder.py", "compile.sh", "caller.cpp"] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td" +label = "PTO MLIR op definitions (PTOOps.td)" + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings (pto.py)" diff --git a/openenv/hadamard_config.toml b/openenv/hadamard_config.toml new file mode 100644 index 00000000..d950dd3f --- /dev/null +++ b/openenv/hadamard_config.toml @@ -0,0 +1,54 @@ +# Kernel optimizer configuration +# Pass to agent_search.py with: python agent_search.py --config kernel_config.toml + +# Path to the kernel directory (relative to repo root, or absolute) +kernel_dir = "examples/aot/fast_hadamard" + +# Working directory where edits happen (original files are never touched). +# Relative paths are resolved from repo root. Defaults to "{kernel_dir}_opt". +# work_dir = "examples/aot/fast_hadamard_opt" + +# Shell commands (relative paths resolved from kernel_dir) +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_hadamard.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +# Files whose content is snapshotted as the baseline for reset/revert +baseline_files = ["hadamard_builder.py"] + +# The first file the agent should read to understand the kernel +main_file = "hadamard_builder.py" + +# Human-readable name used in prompts +kernel_name = "fast_hadamard" + +# Free-form tuning notes injected into the system prompt +tuning_notes = """ +- The kernel operates on fp16 tensors in-place: shape (batch, N), N must be a power of 2. +- ELEMENTS_PER_TILE = 16384 (32KB UB / sizeof(fp16)). N must be <= this. +- samples_per_load controls how many rows are processed per chunk (currently 1). + Increasing it may improve throughput if UB pressure allows. +- Two ping/pong tile sets (event_id 0/1) are used to hide DMA latency. +- The auto-sync variant relies on `ptoas --enable-insert-sync`; no manual barriers needed. +- Benchmark config: batch=32, N=8192, block_dim=24 (see _bench_wrapper.py). +""" + +# Files the agent must NOT modify (enforced via system-prompt rules) +readonly_files = ["run_hadamard.py", "caller.cpp"] + +# Files embedded verbatim in the system prompt so the agent starts with full context +# (saves turns that would otherwise be spent on read_file calls) +preload_files = ["hadamard_builder.py", "compile.sh", "caller.cpp"] + +# Remote reference docs fetched at startup and embedded in the system prompt. +# GitHub blob URLs are auto-converted to raw. gitcode.com is not fetchable. +# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/docs/isa +# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/include/pto/npu/a2a3 +# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/blob/master/tests/npu/a2a3/src/st/testcase/ +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td" +label = "PTO MLIR op definitions (PTOOps.td)" + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings (pto.py)" diff --git a/openenv/kernel_opt_env.py b/openenv/kernel_opt_env.py new file mode 100644 index 00000000..503b99ac --- /dev/null +++ b/openenv/kernel_opt_env.py @@ -0,0 +1,221 @@ +import subprocess +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class KernelAction: + kind: str # "edit", "build", "test", "benchmark" + payload: dict + + +@dataclass +class KernelObservation: + passed_tests: bool + latency_ms: float | None + speedup_vs_baseline: float | None + summary: str + + +class KernelSearchEnv: + """ + Optimization environment for finding fast NPU kernels. + + env = KernelSearchEnv( + repo_path = "path/to/kernel/dir", + build_cmd = ["bash", "compile.sh"], + test_cmd = ["python", "run.py"], + bench_cmd = ["python", "bench.py"], # must print "latency_ms=" + baseline_files = {"builder.py": ""}, + ) + obs = env.reset() + obs = env.step(KernelAction("edit", {"path": "builder.py", "old": "...", "new": "..."})) + obs = env.step(KernelAction("test", {})) + obs = env.step(KernelAction("benchmark", {})) + """ + + def __init__( + self, + repo_path: str, + test_cmd: list[str], + bench_cmd: list[str], + build_cmd: list[str] | None = None, + baseline_files: dict[str, str] | None = None, + ): + self._root = Path(repo_path) + self._test_cmd = test_cmd + self._bench_cmd = bench_cmd + self._build_cmd = build_cmd + self._baseline_files: dict[str, str] = baseline_files or {} + self._baseline_ms: float | None = None + self._best_ms: float | None = None + + # ------------------------------------------------------------------ + # Public interface + # ------------------------------------------------------------------ + + def reset(self) -> KernelObservation: + """Restore baseline sources, recompile, and benchmark.""" + self._restore_baseline() + if self._build_cmd: + ok, out = self._run_cmd(self._build_cmd) + if not ok: + return self._error_obs(f"Baseline build failed:\n{out}") + latency = self._run_benchmark() + self._baseline_ms = latency + self._best_ms = latency + return KernelObservation( + passed_tests=True, + latency_ms=latency, + speedup_vs_baseline=1.0, + summary="Baseline ready", + ) + + def step(self, action: KernelAction) -> KernelObservation: + dispatch = { + "edit": self._handle_edit, + "build": self._handle_build, + "test": self._handle_test, + "benchmark": self._handle_benchmark, + } + handler = dispatch.get(action.kind) + if handler is None: + raise ValueError(f"Unknown action kind {action.kind!r}. Valid: {list(dispatch)}") + return handler(action.payload) + + # ------------------------------------------------------------------ + # Action handlers + # ------------------------------------------------------------------ + + def _handle_edit(self, payload: dict) -> KernelObservation: + rel = payload.get("path") + if not rel: + return self._error_obs("edit payload must include 'path'") + + target = self._root / rel + target.parent.mkdir(parents=True, exist_ok=True) + + if "content" in payload: + target.write_text(payload["content"], encoding="utf-8") + elif "old" in payload and "new" in payload: + original = target.read_text(encoding="utf-8") + if payload["old"] not in original: + return self._error_obs(f"'old' string not found in {rel}; edit not applied.") + target.write_text(original.replace(payload["old"], payload["new"], 1), encoding="utf-8") + else: + return self._error_obs("edit payload must have 'content' or both 'old'+'new'") + + return KernelObservation( + passed_tests=False, + latency_ms=None, + speedup_vs_baseline=None, + summary=f"Applied edit to {rel}. Run 'test' to verify correctness.", + ) + + def _handle_build(self, payload: dict) -> KernelObservation: + cmd = payload.get("cmd") or self._build_cmd + if not cmd: + return self._error_obs("No build_cmd configured and none provided in payload.") + ok, output = self._run_cmd(cmd) + return KernelObservation( + passed_tests=ok, + latency_ms=None, + speedup_vs_baseline=None, + summary=output if not ok else f"Build succeeded.\n{output}".strip(), + ) + + def _handle_test(self, payload: dict) -> KernelObservation: + cmd = payload.get("cmd") or self._test_cmd + ok, output = self._run_cmd(cmd) + return KernelObservation( + passed_tests=ok, + latency_ms=None, + speedup_vs_baseline=None, + summary=output if not ok else f"All tests passed.\n{output}".strip(), + ) + + def _handle_benchmark(self, payload: dict) -> KernelObservation: + cmd = payload.get("cmd") or self._bench_cmd + ok, output = self._run_cmd(cmd) + if not ok: + return KernelObservation( + passed_tests=False, latency_ms=None, speedup_vs_baseline=None, + summary=f"Benchmark failed:\n{output}", + ) + + latency = self._parse_latency(output) + if latency is None: + return self._error_obs( + f"Could not parse 'latency_ms=' from bench output:\n{output}" + ) + + speedup = self._baseline_ms / latency if self._baseline_ms else None + if self._best_ms is None or latency < self._best_ms: + self._best_ms = latency + + return KernelObservation( + passed_tests=True, + latency_ms=latency, + speedup_vs_baseline=speedup, + summary=self._bench_summary(latency, speedup), + ) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _restore_baseline(self): + for rel, content in self._baseline_files.items(): + target = self._root / rel + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + + def _run_cmd(self, cmd: list[str] | str) -> tuple[bool, str]: + if isinstance(cmd, str): + cmd = cmd.split() + result = subprocess.run(cmd, cwd=str(self._root), capture_output=True, text=True) + return result.returncode == 0, (result.stdout + result.stderr).strip() + + def _run_benchmark(self) -> float | None: + ok, output = self._run_cmd(self._bench_cmd) + return self._parse_latency(output) if ok else None + + @staticmethod + def _parse_latency(output: str) -> float | None: + for line in reversed(output.splitlines()): + if line.strip().startswith("latency_ms="): + try: + return float(line.strip().split("=", 1)[1]) + except ValueError: + pass + return None + + def _bench_summary(self, latency: float, speedup: float | None) -> str: + parts = [f"latency_ms={latency:.3f}"] + if speedup is not None: + parts.append(f"speedup={speedup:.3f}x vs baseline") + if self._best_ms is not None: + parts.append(f"best so far: {self._best_ms:.3f} ms") + return " ".join(parts) + + @staticmethod + def _error_obs(msg: str) -> KernelObservation: + return KernelObservation(passed_tests=False, latency_ms=None, speedup_vs_baseline=None, summary=msg) + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def baseline_ms(self) -> float | None: + return self._baseline_ms + + @property + def best_ms(self) -> float | None: + return self._best_ms + + @property + def best_speedup(self) -> float | None: + if self._baseline_ms and self._best_ms: + return self._baseline_ms / self._best_ms + return None From 4c246c9313ea662b330f6f1d697a1c92c10efa83 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Mon, 9 Mar 2026 14:45:09 +0000 Subject: [PATCH 2/6] rebased on refactoring and moved the agent kernels to a new subfolder of examples --- .../{aot => agent}/geglu_dynamic_multicore_opt/.gitignore | 0 .../{aot => agent}/geglu_dynamic_multicore_opt/README.md | 0 .../geglu_dynamic_multicore_opt/_bench_wrapper.py | 0 .../geglu_dynamic_multicore_opt/bench_geglu.py | 0 .../{aot => agent}/geglu_dynamic_multicore_opt/caller.cpp | 0 .../{aot => agent}/geglu_dynamic_multicore_opt/compile.sh | 0 .../geglu_dynamic_multicore_opt/geglu_builder.py | 6 +++--- .../{aot => agent}/geglu_dynamic_multicore_opt/run_geglu.py | 0 openenv/agent_search.py | 2 +- 9 files changed, 4 insertions(+), 4 deletions(-) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/.gitignore (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/README.md (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/_bench_wrapper.py (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/bench_geglu.py (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/caller.cpp (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/compile.sh (100%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/geglu_builder.py (98%) rename examples/{aot => agent}/geglu_dynamic_multicore_opt/run_geglu.py (100%) diff --git a/examples/aot/geglu_dynamic_multicore_opt/.gitignore b/examples/agent/geglu_dynamic_multicore_opt/.gitignore similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/.gitignore rename to examples/agent/geglu_dynamic_multicore_opt/.gitignore diff --git a/examples/aot/geglu_dynamic_multicore_opt/README.md b/examples/agent/geglu_dynamic_multicore_opt/README.md similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/README.md rename to examples/agent/geglu_dynamic_multicore_opt/README.md diff --git a/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py b/examples/agent/geglu_dynamic_multicore_opt/_bench_wrapper.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py rename to examples/agent/geglu_dynamic_multicore_opt/_bench_wrapper.py diff --git a/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py b/examples/agent/geglu_dynamic_multicore_opt/bench_geglu.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py rename to examples/agent/geglu_dynamic_multicore_opt/bench_geglu.py diff --git a/examples/aot/geglu_dynamic_multicore_opt/caller.cpp b/examples/agent/geglu_dynamic_multicore_opt/caller.cpp similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/caller.cpp rename to examples/agent/geglu_dynamic_multicore_opt/caller.cpp diff --git a/examples/aot/geglu_dynamic_multicore_opt/compile.sh b/examples/agent/geglu_dynamic_multicore_opt/compile.sh similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/compile.sh rename to examples/agent/geglu_dynamic_multicore_opt/compile.sh diff --git a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py b/examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py similarity index 98% rename from examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py rename to examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py index 4dae7890..3a70cc97 100644 --- a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py +++ b/examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py @@ -1,7 +1,7 @@ -from ptodsl import to_ir_module -import ptodsl.language as pto +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s -const = pto.const +const = s.const # 32 KB of UB / sizeof(fp16) = 16384 elements per tile ELEMENTS_PER_TILE = 32 * 1024 // 2 diff --git a/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py b/examples/agent/geglu_dynamic_multicore_opt/run_geglu.py similarity index 100% rename from examples/aot/geglu_dynamic_multicore_opt/run_geglu.py rename to examples/agent/geglu_dynamic_multicore_opt/run_geglu.py diff --git a/openenv/agent_search.py b/openenv/agent_search.py index e3e2d05e..f2b3403b 100644 --- a/openenv/agent_search.py +++ b/openenv/agent_search.py @@ -52,7 +52,7 @@ def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]: if not work_dir.is_absolute(): work_dir = ROOT / work_dir else: - work_dir = kernel_dir.parent / (kernel_dir.name + "_opt") + work_dir = ROOT / "examples" / "agent" / kernel_dir.name if work_dir.exists(): shutil.rmtree(work_dir) From cb4ddee5c4b7a6f9f0cd99d434a6702ad011cd44 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Mon, 9 Mar 2026 15:36:19 +0000 Subject: [PATCH 3/6] added create mode, see mish and silu toml files --- examples/agent/mish/_bench_wrapper.py | 68 ++++++++ examples/agent/mish/caller.cpp | 24 +++ examples/agent/mish/compile.sh | 22 +++ examples/agent/mish/mish.cpp | 96 +++++++++++ examples/agent/mish/mish.pto | 60 +++++++ examples/agent/mish/mish_builder.py | 174 +++++++++++++++++++ examples/agent/mish/run_mish.py | 114 +++++++++++++ examples/agent/silu/_bench_wrapper.py | 68 ++++++++ examples/agent/silu/caller.cpp | 24 +++ examples/agent/silu/compile.sh | 22 +++ examples/agent/silu/run_silu.py | 113 +++++++++++++ examples/agent/silu/silu.cpp | 91 ++++++++++ examples/agent/silu/silu.pto | 58 +++++++ examples/agent/silu/silu_builder.py | 154 +++++++++++++++++ openenv/README.md | 6 + openenv/agent_search.py | 230 +++++++++++++++++++++----- openenv/geglu_config.toml | 9 + openenv/hadamard_config.toml | 9 + openenv/hardswish_config.toml | 48 ++++++ openenv/kernel_opt_env.py | 2 +- openenv/mish_config.toml | 46 ++++++ openenv/silu_config.toml | 42 +++++ 22 files changed, 1436 insertions(+), 44 deletions(-) create mode 100644 examples/agent/mish/_bench_wrapper.py create mode 100644 examples/agent/mish/caller.cpp create mode 100644 examples/agent/mish/compile.sh create mode 100644 examples/agent/mish/mish.cpp create mode 100644 examples/agent/mish/mish.pto create mode 100644 examples/agent/mish/mish_builder.py create mode 100644 examples/agent/mish/run_mish.py create mode 100644 examples/agent/silu/_bench_wrapper.py create mode 100644 examples/agent/silu/caller.cpp create mode 100644 examples/agent/silu/compile.sh create mode 100644 examples/agent/silu/run_silu.py create mode 100644 examples/agent/silu/silu.cpp create mode 100644 examples/agent/silu/silu.pto create mode 100644 examples/agent/silu/silu_builder.py create mode 100644 openenv/hardswish_config.toml create mode 100644 openenv/mish_config.toml create mode 100644 openenv/silu_config.toml diff --git a/examples/agent/mish/_bench_wrapper.py b/examples/agent/mish/_bench_wrapper.py new file mode 100644 index 00000000..cb4af27f --- /dev/null +++ b/examples/agent/mish/_bench_wrapper.py @@ -0,0 +1,68 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads mish_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape — change to target a different operating point +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./mish_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(x, y): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + torch_to_ctypes(y), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +# Separate tensors per iteration to reduce cache reuse +xs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)] +y = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(xs_[i], y) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs_[WARMUP + i], y) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/agent/mish/caller.cpp b/examples/agent/mish/caller.cpp new file mode 100644 index 00000000..47fe846c --- /dev/null +++ b/examples/agent/mish/caller.cpp @@ -0,0 +1,24 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "mish.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/agent/mish/compile.sh b/examples/agent/mish/compile.sh new file mode 100644 index 00000000..233c4952 --- /dev/null +++ b/examples/agent/mish/compile.sh @@ -0,0 +1,22 @@ +set -e + +rm -f mish.pto mish.cpp mish_lib.so + +python ./mish_builder.py > ./mish.pto +ptoas --enable-insert-sync ./mish.pto -o ./mish.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"mish.cpp\"" \ + ./caller.cpp \ + -o ./mish_lib.so diff --git a/examples/agent/mish/mish.cpp b/examples/agent/mish/mish.cpp new file mode 100644 index 00000000..35991bc0 --- /dev/null +++ b/examples/agent/mish/mish.cpp @@ -0,0 +1,96 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) { + unsigned v5 = 1; + unsigned v6 = 0; + int32_t v7 = 16384; + int32_t v8 = 1; + int32_t v9 = 0; + int64_t v10 = 0; + int64_t v11 = 32768; + int64_t v12 = 65536; + int64_t v13 = 98304; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + if (v4 > v9) { + if (v4 <= v7) { + int64_t v14 = get_block_idx(); + int64_t v15 = get_subblockid(); + int64_t v16 = get_subblockdim(); + int64_t v17 = (int64_t) v16; + int64_t v18 = get_block_num(); + int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17); + int32_t v20 = v3 / v19; + int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20; + int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21); + int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21); + int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22); + int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4); + if (v24 > v9) { + Tile v26 = Tile(v4); + TASSIGN(v26, v10); + Tile v27 = Tile(v4); + TASSIGN(v27, v11); + Tile v28 = Tile(v4); + TASSIGN(v28, v12); + Tile v29 = Tile(v4); + TASSIGN(v29, v13); + for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) { + int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4); + unsigned v32 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34); + unsigned v36 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v26, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TSUB(v28, v26, v26); + pipe_barrier(PIPE_V); + TEXP(v27, v28); + pipe_barrier(PIPE_V); + TEXP(v28, v26); + pipe_barrier(PIPE_V); + TADD(v28, v27, v28); + pipe_barrier(PIPE_V); + TMUL(v29, v28, v28); + pipe_barrier(PIPE_V); + TSUB(v28, v29, v27); + pipe_barrier(PIPE_V); + TADD(v29, v29, v27); + pipe_barrier(PIPE_V); + TDIV(v28, v28, v29); + pipe_barrier(PIPE_V); + TMUL(v28, v26, v28); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v39, v28); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + }; + }; + } + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + return; +} + diff --git a/examples/agent/mish/mish.pto b/examples/agent/mish/mish.pto new file mode 100644 index 00000000..09e00dd9 --- /dev/null +++ b/examples/agent/mish/mish.pto @@ -0,0 +1,60 @@ +module { + func.func @_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: i32) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16384 = arith.constant 16384 : index + %0 = arith.index_cast %arg2 : i32 to index + %1 = arith.index_cast %arg3 : i32 to index + pto.section.vector { + %2 = arith.cmpi sgt, %1, %c0 : index + scf.if %2 { + %3 = arith.cmpi sge, %c16384, %1 : index + scf.if %3 { + %4 = pto.get_block_idx + %5 = pto.get_subblock_idx + %6 = pto.get_subblock_num + %7 = pto.get_block_num + %8 = arith.muli %4, %6 : i64 + %9 = arith.addi %8, %5 : i64 + %10 = arith.index_cast %9 : i64 to index + %11 = arith.muli %7, %6 : i64 + %12 = arith.index_cast %11 : i64 to index + %13 = arith.ceildivsi %0, %12 : index + %14 = arith.muli %10, %13 : index + %15 = arith.addi %14, %13 : index + %16 = arith.minui %15, %0 : index + %17 = arith.subi %16, %14 : index + %18 = arith.muli %0, %1 : index + %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view + %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view + %21 = arith.cmpi sgt, %17, %c0 : index + scf.if %21 { + %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + scf.for %arg4 = %c0 to %17 step %c1 { + %26 = arith.addi %14, %arg4 : index + %27 = arith.muli %26, %1 : index + %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf) + pto.tsub ins(%22, %22 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.texp ins(%24 : !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.texp ins(%22 : !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%23, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tmul ins(%24, %24 : !pto.tile_buf, !pto.tile_buf) outs(%25 : !pto.tile_buf) + pto.tsub ins(%25, %23 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%25, %23 : !pto.tile_buf, !pto.tile_buf) outs(%25 : !pto.tile_buf) + pto.tdiv ins(%24, %25 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tmul ins(%22, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tstore ins(%24 : !pto.tile_buf) outs(%29 : !pto.partition_tensor_view<1x16384xf16>) + } + } + } + } + } + return + } +} + diff --git a/examples/agent/mish/mish_builder.py b/examples/agent/mish/mish_builder.py new file mode 100644 index 00000000..85dd381e --- /dev/null +++ b/examples/agent/mish/mish_builder.py @@ -0,0 +1,174 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_mish(fn_name="mish_fp16"): + """ + Build a dynamic-batch Mish kernel in PTO DSL. + + Computes y = x * tanh(softplus(x)), where: + softplus(x) = ln(1 + exp(x)) + tanh(softplus(x)) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1) + + This avoids ln() by using the algebraic identity: + exp(2*ln(u)) = u^2 + + Constants (1.0) are derived from the input tile itself using + the identity exp(x - x) = exp(0) = 1.0, which avoids the need for + scalar-tile broadcast operations not available in PTO DSL. + + UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB): + tb_x : input row x + tb_ones : constant 1.0 (recomputed each row via exp(x-x)) + tb_tmp1 : intermediate / final output + tb_tmp2 : intermediate + + Kernel args: + x_ptr : fp16[batch * n_cols] -- input + y_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) + num_cores = s.index_cast(num_blocks * sub_bnum) + + # Distribute rows across cores (row-level parallelism). + rows_per_core = s.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + # Allocate 4 UB tiles (128 KB total, well under 192 KB UB). + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_x = pto.slice_view( + subtensor_type, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_y = pto.slice_view( + subtensor_type, + source=tv_y, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_x, tb_x) + + # Derive constant 1.0 from data: + # x - x = 0 => exp(0) = 1.0 + tile.sub(tb_x, tb_x, tb_tmp1) # tmp1 = 0.0 + tile.exp(tb_tmp1, tb_ones) # ones = 1.0 + + # Compute mish(x) = x * tanh(softplus(x)) + # Using identity: tanh(ln(1+exp(x))) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1) + + # Step 1: exp(x) + tile.exp(tb_x, tb_tmp1) # tmp1 = exp(x) + + # Step 2: u = 1 + exp(x) + tile.add(tb_ones, tb_tmp1, tb_tmp1) # tmp1 = 1 + exp(x) + + # Step 3: u^2 = (1 + exp(x))^2 + tile.mul(tb_tmp1, tb_tmp1, tb_tmp2) # tmp2 = (1+exp(x))^2 + + # Step 4: numerator = u^2 - 1 + tile.sub(tb_tmp2, tb_ones, tb_tmp1) # tmp1 = (1+exp(x))^2 - 1 + + # Step 5: denominator = u^2 + 1 + tile.add(tb_tmp2, tb_ones, tb_tmp2) # tmp2 = (1+exp(x))^2 + 1 + + # Step 6: tanh(softplus(x)) = num / den + tile.div(tb_tmp1, tb_tmp2, tb_tmp1) # tmp1 = tanh(softplus(x)) + + # Step 7: mish(x) = x * tanh(softplus(x)) + tile.mul(tb_x, tb_tmp1, tb_tmp1) # tmp1 = mish(x) + + pto.store(tb_tmp1, sv_y) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--fn-name", + default="mish_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_mish(fn_name=args.fn_name)) diff --git a/examples/agent/mish/run_mish.py b/examples/agent/mish/run_mish.py new file mode 100644 index 00000000..d4ca82ad --- /dev/null +++ b/examples/agent/mish/run_mish.py @@ -0,0 +1,114 @@ +import argparse +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def mish_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(y), + batch, + n_cols, + ) + + return mish_func + + +def mish_ref(x): + """Reference Mish matching the PTO kernel. + + Computes y = x * tanh(softplus(x)), where: + softplus(x) = ln(1 + exp(x)) + tanh(softplus(x)) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1) + + Uses torch.nn.functional.mish on float32 for reference. + """ + return torch.nn.functional.mish(x.float()).to(x.dtype) + + +def test_mish(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + mish = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 4, 22, 65] + n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n_cols in n_cols_list: + # Use small range to stay within fp16 exp range (avoid overflow). + x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4) + y = torch.empty(batch, n_cols, device=device, dtype=dtype) + + y_ref = mish_ref(x) + mish(x, y, batch, n_cols) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n_cols, status, detail)) + + print(f"\ndetailed summary for {lib_path}:") + for batch, n_cols, status, detail in results: + msg = f" batch={batch}, n_cols={n_cols}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./mish_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_mish(args.lib, block_dim=args.block_dim) diff --git a/examples/agent/silu/_bench_wrapper.py b/examples/agent/silu/_bench_wrapper.py new file mode 100644 index 00000000..a81933b1 --- /dev/null +++ b/examples/agent/silu/_bench_wrapper.py @@ -0,0 +1,68 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads silu_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./silu_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(x, y): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + torch_to_ctypes(y), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +# Separate tensors per iteration to reduce cache reuse +xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)] +y = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(xs[i], y) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs[WARMUP + i], y) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/agent/silu/caller.cpp b/examples/agent/silu/caller.cpp new file mode 100644 index 00000000..3f3d74aa --- /dev/null +++ b/examples/agent/silu/caller.cpp @@ -0,0 +1,24 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "silu.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/agent/silu/compile.sh b/examples/agent/silu/compile.sh new file mode 100644 index 00000000..0ddb0e7d --- /dev/null +++ b/examples/agent/silu/compile.sh @@ -0,0 +1,22 @@ +set -e + +rm -f silu.pto silu.cpp silu_lib.so + +python ./silu_builder.py > ./silu.pto +ptoas --enable-insert-sync ./silu.pto -o ./silu.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"silu.cpp\"" \ + ./caller.cpp \ + -o ./silu_lib.so diff --git a/examples/agent/silu/run_silu.py b/examples/agent/silu/run_silu.py new file mode 100644 index 00000000..f71fec06 --- /dev/null +++ b/examples/agent/silu/run_silu.py @@ -0,0 +1,113 @@ +import argparse +import ctypes + +import torch +import torch.nn.functional as F +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def silu_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(y), + batch, + n_cols, + ) + + return silu_func + + +def silu_ref(x): + """Reference SiLU matching the PTO kernel. + + Computes y = x * sigmoid(x) + + Uses fp32 intermediate for reference accuracy, then casts back to fp16. + """ + return F.silu(x.float()).to(x.dtype) + + +def test_silu(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + silu = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 4, 22, 65] + n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n_cols in n_cols_list: + # Use small range to stay within fp16 exp range (avoid overflow). + x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4) + y = torch.empty(batch, n_cols, device=device, dtype=dtype) + + y_ref = silu_ref(x) + silu(x, y, batch, n_cols) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n_cols, status, detail)) + + print(f"\ndetailed summary for {lib_path}:") + for batch, n_cols, status, detail in results: + msg = f" batch={batch}, n_cols={n_cols}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./silu_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_silu(args.lib, block_dim=args.block_dim) diff --git a/examples/agent/silu/silu.cpp b/examples/agent/silu/silu.cpp new file mode 100644 index 00000000..703395e4 --- /dev/null +++ b/examples/agent/silu/silu.cpp @@ -0,0 +1,91 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) { + unsigned v5 = 1; + unsigned v6 = 0; + int32_t v7 = 16384; + int32_t v8 = 1; + int32_t v9 = 0; + int64_t v10 = 0; + int64_t v11 = 32768; + int64_t v12 = 65536; + int64_t v13 = 98304; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + if (v4 > v9) { + if (v4 <= v7) { + int64_t v14 = get_block_idx(); + int64_t v15 = get_subblockid(); + int64_t v16 = get_subblockdim(); + int64_t v17 = (int64_t) v16; + int64_t v18 = get_block_num(); + int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17); + int32_t v20 = v3 / v19; + int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20; + int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21); + int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21); + int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22); + int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4); + if (v24 > v9) { + Tile v26 = Tile(v4); + TASSIGN(v26, v10); + Tile v27 = Tile(v4); + TASSIGN(v27, v11); + Tile v28 = Tile(v4); + TASSIGN(v28, v12); + Tile v29 = Tile(v4); + TASSIGN(v29, v13); + for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) { + int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4); + unsigned v32 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34); + unsigned v36 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v26, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v29, v26, v26); + pipe_barrier(PIPE_V); + TEXP(v27, v29); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TSUB(v28, v29, v26); + pipe_barrier(PIPE_V); + TEXP(v28, v28); + pipe_barrier(PIPE_V); + TADD(v28, v28, v27); + pipe_barrier(PIPE_V); + TDIV(v28, v27, v28); + pipe_barrier(PIPE_V); + TMUL(v28, v26, v28); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v39, v28); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + }; + }; + } + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + return; +} + diff --git a/examples/agent/silu/silu.pto b/examples/agent/silu/silu.pto new file mode 100644 index 00000000..2dfd1cbf --- /dev/null +++ b/examples/agent/silu/silu.pto @@ -0,0 +1,58 @@ +module { + func.func @_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: i32) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16384 = arith.constant 16384 : index + %0 = arith.index_cast %arg2 : i32 to index + %1 = arith.index_cast %arg3 : i32 to index + pto.section.vector { + %2 = arith.cmpi sgt, %1, %c0 : index + scf.if %2 { + %3 = arith.cmpi sge, %c16384, %1 : index + scf.if %3 { + %4 = pto.get_block_idx + %5 = pto.get_subblock_idx + %6 = pto.get_subblock_num + %7 = pto.get_block_num + %8 = arith.muli %4, %6 : i64 + %9 = arith.addi %8, %5 : i64 + %10 = arith.index_cast %9 : i64 to index + %11 = arith.muli %7, %6 : i64 + %12 = arith.index_cast %11 : i64 to index + %13 = arith.ceildivsi %0, %12 : index + %14 = arith.muli %10, %13 : index + %15 = arith.addi %14, %13 : index + %16 = arith.minui %15, %0 : index + %17 = arith.subi %16, %14 : index + %18 = arith.muli %0, %1 : index + %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view + %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view + %21 = arith.cmpi sgt, %17, %c0 : index + scf.if %21 { + %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + scf.for %arg4 = %c0 to %17 step %c1 { + %26 = arith.addi %14, %arg4 : index + %27 = arith.muli %26, %1 : index + %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf) + pto.tsub ins(%22, %22 : !pto.tile_buf, !pto.tile_buf) outs(%25 : !pto.tile_buf) + pto.texp ins(%25 : !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tsub ins(%25, %22 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.texp ins(%24 : !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%24, %23 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tdiv ins(%23, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tmul ins(%22, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tstore ins(%24 : !pto.tile_buf) outs(%29 : !pto.partition_tensor_view<1x16384xf16>) + } + } + } + } + } + return + } +} + diff --git a/examples/agent/silu/silu_builder.py b/examples/agent/silu/silu_builder.py new file mode 100644 index 00000000..8613588c --- /dev/null +++ b/examples/agent/silu/silu_builder.py @@ -0,0 +1,154 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_silu(fn_name="silu_fp16"): + """ + Build a dynamic-batch SiLU (Swish) kernel in PTO DSL. + + Computes y = x * sigmoid(x), where: + sigmoid(x) = 1 / (1 + exp(-x)) + + Constants (1.0) are derived from the input tile itself using + the identity exp(x - x) = exp(0) = 1.0, which avoids the need for + scalar-tile broadcast operations not available in PTO DSL. + + UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB): + tb_x : input row x + tb_ones : constant 1.0 (recomputed each row via exp(x-x)) + tb_tmp1 : intermediate / final output + tb_tmp2 : intermediate (for zero) + + Kernel args: + x_ptr : fp16[batch * n_cols] -- input + y_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) + num_cores = s.index_cast(num_blocks * sub_bnum) + + # Distribute rows across cores (row-level parallelism). + rows_per_core = s.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + # Allocate 4 UB tiles (128 KB total, well under 192 KB UB). + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_x = pto.slice_view( + subtensor_type, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_y = pto.slice_view( + subtensor_type, + source=tv_y, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_x, tb_x) + + # Derive constants from data: + # x - x = 0 => exp(0) = 1.0 + tile.sub(tb_x, tb_x, tb_tmp2) # tmp2 = 0.0 + tile.exp(tb_tmp2, tb_ones) # ones = 1.0 + + # sigmoid(x) = 1 / (1 + exp(-x)) + tile.sub(tb_tmp2, tb_x, tb_tmp1) # tmp1 = 0 - x = -x + tile.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(-x) + tile.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = 1 + exp(-x) + tile.div(tb_ones, tb_tmp1, tb_tmp1) # tmp1 = sigmoid(x) + + # silu(x) = x * sigmoid(x) + tile.mul(tb_x, tb_tmp1, tb_tmp1) # tmp1 = x * sigmoid(x) + pto.store(tb_tmp1, sv_y) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--fn-name", + default="silu_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_silu(fn_name=args.fn_name)) diff --git a/openenv/README.md b/openenv/README.md index b684a9bf..6f38c2fe 100644 --- a/openenv/README.md +++ b/openenv/README.md @@ -2,6 +2,12 @@ Uses Claude Opus with OpenEnv to iteratively edit, build, test, and benchmark PTO-DSL kernels until a faster-than-baseline implementation is found. +## Installation + +```bash +pip install openenv-core +``` + ## Quick start ```bash diff --git a/openenv/agent_search.py b/openenv/agent_search.py index f2b3403b..9552db2e 100644 --- a/openenv/agent_search.py +++ b/openenv/agent_search.py @@ -41,41 +41,54 @@ def load_config(config_path: str) -> dict: return tomllib.load(f) -def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]: - kernel_dir = Path(cfg["kernel_dir"]) - if not kernel_dir.is_absolute(): - kernel_dir = ROOT / kernel_dir +def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path, Path]: + """Returns (env, work_dir, ref_dir). - # Resolve working directory (copy of originals — never touch kernel_dir itself) + ref_dir is kernel_dir (the reference/skeleton). For create mode work_dir + starts empty; for optimize mode it is a full copy of ref_dir. + """ + ref_dir = Path(cfg["kernel_dir"]) + if not ref_dir.is_absolute(): + ref_dir = ROOT / ref_dir + + # Resolve working directory if "work_dir" in cfg: work_dir = Path(cfg["work_dir"]) if not work_dir.is_absolute(): work_dir = ROOT / work_dir else: - work_dir = ROOT / "examples" / "agent" / kernel_dir.name + default_name = cfg.get("kernel_name", ref_dir.name) + work_dir = ROOT / "examples" / "agent" / default_name if work_dir.exists(): shutil.rmtree(work_dir) - shutil.copytree(kernel_dir, work_dir) - print(f"Working directory: {work_dir} (originals in {kernel_dir} are untouched)") + + if cfg.get("mode") == "create": + # Start with an empty directory — agent creates all files from scratch + work_dir.mkdir(parents=True) + print(f"Working directory: {work_dir} (empty — agent will create all files)") + else: + shutil.copytree(ref_dir, work_dir) + print(f"Working directory: {work_dir} (originals in {ref_dir} are untouched)") baseline_files = { - name: (kernel_dir / name).read_text(encoding="utf-8") - for name in cfg["baseline_files"] + name: (ref_dir / name).read_text(encoding="utf-8") + for name in cfg.get("baseline_files", []) } def resolve_cmd(cmd: list[str]) -> list[str]: - # Replace bare "python" with the current interpreter return [sys.executable if c == "python" else c for c in cmd] + bench_cmd = resolve_cmd(cfg["bench_cmd"]) if "bench_cmd" in cfg else None + env = KernelSearchEnv( repo_path=str(work_dir), build_cmd=resolve_cmd(cfg["build_cmd"]), test_cmd=resolve_cmd(cfg["test_cmd"]), - bench_cmd=resolve_cmd(cfg["bench_cmd"]), + bench_cmd=bench_cmd, baseline_files=baseline_files, ) - return env, work_dir + return env, work_dir, ref_dir # --------------------------------------------------------------------------- @@ -94,8 +107,9 @@ def resolve_cmd(cmd: list[str]) -> list[str]: CFG = load_config(_args.config) EXAMPLE_DIR: Path +REF_DIR: Path env: KernelSearchEnv -env, EXAMPLE_DIR = build_env(CFG) +env, EXAMPLE_DIR, REF_DIR = build_env(CFG) # --------------------------------------------------------------------------- # Tool definitions @@ -150,6 +164,21 @@ def resolve_cmd(cmd: list[str]) -> list[str]: "description": "Measure the kernel latency and compute speedup vs the baseline. Returns latency_ms and speedup_vs_baseline.", "input_schema": {"type": "object", "properties": {}, "required": []}, }, + { + "name": "write_file", + "description": ( + "Write the complete content of a file, creating it if it doesn't exist. " + "Use this to write a new kernel file from scratch." + ), + "input_schema": { + "type": "object", + "properties": { + "path": {"type": "string", "description": "Relative file path"}, + "content": {"type": "string", "description": "Full file content"}, + }, + "required": ["path", "content"], + }, + }, ] # --------------------------------------------------------------------------- @@ -193,6 +222,16 @@ def execute_tool(name: str, tool_input: dict) -> str: f"Best so far: {env.best_ms:.4f} ms ({env.best_speedup:.4f}x)" ) + if name == "write_file": + for key in ("path", "content"): + if key not in tool_input: + return f"Error: write_file requires '{key}' parameter." + obs = env.step(KernelAction("edit", { + "path": tool_input["path"], + "content": tool_input["content"], + })) + return obs.summary + return f"Unknown tool: {name}" # --------------------------------------------------------------------------- @@ -247,6 +286,18 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: if preload_section: preload_section = "\n## Pre-loaded files (do NOT re-read these)\n" + preload_section + # Embed local reference files (repo-relative paths) + local_section = "" + for fpath_str in cfg.get("local_context_files", []): + fpath = ROOT / fpath_str + try: + content = fpath.read_text(encoding="utf-8") + local_section += f"\n### {fpath.name}\n```python\n{content}\n```\n" + except FileNotFoundError: + local_section += f"\n### {fpath_str}\n(file not found)\n" + if local_section: + local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section + # Fetch remote reference docs and embed them url_section = "" for entry in cfg.get("context_urls", []): @@ -267,7 +318,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: The kernel directory contains exactly these files: {file_listing} Do NOT attempt to read any other filenames — they do not exist. - {preload_section}{url_section} + {preload_section}{local_section}{url_section} ## Your workflow 1. Propose one targeted change based on the pre-loaded files above 2. edit_file(...) @@ -288,35 +339,116 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked. """) -SYSTEM = _build_system_prompt(CFG, EXAMPLE_DIR) +def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str: + name = cfg.get("kernel_name", "kernel") + create_file = cfg.get("create_file", "builder.py") + description = cfg.get("create_prompt", "").strip() + + files = sorted(f.name for f in work_dir.iterdir() if f.is_file()) + file_listing = ", ".join(files) if files else "(empty — you must create all files)" + + # Preload_files come from ref_dir (the reference skeleton), not the empty work_dir + preload_section = "" + for fname in cfg.get("preload_files", []): + fpath = ref_dir / fname + try: + content = fpath.read_text(encoding="utf-8") + preload_section += f"\n### {fname} (from reference kernel — adapt for {name})\n```\n{content}\n```\n" + except FileNotFoundError: + preload_section += f"\n### {fname}\n(file not found in reference)\n" + if preload_section: + preload_section = "\n## Reference files (adapt these for the new kernel — do NOT copy verbatim)\n" + preload_section + + # Embed local reference files (repo-relative paths) + local_section = "" + for fpath_str in cfg.get("local_context_files", []): + fpath = ROOT / fpath_str + try: + content = fpath.read_text(encoding="utf-8") + local_section += f"\n### {fpath.name}\n```python\n{content}\n```\n" + except FileNotFoundError: + local_section += f"\n### {fpath_str}\n(file not found)\n" + if local_section: + local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section + + url_section = "" + for entry in cfg.get("context_urls", []): + if isinstance(entry, dict): + url, label = entry["url"], entry.get("label", entry["url"]) + else: + url, label = entry, entry + print(f" Fetching context: {label} …") + content = _fetch_url(url) + url_section += f"\n### {label}\n```\n{content}\n```\n" + if url_section: + url_section = "\n## Reference documentation\n" + url_section + + return _textwrap.dedent(f"""\ + You are an expert NPU kernel engineer specialising in Ascend/PTO-DSL. + + Your task is to implement the {name} kernel entirely from scratch. + The working directory starts with: {file_listing} + You must create ALL required files yourself using write_file. + + {preload_section}{local_section}{url_section} + ## Kernel to implement + {description} + + ## Files you must create + - `{create_file}` — PTO-DSL kernel builder (the main implementation) + - `compile.sh` — build script (adapt from reference above; use {name}-specific names) + - `caller.cpp` — C++ entry point (adapt signature to match the {name} kernel exactly) + - `run_{name}.py` — correctness test script (adapt from reference; use `{name}_lib.so`) + + ## Your workflow + 1. Write all required files using write_file(path="...", content="...") + 2. build() — check for compile errors; fix with write_file/edit_file as needed + 3. run_tests() — verify correctness; fix failures and rebuild + 4. Once tests pass, call benchmark() to measure latency + + ## Rules + - Study the reference files above carefully to understand the PTO-DSL API and build system. + - The compile.sh and caller.cpp you create must be consistent with each other and with `{create_file}`. + - If build or tests fail, fix the issue — do not give up. + - Stop once tests pass and you have a benchmark result. + """) + + +SYSTEM = ( + _build_create_system_prompt(CFG, EXAMPLE_DIR, REF_DIR) + if CFG.get("mode") == "create" + else _build_system_prompt(CFG, EXAMPLE_DIR) +) # --------------------------------------------------------------------------- # Agentic loop # --------------------------------------------------------------------------- def run_agent(max_turns: int = 30) -> None: - # Establish baseline - print("Establishing baseline …") - obs = env.reset() - print(f"Baseline: {obs.latency_ms:.4f} ms\n") - + mode = CFG.get("mode", "optimize") client = anthropic.Anthropic() - preloaded = CFG.get("preload_files", []) - if preloaded: - start_hint = f"The source files are already in your context above — start proposing a change directly." + + if mode == "create": + name = CFG.get("kernel_name", "kernel") + create_file = CFG.get("create_file", "builder.py") + print(f"Create mode — implementing {name} kernel from scratch …\n") + messages: list[dict] = [{"role": "user", "content": + f"Please implement the {name} kernel from scratch. " + f"Create all required files (compile.sh, caller.cpp, {create_file}, run_{name}.py) " + "using write_file. The reference files and PTO-DSL docs are in your system context above."}] else: - main_file = CFG.get("main_file", "builder.py") - start_hint = f"Start by reading {main_file}." - - messages: list[dict] = [ - { - "role": "user", - "content": ( - f"The baseline kernel latency is {obs.latency_ms:.4f} ms. " - f"Please find a faster implementation. {start_hint}" - ), - } - ] + print("Establishing baseline …") + obs = env.reset() + print(f"Baseline: {obs.latency_ms:.4f} ms\n") + preloaded = CFG.get("preload_files", []) + start_hint = ( + "The source files are already in your context above — start proposing a change directly." + if preloaded else f"Start by reading {CFG.get('main_file', 'builder.py')}." + ) + messages = [{"role": "user", "content": + f"The baseline kernel latency is {obs.latency_ms:.4f} ms. " + f"Please find a faster implementation. {start_hint}"}] + tests_passed = False turn = 0 while turn < max_turns: turn += 1 @@ -376,22 +508,34 @@ def run_agent(max_turns: int = 30) -> None: "tool_use_id": block.id, "content": result, }) + # Track test results for create-mode exit condition + if block.name == "run_tests" and result.startswith("PASS"): + tests_passed = True if tool_results: messages.append({"role": "user", "content": tool_results}) - # Early exit if we already have a confirmed speedup - if env.best_speedup is not None and env.best_speedup > 1.0: + if mode == "create" and tests_passed: + print("\n[Agent] Tests passed — kernel implemented successfully.") + break + + if mode == "optimize" and env.best_speedup is not None and env.best_speedup > 1.0: print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.") break # Final summary print(f"\n{'='*60}") - print("SEARCH COMPLETE") - print(f"{'='*60}") - print(f" Baseline: {env.baseline_ms:.4f} ms") - print(f" Best found: {env.best_ms:.4f} ms") - print(f" Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else " No improvement found.") + if mode == "create": + print("CREATE COMPLETE") + print(f"{'='*60}") + print(f" Result: {'tests passed' if tests_passed else 'did not reach passing tests'}") + print(f" Output: {EXAMPLE_DIR}") + else: + print("SEARCH COMPLETE") + print(f"{'='*60}") + print(f" Baseline: {env.baseline_ms:.4f} ms") + print(f" Best found: {env.best_ms:.4f} ms") + print(f" Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else " No improvement found.") def _fmt_input(inp: dict) -> str: diff --git a/openenv/geglu_config.toml b/openenv/geglu_config.toml index 125ce6cb..686c6db6 100644 --- a/openenv/geglu_config.toml +++ b/openenv/geglu_config.toml @@ -28,6 +28,15 @@ readonly_files = ["run_geglu.py", "caller.cpp"] preload_files = ["geglu_builder.py", "compile.sh", "caller.cpp"] +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + [[context_urls]] url = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td" label = "PTO MLIR op definitions (PTOOps.td)" diff --git a/openenv/hadamard_config.toml b/openenv/hadamard_config.toml index d950dd3f..8675e91a 100644 --- a/openenv/hadamard_config.toml +++ b/openenv/hadamard_config.toml @@ -45,6 +45,15 @@ preload_files = ["hadamard_builder.py", "compile.sh", "caller.cpp"] # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/docs/isa # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/include/pto/npu/a2a3 # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/blob/master/tests/npu/a2a3/src/st/testcase/ +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + [[context_urls]] url = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td" label = "PTO MLIR op definitions (PTOOps.td)" diff --git a/openenv/hardswish_config.toml b/openenv/hardswish_config.toml new file mode 100644 index 00000000..c99dfeb6 --- /dev/null +++ b/openenv/hardswish_config.toml @@ -0,0 +1,48 @@ +mode = "create" + +kernel_dir = "examples/aot/geglu_dynamic_multicore" # reference for API patterns only +kernel_name = "hardswish" +create_file = "hardswish_builder.py" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_hardswish.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +create_prompt = """ +Implement Hard-Swish activation (used in MobileNetV3): + y = x * relu6(x + 3) / 6 + = x * clamp(x + 3, 0, 6) / 6 + +PyTorch reference: torch.nn.functional.hardswish(x) + +Input: fp16 tensor (batch, n_cols) +Output: fp16 tensor (batch, n_cols) — separate output pointer + +The kernel signature in caller.cpp should be: + extern "C" void call_kernel(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *y, + uint32_t batch, uint32_t n_cols) + +The .so produced by compile.sh should be named hardswish_lib.so. +The test script run_hardswish.py should compare against torch.nn.functional.hardswish(x.float()).to(x.dtype). +The bench wrapper _bench_wrapper.py should print latency_ms=. + +Implementation hint: clamp(x + 3, 0, 6) can be computed as min(max(x + 3, 0), 6). +The division by 6 can be done as multiplication by 1/6 ≈ 0.16667. +""" + +# These are loaded from the reference kernel_dir as examples — adapt them for hardswish +preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"] + +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings" diff --git a/openenv/kernel_opt_env.py b/openenv/kernel_opt_env.py index 503b99ac..a21a87fe 100644 --- a/openenv/kernel_opt_env.py +++ b/openenv/kernel_opt_env.py @@ -38,7 +38,7 @@ def __init__( self, repo_path: str, test_cmd: list[str], - bench_cmd: list[str], + bench_cmd: list[str] | None = None, build_cmd: list[str] | None = None, baseline_files: dict[str, str] | None = None, ): diff --git a/openenv/mish_config.toml b/openenv/mish_config.toml new file mode 100644 index 00000000..2f5fe8e6 --- /dev/null +++ b/openenv/mish_config.toml @@ -0,0 +1,46 @@ +mode = "create" + +kernel_dir = "examples/aot/geglu_dynamic_multicore" # reference for API patterns only +kernel_name = "mish" +create_file = "mish_builder.py" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_mish.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +create_prompt = """ +Implement Mish activation: y = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) +PyTorch reference: torch.nn.functional.mish(x) + +Input: fp16 tensor (batch, n_cols) +Output: fp16 tensor (batch, n_cols) — separate output pointer + +The kernel signature in caller.cpp should be: + extern "C" void call_kernel(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *y, + uint32_t batch, uint32_t n_cols) + +The .so produced by compile.sh should be named mish_lib.so. +The test script run_mish.py should compare against torch.nn.functional.mish(x.float()).to(x.dtype). +The bench wrapper _bench_wrapper.py should print latency_ms=. + +Implementation hint: compute softplus(x) = ln(1 + exp(x)) first, then tanh of that, then multiply by x. +For numerical stability at large x, softplus(x) ≈ x (exp overflows), so clamp or use the identity +softplus(x) = x + ln(1 + exp(-x)) for large positive x. +""" + +# These are loaded from the reference kernel_dir as examples — adapt them for mish +preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"] + +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings" diff --git a/openenv/silu_config.toml b/openenv/silu_config.toml new file mode 100644 index 00000000..ddd69841 --- /dev/null +++ b/openenv/silu_config.toml @@ -0,0 +1,42 @@ +mode = "create" + +kernel_dir = "examples/aot/geglu_dynamic_multicore" # reference for API patterns only +kernel_name = "silu" +create_file = "silu_builder.py" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_silu.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +create_prompt = """ +Implement SiLU (Swish) activation: y = x * sigmoid(x) +PyTorch reference: torch.nn.functional.silu(x) + +Input: fp16 tensor (batch, n_cols) +Output: fp16 tensor (batch, n_cols) — separate output pointer + +The kernel signature in caller.cpp should be: + extern "C" void call_kernel(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *y, + uint32_t batch, uint32_t n_cols) + +The .so produced by compile.sh should be named silu_lib.so. +The test script run_silu.py should compare against torch.nn.functional.silu(x.float()).to(x.dtype). +The bench wrapper _bench_wrapper.py should print latency_ms=. +""" + +# These are loaded from the reference kernel_dir as examples — adapt them for silu +preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"] + +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings" From 81e5669f7e1833dd00f4671474e7f18146e6e608 Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Mon, 9 Mar 2026 16:00:35 +0000 Subject: [PATCH 4/6] added agent egnerated hardswish example --- examples/agent/hardswish/_bench_wrapper.py | 68 +++++++ examples/agent/hardswish/caller.cpp | 24 +++ examples/agent/hardswish/compile.sh | 24 +++ examples/agent/hardswish/hardswish.cpp | 102 +++++++++++ examples/agent/hardswish/hardswish.pto | 63 +++++++ examples/agent/hardswish/hardswish_builder.py | 168 ++++++++++++++++++ examples/agent/hardswish/run_hardswish.py | 107 +++++++++++ openenv/agent_search.py | 74 +++++++- 8 files changed, 627 insertions(+), 3 deletions(-) create mode 100644 examples/agent/hardswish/_bench_wrapper.py create mode 100644 examples/agent/hardswish/caller.cpp create mode 100755 examples/agent/hardswish/compile.sh create mode 100644 examples/agent/hardswish/hardswish.cpp create mode 100644 examples/agent/hardswish/hardswish.pto create mode 100644 examples/agent/hardswish/hardswish_builder.py create mode 100644 examples/agent/hardswish/run_hardswish.py diff --git a/examples/agent/hardswish/_bench_wrapper.py b/examples/agent/hardswish/_bench_wrapper.py new file mode 100644 index 00000000..0199edb2 --- /dev/null +++ b/examples/agent/hardswish/_bench_wrapper.py @@ -0,0 +1,68 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads hardswish_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./hardswish_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(x, y): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + torch_to_ctypes(y), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +# Separate tensors per iteration to reduce cache reuse +xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-8, 8) for _ in range(WARMUP + ITERS)] +y = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(xs[i], y) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs[WARMUP + i], y) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/agent/hardswish/caller.cpp b/examples/agent/hardswish/caller.cpp new file mode 100644 index 00000000..4d9c6220 --- /dev/null +++ b/examples/agent/hardswish/caller.cpp @@ -0,0 +1,24 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "hardswish.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/agent/hardswish/compile.sh b/examples/agent/hardswish/compile.sh new file mode 100755 index 00000000..3c33ccaf --- /dev/null +++ b/examples/agent/hardswish/compile.sh @@ -0,0 +1,24 @@ +set -e + +rm -f hardswish.pto hardswish.cpp hardswish_lib.so + +python ./hardswish_builder.py > ./hardswish.pto +ptoas --enable-insert-sync ./hardswish.pto -o ./hardswish.cpp + +PTO_LIB_PATH=/sources/pto-isa +bisheng \ + -I${PTO_LIB_PATH}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"hardswish.cpp\"" \ + ./caller.cpp \ + -o ./hardswish_lib.so + diff --git a/examples/agent/hardswish/hardswish.cpp b/examples/agent/hardswish/hardswish.cpp new file mode 100644 index 00000000..d989a1b9 --- /dev/null +++ b/examples/agent/hardswish/hardswish.cpp @@ -0,0 +1,102 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) { + unsigned v5 = 1; + unsigned v6 = 0; + int32_t v7 = 16384; + int32_t v8 = 1; + int32_t v9 = 0; + int64_t v10 = 0; + int64_t v11 = 32768; + int64_t v12 = 65536; + int64_t v13 = 98304; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + if (v4 > v9) { + if (v4 <= v7) { + int64_t v14 = get_block_idx(); + int64_t v15 = get_subblockid(); + int64_t v16 = get_subblockdim(); + int64_t v17 = (int64_t) v16; + int64_t v18 = get_block_num(); + int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17); + int32_t v20 = v3 / v19; + int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20; + int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21); + int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21); + int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22); + int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4); + if (v24 > v9) { + Tile v26 = Tile(v4); + TASSIGN(v26, v10); + Tile v27 = Tile(v4); + TASSIGN(v27, v11); + Tile v28 = Tile(v4); + TASSIGN(v28, v12); + Tile v29 = Tile(v4); + TASSIGN(v29, v13); + for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) { + int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4); + unsigned v32 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34); + unsigned v36 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v26, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TSUB(v27, v26, v26); + pipe_barrier(PIPE_V); + TEXP(v27, v27); + pipe_barrier(PIPE_V); + TADD(v28, v27, v27); + pipe_barrier(PIPE_V); + TADD(v28, v28, v27); + pipe_barrier(PIPE_V); + TADD(v29, v28, v28); + pipe_barrier(PIPE_V); + TADD(v28, v26, v28); + pipe_barrier(PIPE_V); + TRELU(v28, v28); + pipe_barrier(PIPE_V); + TSUB(v27, v28, v29); + pipe_barrier(PIPE_V); + TRELU(v27, v27); + pipe_barrier(PIPE_V); + TSUB(v27, v28, v27); + pipe_barrier(PIPE_V); + TMUL(v27, v26, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_V); + TDIV(v27, v27, v29); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v39, v27); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + }; + }; + } + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + return; +} + diff --git a/examples/agent/hardswish/hardswish.pto b/examples/agent/hardswish/hardswish.pto new file mode 100644 index 00000000..2c60a7f2 --- /dev/null +++ b/examples/agent/hardswish/hardswish.pto @@ -0,0 +1,63 @@ +module { + func.func @_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: i32) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16384 = arith.constant 16384 : index + %0 = arith.index_cast %arg2 : i32 to index + %1 = arith.index_cast %arg3 : i32 to index + pto.section.vector { + %2 = arith.cmpi sgt, %1, %c0 : index + scf.if %2 { + %3 = arith.cmpi sge, %c16384, %1 : index + scf.if %3 { + %4 = pto.get_block_idx + %5 = pto.get_subblock_idx + %6 = pto.get_subblock_num + %7 = pto.get_block_num + %8 = arith.muli %4, %6 : i64 + %9 = arith.addi %8, %5 : i64 + %10 = arith.index_cast %9 : i64 to index + %11 = arith.muli %7, %6 : i64 + %12 = arith.index_cast %11 : i64 to index + %13 = arith.ceildivsi %0, %12 : index + %14 = arith.muli %10, %13 : index + %15 = arith.addi %14, %13 : index + %16 = arith.minui %15, %0 : index + %17 = arith.subi %16, %14 : index + %18 = arith.muli %0, %1 : index + %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view + %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view + %21 = arith.cmpi sgt, %17, %c0 : index + scf.if %21 { + %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + scf.for %arg4 = %c0 to %17 step %c1 { + %26 = arith.addi %14, %arg4 : index + %27 = arith.muli %26, %1 : index + %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf) + pto.tsub ins(%22, %22 : !pto.tile_buf, !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.texp ins(%23 : !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tadd ins(%23, %23 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%24, %23 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%24, %24 : !pto.tile_buf, !pto.tile_buf) outs(%25 : !pto.tile_buf) + pto.tadd ins(%22, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.trelu ins(%24 : !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tsub ins(%24, %25 : !pto.tile_buf, !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.trelu ins(%23 : !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tsub ins(%24, %23 : !pto.tile_buf, !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tmul ins(%22, %23 : !pto.tile_buf, !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tdiv ins(%23, %25 : !pto.tile_buf, !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tstore ins(%23 : !pto.tile_buf) outs(%29 : !pto.partition_tensor_view<1x16384xf16>) + } + } + } + } + } + return + } +} + diff --git a/examples/agent/hardswish/hardswish_builder.py b/examples/agent/hardswish/hardswish_builder.py new file mode 100644 index 00000000..248581bd --- /dev/null +++ b/examples/agent/hardswish/hardswish_builder.py @@ -0,0 +1,168 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_hardswish(fn_name="hardswish_fp16"): + """ + Build a dynamic-batch Hard-Swish kernel in PTO DSL. + + Computes y = x * clamp(x + 3, 0, 6) / 6, where: + clamp(v, 0, 6) = relu(v) - relu(v - 6) + + Constants (1.0, 3.0, 6.0) are derived from the input tile itself using + the identity exp(a - a) = exp(0) = 1.0, which avoids the need for + scalar-tile broadcast operations not available in PTO DSL. + + UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB): + tb_x : input row x + tb_t1 : intermediate + tb_t2 : intermediate + tb_t3 : holds 6.0, then intermediate + + Kernel args: + x_ptr : fp16[batch * n_cols] -- input + y_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + # Guard: n_cols must be in (0, ELEMENTS_PER_TILE]. + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) + num_cores = s.index_cast(num_blocks * sub_bnum) + + # Distribute rows across cores (row-level parallelism). + rows_per_core = s.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + # Allocate 4 UB tiles (128 KB total, well under 192 KB UB). + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_t1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_t2 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_t3 = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_x = pto.slice_view( + subtensor_type, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_y = pto.slice_view( + subtensor_type, + source=tv_y, + offsets=[gm_offset], + sizes=[n_cols], + ) + + # Load input + pto.load(sv_x, tb_x) + + # Derive constants from data: + # x - x = 0 => exp(0) = 1.0 + tile.sub(tb_x, tb_x, tb_t1) # tb_t1 = 0.0 + tile.exp(tb_t1, tb_t1) # tb_t1 = 1.0 (ones) + + # Build 3.0: + tile.add(tb_t1, tb_t1, tb_t2) # tb_t2 = 2.0 + tile.add(tb_t2, tb_t1, tb_t2) # tb_t2 = 3.0 + + # Build 6.0: + tile.add(tb_t2, tb_t2, tb_t3) # tb_t3 = 6.0 + + # Compute x + 3: + tile.add(tb_x, tb_t2, tb_t2) # tb_t2 = x + 3 + + # clamp(x+3, 0, 6) = relu(x+3) - relu(relu(x+3) - 6) + tile.relu(tb_t2, tb_t2) # tb_t2 = relu(x + 3) = max(x+3, 0) + tile.sub(tb_t2, tb_t3, tb_t1) # tb_t1 = max(x+3, 0) - 6 + tile.relu(tb_t1, tb_t1) # tb_t1 = relu(max(x+3,0) - 6) + tile.sub(tb_t2, tb_t1, tb_t1) # tb_t1 = clamp(x+3, 0, 6) + + # y = x * clamp(x+3, 0, 6) / 6 + tile.mul(tb_x, tb_t1, tb_t1) # tb_t1 = x * clamp(x+3, 0, 6) + tile.div(tb_t1, tb_t3, tb_t1) # tb_t1 = y + + pto.store(tb_t1, sv_y) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--fn-name", + default="hardswish_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_hardswish(fn_name=args.fn_name)) + diff --git a/examples/agent/hardswish/run_hardswish.py b/examples/agent/hardswish/run_hardswish.py new file mode 100644 index 00000000..34dee006 --- /dev/null +++ b/examples/agent/hardswish/run_hardswish.py @@ -0,0 +1,107 @@ +import argparse +import ctypes + +import torch +import torch.nn.functional as F +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def hardswish_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(y), + batch, + n_cols, + ) + + return hardswish_func + + +def hardswish_ref(x): + """Reference Hard-Swish: y = x * clamp(x + 3, 0, 6) / 6""" + return F.hardswish(x.float()).to(x.dtype) + + +def test_hardswish(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + hardswish = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 4, 22, 65] + n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n_cols in n_cols_list: + x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-8, 8) + y = torch.empty(batch, n_cols, device=device, dtype=dtype) + + y_ref = hardswish_ref(x) + hardswish(x, y, batch, n_cols) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n_cols, status, detail)) + + print(f"\ndetailed summary for {lib_path}:") + for batch, n_cols, status, detail in results: + msg = f" batch={batch}, n_cols={n_cols}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./hardswish_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_hardswish(args.lib, block_dim=args.block_dim) diff --git a/openenv/agent_search.py b/openenv/agent_search.py index 9552db2e..e5e9591d 100644 --- a/openenv/agent_search.py +++ b/openenv/agent_search.py @@ -237,6 +237,73 @@ def execute_tool(name: str, tool_input: dict) -> str: # --------------------------------------------------------------------------- # System prompt (built from config) # --------------------------------------------------------------------------- + +_API_CHEATSHEET = """\ +## PTO-DSL API cheatsheet — use these, do NOT reimplement them + +```python +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +# --- Kernel entry point --- +@to_ir_module +def _kernel(x: pto.PtrType, y: pto.PtrType, batch: pto.int32, n_cols: pto.int32): + ... + +# --- Tile ops (all operate on allocated tile buffers, no return value) --- +tile.mov(src, dst) # copy tile +tile.add(lhs, rhs, out) # element-wise add +tile.sub(lhs, rhs, out) # element-wise subtract +tile.mul(lhs, rhs, out) # element-wise multiply +tile.div(lhs, rhs, out) # element-wise divide +tile.exp(inp, out) # element-wise e^x +tile.log(inp, out) # element-wise ln(x) +tile.relu(inp, out) # element-wise max(x, 0) +tile.abs(inp, out) # element-wise |x| +tile.sqrt(inp, out) # element-wise sqrt(x) +tile.rsqrt(inp, out) # element-wise 1/sqrt(x) +tile.reciprocal(inp, out) # element-wise 1/x +tile.gather(src, out, indices) # gather rows + +# NOT available as a single op — implement from primitives: +# tanh(x) = 1 - 2/(exp(2x)+1) [use tile.exp, tile.add, tile.div, tile.sub] +# sigmoid(x) = 1/(1+exp(-x)) [negate x, tile.exp, then tile.reciprocal+add] +# clamp(x,0,6) = relu(x) - relu(x-6) +# min(a,b) = 0.5*(a+b) - 0.5*abs(a-b) +# max(a,b) = relu(a-b) + b + +# --- Memory & layout --- +pto.get_block_idx() # current core index (scalar Value) +pto.get_block_num() # total number of cores (scalar Value) +pto.as_tensor(type, ptr=, shape=, strides=) # wrap pointer as tensor view +pto.slice_view(type, source=, offsets=, sizes=) # sub-view of a tensor +pto.alloc_tile(tile_type) # allocate UB tile buffer +pto.load(source, dest) # DMA: tensor view → tile buffer +pto.store(source, dest) # DMA: tile buffer → tensor view + +# --- Sections (context managers) --- +with pto.vector_section(): # all tile ops must be inside here + ... + +# --- Scalar arithmetic (s.xxx returns Value) --- +s.const(value) # integer constant +s.ceil_div(a, b) # ceil(a/b) +s.index_cast(v, type) # type cast +# Scalar Value supports: +, -, *, //, %, <, >, ==, != + +# --- Control flow --- +with pto.range(start, stop, step) as i: # for loop + ... +with pto.if_context(cond): # if branch + ... + +# --- Synchronization (when NOT using --enable-insert-sync) --- +pto.record_event(record_op, wait_op, event_id) +pto.wait_event(record_op, wait_op, event_id) +pto.record_wait_pair(record_op, wait_op, event_id) +pto.barrier(sync_op) +``` +""" import textwrap as _textwrap @@ -296,7 +363,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: except FileNotFoundError: local_section += f"\n### {fpath_str}\n(file not found)\n" if local_section: - local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section + local_section = "\n## PTO-DSL API source files (these are the exact functions you import and call — do NOT reimplement them)\n" + local_section # Fetch remote reference docs and embed them url_section = "" @@ -318,6 +385,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: The kernel directory contains exactly these files: {file_listing} Do NOT attempt to read any other filenames — they do not exist. + {_API_CHEATSHEET} {preload_section}{local_section}{url_section} ## Your workflow 1. Propose one targeted change based on the pre-loaded files above @@ -369,7 +437,7 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str except FileNotFoundError: local_section += f"\n### {fpath_str}\n(file not found)\n" if local_section: - local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section + local_section = "\n## PTO-DSL API source files (these are the exact functions you import and call — do NOT reimplement them)\n" + local_section url_section = "" for entry in cfg.get("context_urls", []): @@ -389,7 +457,7 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str Your task is to implement the {name} kernel entirely from scratch. The working directory starts with: {file_listing} You must create ALL required files yourself using write_file. - + {_API_CHEATSHEET} {preload_section}{local_section}{url_section} ## Kernel to implement {description} From ab494e74bf49eb3884abdcdd6524d06c08ff2eed Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Tue, 10 Mar 2026 09:28:53 +0000 Subject: [PATCH 5/6] agent fast hadamard implementation beats baseline --- examples/agent/fast_hadamard/.gitignore | 9 + examples/agent/fast_hadamard/README.md | 8 + .../agent/fast_hadamard/_bench_wrapper.py | 69 +++++ examples/agent/fast_hadamard/caller.cpp | 28 ++ examples/agent/fast_hadamard/compile.sh | 46 ++++ .../agent/fast_hadamard/hadamard_builder.py | 249 ++++++++++++++++++ examples/agent/fast_hadamard/plot_perf.py | 72 +++++ examples/agent/fast_hadamard/run_hadamard.py | 208 +++++++++++++++ examples/aot/fast_hadamard/_bench_wrapper.py | 69 +++++ 9 files changed, 758 insertions(+) create mode 100644 examples/agent/fast_hadamard/.gitignore create mode 100644 examples/agent/fast_hadamard/README.md create mode 100644 examples/agent/fast_hadamard/_bench_wrapper.py create mode 100644 examples/agent/fast_hadamard/caller.cpp create mode 100644 examples/agent/fast_hadamard/compile.sh create mode 100644 examples/agent/fast_hadamard/hadamard_builder.py create mode 100644 examples/agent/fast_hadamard/plot_perf.py create mode 100644 examples/agent/fast_hadamard/run_hadamard.py create mode 100644 examples/aot/fast_hadamard/_bench_wrapper.py diff --git a/examples/agent/fast_hadamard/.gitignore b/examples/agent/fast_hadamard/.gitignore new file mode 100644 index 00000000..663e5a84 --- /dev/null +++ b/examples/agent/fast_hadamard/.gitignore @@ -0,0 +1,9 @@ +hadamard_no_sync.pto +hadamard_manual_sync.pto +hadamard_auto_sync.cpp +hadamard_manual_sync.cpp +hadamard_auto_sync.pto +hadamard_auto_sync_lib.so +hadamard_manual_sync_lib.so + +perf_data* diff --git a/examples/agent/fast_hadamard/README.md b/examples/agent/fast_hadamard/README.md new file mode 100644 index 00000000..6b19cee9 --- /dev/null +++ b/examples/agent/fast_hadamard/README.md @@ -0,0 +1,8 @@ +Usage: + +```bash +bash ./compile.sh # generate PTO/CPP and build both auto/manual sync libs +python ./run_hadamard.py # test auto-sync lib (default) +python ./run_hadamard.py --manual-sync # test manual-sync lib +python ./plot_perf.py # optionally visualization +``` diff --git a/examples/agent/fast_hadamard/_bench_wrapper.py b/examples/agent/fast_hadamard/_bench_wrapper.py new file mode 100644 index 00000000..2a8ad08d --- /dev/null +++ b/examples/agent/fast_hadamard/_bench_wrapper.py @@ -0,0 +1,69 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads hadamard_auto_sync_lib.so and prints: latency_ms= +""" +import ctypes +import math + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape — change to target a different operating point +BATCH = 32 +N = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + +LOG2_N = int(math.log2(N)) + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./hadamard_auto_sync_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x (in-place) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n + ctypes.c_uint32, # log2_n +] +lib.call_kernel.restype = None + + +def run(x): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + BATCH, + N, + LOG2_N, + ) + + +# Allocate separate tensors to avoid cache reuse +xs = [torch.randn(BATCH, N, device=device, dtype=torch.float16) for _ in range(WARMUP + ITERS)] + +for i in range(WARMUP): + run(xs[i]) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs[WARMUP + i]) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/agent/fast_hadamard/caller.cpp b/examples/agent/fast_hadamard/caller.cpp new file mode 100644 index 00000000..1ddaff6a --- /dev/null +++ b/examples/agent/fast_hadamard/caller.cpp @@ -0,0 +1,28 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "hadamard_auto_sync.cpp" +#endif +#include KERNEL_CPP + +#ifndef KERNEL_FN +#define KERNEL_FN fast_hadamard_autosync +#endif + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint32_t batch, + uint32_t n, + uint32_t log2_n) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + KERNEL_FN<<>>( + reinterpret_cast(x), + static_cast(batch), + static_cast(n), + static_cast(log2_n)); +} diff --git a/examples/agent/fast_hadamard/compile.sh b/examples/agent/fast_hadamard/compile.sh new file mode 100644 index 00000000..a95f6148 --- /dev/null +++ b/examples/agent/fast_hadamard/compile.sh @@ -0,0 +1,46 @@ +set -e + +rm -f \ + hadamard_auto_sync.pto hadamard_manual_sync.pto \ + hadamard_auto_sync.cpp hadamard_manual_sync.cpp \ + hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so + +# Auto-sync path: rely on ptoas synchronization insertion. +python ./hadamard_builder.py > ./hadamard_auto_sync.pto +ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp + +# Manual-sync path: explicit record/wait events from builder. +python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto +ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + ./caller.cpp \ + -o ./hadamard_auto_sync_lib.so + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \ + -DKERNEL_FN=fast_hadamard_manualsync \ + ./caller.cpp \ + -o ./hadamard_manual_sync_lib.so diff --git a/examples/agent/fast_hadamard/hadamard_builder.py b/examples/agent/fast_hadamard/hadamard_builder.py new file mode 100644 index 00000000..b98a390c --- /dev/null +++ b/examples/agent/fast_hadamard/hadamard_builder.py @@ -0,0 +1,249 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +ELEMENTS_PER_TILE = 32 * 1024 // 2 # 32KB UB / sizeof(fp16) +HALF_ELEMENTS_PER_TILE = ELEMENTS_PER_TILE // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_full = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + subtensor_half = pto.SubTensorType(shape=[1, HALF_ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_full = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + tile_half = pto.TileBufType( + shape=[1, HALF_ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_full": subtensor_full, + "subtensor_half": subtensor_half, + "tile_full": tile_full, + "tile_half": tile_half, + } + + +@to_ir_module(meta_data=meta_data) +def fast_hadamard_autosync( + x_ptr: "ptr_type", + batch_i32: "index_dtype", + n_i32: "index_dtype", + log2_n_i32: "index_dtype", +) -> None: + c0 = const(0) + c1 = const(1) + c2 = const(2) + + batch = s.index_cast(batch_i32) + n = s.index_cast(n_i32) + log2_n = s.index_cast(log2_n_i32) + + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores + + with pto.vector_section(): + samples_per_core = s.ceil_div(batch, num_cores) + sample_offset = vid * samples_per_core + + with pto.if_context(sample_offset < batch): + samples_end = sample_offset + samples_per_core + samples_to_process = s.select( + samples_end > batch, + batch - sample_offset, + samples_per_core, + ) + + with pto.if_context(samples_to_process > c0): + total_elements = batch * n + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + ) + + # Single buffer set — no ping-pong overhead + tb_row = pto.alloc_tile(tile_full, valid_col=n) + tb_even = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd = pto.alloc_tile(tile_half, valid_col=n // c2) + + n_half = n // c2 + + # Alias row halves inside UB row tile + tb_first = tile.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]) + tb_second = tile.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]) + + # Direct loop over samples — no chunk abstraction overhead + for si in pto.range(c0, samples_to_process, c1): + row_offset = (sample_offset + si) * n + sv_row = pto.slice_view( + subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] + ) + + pto.load(sv_row, tb_row) + for _ in pto.range(c0, log2_n, c1): + tile.gather(tb_row, tb_even, mask_pattern="P0101") + tile.gather(tb_row, tb_odd, mask_pattern="P1010") + tile.add(tb_even, tb_odd, tb_first) + tile.sub(tb_even, tb_odd, tb_second) + pto.store(tb_row, sv_row) + + +@to_ir_module(meta_data=meta_data) +def fast_hadamard_manualsync( + x_ptr: "ptr_type", + batch_i32: "index_dtype", + n_i32: "index_dtype", + log2_n_i32: "index_dtype", +) -> None: + c0 = const(0) + c1 = const(1) + c2 = const(2) + + batch = s.index_cast(batch_i32) + n = s.index_cast(n_i32) + log2_n = s.index_cast(log2_n_i32) + + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) # vector core index + num_cores = s.index_cast(num_blocks * sub_bnum) # number of vector cores + + with pto.vector_section(): + samples_per_core = s.ceil_div(batch, num_cores) + sample_offset = vid * samples_per_core + + with pto.if_context(sample_offset < batch): + samples_end = sample_offset + samples_per_core + samples_to_process = s.select( + samples_end > batch, + batch - sample_offset, + samples_per_core, + ) + + with pto.if_context(samples_to_process > c0): + total_elements = batch * n + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1] + ) + + # Two independent tile sets (ping/pong) so event_id 0/1 map to + # disjoint UB buffers, matching the manual C++ reference. + tb_row_0 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2) + + tb_row_1 = pto.alloc_tile(tile_full, valid_col=n) + tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2) + + n_half = n // c2 + + # Keep one sample per chunk. Multi-sample chunks interact + # poorly with static tile subset sizing in current PTO Python + # bindings and can corrupt rows for larger batches. + samples_per_load = c1 + num_chunks = s.ceil_div(samples_to_process, samples_per_load) + + def process_rows( + tb_row, tb_even, tb_odd, event_id, gm_offset, cur_samples + ): + for s in pto.range(c0, cur_samples, c1): + row_offset = gm_offset + s * n + sv_row = pto.slice_view( + subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n] + ) + # Alias row halves inside UB row tile (no GM round-trip + # per Hadamard iteration). + tb_first = tile.subset( + tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE] + ) + tb_second = tile.subset( + tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE] + ) + + pto.wait_event("VEC", "LOAD", event_id=event_id) + pto.wait_event("STORE_VEC", "VEC", event_id=event_id) + pto.load(sv_row, tb_row) + pto.record_wait_pair("LOAD", "VEC", event_id=event_id) + + for _ in pto.range(c0, log2_n, c1): + tile.gather(tb_row, tb_even, mask_pattern="P0101") + tile.gather(tb_row, tb_odd, mask_pattern="P1010") + pto.barrier("VEC") + tile.add(tb_even, tb_odd, tb_first) + tile.sub(tb_even, tb_odd, tb_second) + pto.barrier("VEC") + + pto.record_wait_pair( + "VEC", "STORE_VEC", event_id=event_id + ) + pto.store(tb_row, sv_row) + pto.record_event("STORE_VEC", "VEC", event_id=event_id) + pto.record_event("VEC", "LOAD", event_id=event_id) + + for event_id in (0, 1): + pto.record_event("VEC", "LOAD", event_id=event_id) + pto.record_event("STORE_VEC", "VEC", event_id=event_id) + + for chunk_i in pto.range(c0, num_chunks, c1): + sample_done = chunk_i * samples_per_load + chunk_left = samples_to_process - sample_done + cur_samples = s.select( + chunk_left < samples_per_load, chunk_left, samples_per_load + ) + + with pto.if_context(cur_samples > c0): + gm_offset = (sample_offset + sample_done) * n + use_ev0 = (chunk_i % c2) == c0 + + with pto.if_context(use_ev0, has_else=True) as branch: + process_rows(tb_row_0, tb_even_0, tb_odd_0, 0, gm_offset, cur_samples) + with branch.else_context(): + process_rows(tb_row_1, tb_even_1, tb_odd_1, 1, gm_offset, cur_samples) + + for event_id in (0, 1): + pto.wait_event("VEC", "LOAD", event_id=event_id) + pto.wait_event("STORE_VEC", "VEC", event_id=event_id) + + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Emit explicit record/wait events instead of relying on --enable-insert-sync.", + ) + args = parser.parse_args() + if args.manual_sync: + module = fast_hadamard_manualsync + else: + module = fast_hadamard_autosync + print(module) diff --git a/examples/agent/fast_hadamard/plot_perf.py b/examples/agent/fast_hadamard/plot_perf.py new file mode 100644 index 00000000..6a894fd7 --- /dev/null +++ b/examples/agent/fast_hadamard/plot_perf.py @@ -0,0 +1,72 @@ +import os +import csv +try: + import matplotlib.pyplot as plt +except ImportError: + plt = None + + +def plot_bandwidth(input_dir="./perf_data/", output_path="bw_vs_shape.png"): + """Generate bandwidth plot from benchmark CSVs.""" + if plt is None: + print("Warning: matplotlib is not installed; skipping plot generation.") + return + + BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024] + BENCH_BLOCK_DIMS = [20, 24] + + fig, axes = plt.subplots(1, len(BENCH_BLOCK_DIMS), figsize=(14, 6), sharey=True) + if len(BENCH_BLOCK_DIMS) == 1: + axes = [axes] + + for ax, block_dim in zip(axes, BENCH_BLOCK_DIMS): + csv_path = os.path.join(input_dir, f"fht_pto_bd{block_dim}.csv") + if not os.path.exists(csv_path): + ax.set_title(f"BLOCK_DIM={block_dim} (no data)") + continue + + # Parse CSV: hidden_dim -> {batch: bw} + data = {} + with open(csv_path, encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + batch = int(row["batch"]) + n = int(row["N"]) + bw = float(row["bandwidth_gbs"]) + data.setdefault(n, {})[batch] = bw + + for idx, hidden_dim in enumerate(sorted(data.keys())): + batches = sorted(data[hidden_dim].keys()) + bws = [data[hidden_dim][b] for b in batches] + + if idx < 10: + marker = "o" + else: + last_markers = ["s", "^", "D"] + marker = last_markers[idx - 10] + + ax.plot( + batches, + bws, + marker=marker, + markersize=4, + label=f"hidden_dim={hidden_dim}", + ) + + ax.set_xscale("log", base=2) + ax.set_xticks(BENCH_BATCHES) + ax.set_xticklabels([str(b) for b in BENCH_BATCHES], rotation=45, fontsize=7) + ax.set_xlabel("batch") + ax.set_title(f"BLOCK_DIM={block_dim}") + ax.grid(True, alpha=0.3) + ax.legend(fontsize=7, ncol=2) + + axes[0].set_ylabel("Bandwidth (GB/s)") + fig.suptitle("Fast Hadamard PTO-DSL: Bandwidth vs Shape") + fig.tight_layout() + fig.savefig(input_dir + output_path, dpi=150) + print(f"\nPlot saved to {input_dir+output_path}") + + +if __name__ == "__main__": + plot_bandwidth() diff --git a/examples/agent/fast_hadamard/run_hadamard.py b/examples/agent/fast_hadamard/run_hadamard.py new file mode 100644 index 00000000..60ee8aee --- /dev/null +++ b/examples/agent/fast_hadamard/run_hadamard.py @@ -0,0 +1,208 @@ +import os +import argparse +import ctypes +import csv +import math + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +ELEMENTS_PER_TILE = 32 * 1024 // 2 # 32KB UB / sizeof(fp16) + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x (in-place) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n + ctypes.c_uint32, # log2_n + ] + lib.call_kernel.restype = None + + def hadamard_func(x, batch, n, log2_n, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + assert n <= ELEMENTS_PER_TILE, f"n must be <= {ELEMENTS_PER_TILE}, got {n}" + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + batch, + n, + log2_n, + ) + + return hadamard_func + + +def hadamard_ref_inplace(x): + """Reference FHT matching TGATHER(P0101/P1010) + TADD/TSUB layout.""" + x = x.clone() + n = x.shape[-1] + n_half = n // 2 + log2_n = int(math.log2(n)) + for _ in range(log2_n): + even = x[..., 0::2].clone() + odd = x[..., 1::2].clone() + x[..., :n_half] = even + odd + x[..., n_half:] = even - odd + return x + + +def _is_power_of_two(v): + return v > 0 and (v & (v - 1)) == 0 + + +def test_hadamard(hadamard_func, block_dim=24): + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 7, 29, 65] + n_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n in n_list: + if not _is_power_of_two(n): + continue + log2_n = int(math.log2(n)) + x = torch.randn(batch, n, device=device, dtype=dtype) + y_ref = hadamard_ref_inplace(x) + + hadamard_func(x, batch, n, log2_n) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(x, y_ref) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n={n}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n, status, detail)) + + print(f"detailed summary for {lib_path}:") + for batch, n, status, detail in results: + msg = f" batch={batch}, n={n}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +def benchmark(hadamard_func, warmup=2, repeats=20, output_dir="./perf_data/"): + """Benchmark across (batch, N, block_dim) configs. + + Uses separate input tensors per run to avoid L2 cache reuse, + and a single timing-event pair averaged over all runs. + """ + TEST_HIDDEN_DIMS = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024] + BENCH_BLOCK_DIMS = [20, 24] + + os.makedirs(output_dir, exist_ok=True) + + for block_dim in BENCH_BLOCK_DIMS: + print(f"\n{'=' * 60}") + print(f"BENCHMARK (BLOCK_DIM={block_dim})") + print(f"{'=' * 60}") + header = ( + f"{'batch':>6s} {'N':>6s}" + f" {'duration_us':>12s} {'bandwidth_gbs':>14s}" + ) + print(header) + print("-" * len(header)) + + records = [] + + for batch in BENCH_BATCHES: + for n in TEST_HIDDEN_DIMS: + log2_n = int(math.log2(n)) + allocated = warmup + repeats + + # Separate GM tensors to avoid L2 cache reuse + x_list = [ + torch.randn(batch, n, device="npu", dtype=torch.float16) + for _ in range(allocated) + ] + + # Warmup + for i in range(warmup): + hadamard_func(x_list[i], batch, n, log2_n, block_dim=block_dim) + torch.npu.synchronize() + + # Timed runs — single event pair, average over repeats + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + + start.record() + for i in range(repeats): + hadamard_func( + x_list[warmup + i], + batch, + n, + log2_n, + block_dim=block_dim, + ) + end.record() + torch.npu.synchronize() + + duration_ms = start.elapsed_time(end) / repeats + dur_us = duration_ms * 1e3 + + # Bandwidth: read + write = 2 * batch * n * sizeof(half) + data_bytes = 2 * batch * n * 2 + bw_gbs = (data_bytes / 1e9) / (dur_us / 1e6) if dur_us > 0 else 0.0 + + print(f"{batch:>6d} {n:>6d}" f" {dur_us:>12.2f} {bw_gbs:>14.2f}") + records.append(f"{batch},{n},{dur_us:.4f},{bw_gbs:.4f}") + + csv_path = os.path.join(output_dir, f"fht_pto_bd{block_dim}.csv") + with open(csv_path, "w", encoding="utf-8") as f: + f.write("batch,N,duration_us,bandwidth_gbs\n") + f.write("\n".join(records) + "\n") + print(f"\nSaved to {csv_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--manual-sync", + action="store_true", + help="Use manual-sync library instead of the default auto-sync library.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + + lib_path = ( + "./hadamard_manual_sync_lib.so" + if args.manual_sync + else "./hadamard_auto_sync_lib.so" + ) + + device = get_test_device() + torch.npu.set_device(device) + hadamard_func = load_lib(lib_path=lib_path, block_dim=args.block_dim) + + test_hadamard(hadamard_func) + benchmark(hadamard_func) diff --git a/examples/aot/fast_hadamard/_bench_wrapper.py b/examples/aot/fast_hadamard/_bench_wrapper.py new file mode 100644 index 00000000..2a8ad08d --- /dev/null +++ b/examples/aot/fast_hadamard/_bench_wrapper.py @@ -0,0 +1,69 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads hadamard_auto_sync_lib.so and prints: latency_ms= +""" +import ctypes +import math + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape — change to target a different operating point +BATCH = 32 +N = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + +LOG2_N = int(math.log2(N)) + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./hadamard_auto_sync_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x (in-place) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n + ctypes.c_uint32, # log2_n +] +lib.call_kernel.restype = None + + +def run(x): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + BATCH, + N, + LOG2_N, + ) + + +# Allocate separate tensors to avoid cache reuse +xs = [torch.randn(BATCH, N, device=device, dtype=torch.float16) for _ in range(WARMUP + ITERS)] + +for i in range(WARMUP): + run(xs[i]) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs[WARMUP + i]) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") From 349f30630fdf405098f97c1a01ab40629dba2c2c Mon Sep 17 00:00:00 2001 From: mirkodevita Date: Tue, 10 Mar 2026 15:00:13 +0000 Subject: [PATCH 6/6] added gemini support ad claude generated sigmoid --- examples/agent/sigmoid/_bench_wrapper.py | 67 +++++++++ examples/agent/sigmoid/caller.cpp | 24 ++++ examples/agent/sigmoid/compile.sh | 22 +++ examples/agent/sigmoid/run_sigmoid.py | 106 ++++++++++++++ examples/agent/sigmoid/sigmoid.cpp | 89 ++++++++++++ examples/agent/sigmoid/sigmoid.pto | 57 ++++++++ examples/agent/sigmoid/sigmoid_builder.py | 151 ++++++++++++++++++++ openenv/agent_search.py | 111 ++++++++------- openenv/anthropic_provider.py | 71 ++++++++++ openenv/gemini_provider.py | 162 ++++++++++++++++++++++ openenv/rms_norm_config.toml | 66 +++++++++ openenv/sigmoid_config.toml | 50 +++++++ 12 files changed, 929 insertions(+), 47 deletions(-) create mode 100644 examples/agent/sigmoid/_bench_wrapper.py create mode 100644 examples/agent/sigmoid/caller.cpp create mode 100644 examples/agent/sigmoid/compile.sh create mode 100644 examples/agent/sigmoid/run_sigmoid.py create mode 100644 examples/agent/sigmoid/sigmoid.cpp create mode 100644 examples/agent/sigmoid/sigmoid.pto create mode 100644 examples/agent/sigmoid/sigmoid_builder.py create mode 100644 openenv/anthropic_provider.py create mode 100644 openenv/gemini_provider.py create mode 100644 openenv/rms_norm_config.toml create mode 100644 openenv/sigmoid_config.toml diff --git a/examples/agent/sigmoid/_bench_wrapper.py b/examples/agent/sigmoid/_bench_wrapper.py new file mode 100644 index 00000000..b3adbe24 --- /dev/null +++ b/examples/agent/sigmoid/_bench_wrapper.py @@ -0,0 +1,67 @@ +""" +Single-config benchmark wrapper for the agentic optimizer. +Loads sigmoid_lib.so and prints: latency_ms= +""" +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +# Representative shape +BATCH = 1024 +N_COLS = 8192 +BLOCK_DIM = 24 +WARMUP = 5 +ITERS = 20 + + +def torch_to_ctypes(t): + return ctypes.c_void_p(t.data_ptr()) + + +device = get_test_device() +torch.npu.set_device(device) + +lib = ctypes.CDLL("./sigmoid_lib.so") +lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols +] +lib.call_kernel.restype = None + + +def run(x, y): + lib.call_kernel( + BLOCK_DIM, + torch.npu.current_stream()._as_parameter_, + torch_to_ctypes(x), + torch_to_ctypes(y), + BATCH, + N_COLS, + ) + + +dtype = torch.float16 +xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)] +y = torch.empty(BATCH, N_COLS, device=device, dtype=dtype) + +for i in range(WARMUP): + run(xs[i], y) +torch.npu.synchronize() + +starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +ends = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)] +for i in range(ITERS): + starts[i].record() + run(xs[WARMUP + i], y) + ends[i].record() +torch.npu.synchronize() + +ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS +print(f"latency_ms={ms:.4f}") diff --git a/examples/agent/sigmoid/caller.cpp b/examples/agent/sigmoid/caller.cpp new file mode 100644 index 00000000..e00da7b7 --- /dev/null +++ b/examples/agent/sigmoid/caller.cpp @@ -0,0 +1,24 @@ +#ifndef KERNEL_CPP +#define KERNEL_CPP "sigmoid.cpp" +#endif +#include KERNEL_CPP + +#ifndef NUM_CORES +#define NUM_CORES 24 +#endif + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *x, + uint8_t *y, + uint32_t batch, + uint32_t n_cols) +{ + uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES; + _kernel<<>>( + reinterpret_cast(x), + reinterpret_cast(y), + static_cast(batch), + static_cast(n_cols)); +} diff --git a/examples/agent/sigmoid/compile.sh b/examples/agent/sigmoid/compile.sh new file mode 100644 index 00000000..dd6dd2c8 --- /dev/null +++ b/examples/agent/sigmoid/compile.sh @@ -0,0 +1,22 @@ +set -e + +rm -f sigmoid.pto sigmoid.cpp sigmoid_lib.so + +python ./sigmoid_builder.py > ./sigmoid.pto +ptoas --enable-insert-sync ./sigmoid.pto -o ./sigmoid.cpp + +bisheng \ + -I${ASCEND_TOOLKIT_HOME}/include \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"sigmoid.cpp\"" \ + ./caller.cpp \ + -o ./sigmoid_lib.so diff --git a/examples/agent/sigmoid/run_sigmoid.py b/examples/agent/sigmoid/run_sigmoid.py new file mode 100644 index 00000000..bf3cfd40 --- /dev/null +++ b/examples/agent/sigmoid/run_sigmoid.py @@ -0,0 +1,106 @@ +import argparse +import ctypes + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + + +def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + +def load_lib(lib_path, block_dim=24): + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # x + ctypes.c_void_p, # y (output) + ctypes.c_uint32, # batch + ctypes.c_uint32, # n_cols + ] + lib.call_kernel.restype = None + + def sigmoid_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None): + if stream_ptr is None: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + block_dim, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(y), + batch, + n_cols, + ) + + return sigmoid_func + + +def sigmoid_ref(x): + """Reference sigmoid: 1 / (1 + exp(-x)), computed in fp32 then cast back.""" + return torch.sigmoid(x.float()).to(x.dtype) + + +def test_sigmoid(lib_path, block_dim=24): + device = get_test_device() + torch.npu.set_device(device) + + sigmoid = load_lib(lib_path=lib_path, block_dim=block_dim) + + torch.manual_seed(0) + dtype = torch.float16 + batch_list = [1, 4, 22, 65] + n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384] + + results = [] + for batch in batch_list: + for n_cols in n_cols_list: + x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4) + y = torch.empty(batch, n_cols, device=device, dtype=dtype) + + y_ref = sigmoid_ref(x) + sigmoid(x, y, batch, n_cols) + torch.npu.synchronize() + + is_match = True + detail = "" + try: + torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2) + except AssertionError as err: + is_match = False + detail = str(err).strip() if str(err) else "assert_close failed" + + status = "match" if is_match else "mismatch" + print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}") + if detail: + print(" detail:") + print(detail) + results.append((batch, n_cols, status, detail)) + + print(f"\ndetailed summary for {lib_path}:") + for batch, n_cols, status, detail in results: + msg = f" batch={batch}, n_cols={n_cols}, status={status}" + print(msg) + if detail: + print(" detail:") + print(detail) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + default="./sigmoid_lib.so", + help="Path to the shared library generated by compile.sh.", + ) + parser.add_argument( + "--block-dim", + type=int, + default=24, + help="Kernel blockDim (default: 24).", + ) + args = parser.parse_args() + test_sigmoid(args.lib, block_dim=args.block_dim) diff --git a/examples/agent/sigmoid/sigmoid.cpp b/examples/agent/sigmoid/sigmoid.cpp new file mode 100644 index 00000000..e8248de9 --- /dev/null +++ b/examples/agent/sigmoid/sigmoid.cpp @@ -0,0 +1,89 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) { + unsigned v5 = 1; + unsigned v6 = 0; + int32_t v7 = 16384; + int32_t v8 = 1; + int32_t v9 = 0; + int64_t v10 = 0; + int64_t v11 = 32768; + int64_t v12 = 65536; + int64_t v13 = 98304; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + if (v4 > v9) { + if (v4 <= v7) { + int64_t v14 = get_block_idx(); + int64_t v15 = get_subblockid(); + int64_t v16 = get_subblockdim(); + int64_t v17 = (int64_t) v16; + int64_t v18 = get_block_num(); + int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17); + int32_t v20 = v3 / v19; + int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20; + int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21); + int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21); + int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22); + int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4); + if (v24 > v9) { + Tile v26 = Tile(v4); + TASSIGN(v26, v10); + Tile v27 = Tile(v4); + TASSIGN(v27, v11); + Tile v28 = Tile(v4); + TASSIGN(v28, v12); + Tile v29 = Tile(v4); + TASSIGN(v29, v13); + for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) { + int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4); + unsigned v32 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34); + unsigned v36 = (unsigned) v4 * v5; + pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4); + pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v26, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v29, v26, v26); + pipe_barrier(PIPE_V); + TEXP(v27, v29); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TSUB(v28, v29, v26); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_V); + TEXP(v28, v28); + pipe_barrier(PIPE_V); + TADD(v28, v28, v27); + pipe_barrier(PIPE_V); + TDIV(v28, v27, v28); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v39, v28); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + }; + }; + } + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + return; +} + diff --git a/examples/agent/sigmoid/sigmoid.pto b/examples/agent/sigmoid/sigmoid.pto new file mode 100644 index 00000000..722397d9 --- /dev/null +++ b/examples/agent/sigmoid/sigmoid.pto @@ -0,0 +1,57 @@ +module { + func.func @_kernel(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: i32) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16384 = arith.constant 16384 : index + %0 = arith.index_cast %arg2 : i32 to index + %1 = arith.index_cast %arg3 : i32 to index + pto.section.vector { + %2 = arith.cmpi sgt, %1, %c0 : index + scf.if %2 { + %3 = arith.cmpi sge, %c16384, %1 : index + scf.if %3 { + %4 = pto.get_block_idx + %5 = pto.get_subblock_idx + %6 = pto.get_subblock_num + %7 = pto.get_block_num + %8 = arith.muli %4, %6 : i64 + %9 = arith.addi %8, %5 : i64 + %10 = arith.index_cast %9 : i64 to index + %11 = arith.muli %7, %6 : i64 + %12 = arith.index_cast %11 : i64 to index + %13 = arith.ceildivsi %0, %12 : index + %14 = arith.muli %10, %13 : index + %15 = arith.addi %14, %13 : index + %16 = arith.minui %15, %0 : index + %17 = arith.subi %16, %14 : index + %18 = arith.muli %0, %1 : index + %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view + %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view + %21 = arith.cmpi sgt, %17, %c0 : index + scf.if %21 { + %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf + scf.for %arg4 = %c0 to %17 step %c1 { + %26 = arith.addi %14, %arg4 : index + %27 = arith.muli %26, %1 : index + %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view -> !pto.partition_tensor_view<1x16384xf16> + pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf) + pto.tsub ins(%22, %22 : !pto.tile_buf, !pto.tile_buf) outs(%25 : !pto.tile_buf) + pto.texp ins(%25 : !pto.tile_buf) outs(%23 : !pto.tile_buf) + pto.tsub ins(%25, %22 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.texp ins(%24 : !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tadd ins(%24, %23 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tdiv ins(%23, %24 : !pto.tile_buf, !pto.tile_buf) outs(%24 : !pto.tile_buf) + pto.tstore ins(%24 : !pto.tile_buf) outs(%29 : !pto.partition_tensor_view<1x16384xf16>) + } + } + } + } + } + return + } +} + diff --git a/examples/agent/sigmoid/sigmoid_builder.py b/examples/agent/sigmoid/sigmoid_builder.py new file mode 100644 index 00000000..9b79d014 --- /dev/null +++ b/examples/agent/sigmoid/sigmoid_builder.py @@ -0,0 +1,151 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + +# 32 KB of UB / sizeof(fp16) = 16384 elements per tile +ELEMENTS_PER_TILE = 32 * 1024 // 2 + + +def meta_data(): + dtype = pto.float16 + ptr_type = pto.PtrType(dtype) + index_dtype = pto.int32 + + tensor_type = pto.TensorType(rank=1, dtype=dtype) + subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype) + + tile_cfg = pto.TileBufConfig() + tile_type = pto.TileBufType( + shape=[1, ELEMENTS_PER_TILE], + valid_shape=[1, -1], + dtype=dtype, + memory_space="VEC", + config=tile_cfg, + ) + + return { + "ptr_type": ptr_type, + "index_dtype": index_dtype, + "tensor_type": tensor_type, + "subtensor_type": subtensor_type, + "tile_type": tile_type, + } + + +def build_sigmoid(fn_name="sigmoid_fp16"): + """ + Build a dynamic-batch Sigmoid kernel in PTO DSL. + + Computes y = 1 / (1 + exp(-x)), where: + -x = 0 - x (tile.sub) + exp(-x) (tile.exp) + 1 + exp(-x) (tile.add with ones tile) + 1 / (1 + exp(-x)) (tile.reciprocal) + + Constants (1.0) are derived from the input tile itself using + the identity exp(x - x) = exp(0) = 1.0, which avoids the need for + scalar-tile broadcast operations not available in PTO DSL. + + UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB): + tb_x : input row x + tb_ones : constant 1.0 (recomputed each row via exp(x-x)) + tb_tmp1 : intermediate / final output + tb_tmp2 : intermediate (zeros / neg_x) + + Kernel args: + x_ptr : fp16[batch * n_cols] -- input + y_ptr : fp16[batch * n_cols] -- output + batch : int32 -- number of rows + n_cols : int32 -- elements per row; must be <= 16384 + """ + + @to_ir_module(meta_data=meta_data) + def _kernel( + x_ptr: "ptr_type", + y_ptr: "ptr_type", + batch_i32: "index_dtype", + n_cols_i32: "index_dtype", + ) -> None: + c0 = const(0) + c1 = const(1) + c_tile = const(ELEMENTS_PER_TILE) + + batch = s.index_cast(batch_i32) + n_cols = s.index_cast(n_cols_i32) + + with pto.vector_section(): + with pto.if_context(n_cols > c0): + with pto.if_context(c_tile >= n_cols): + cid = pto.get_block_idx() + sub_bid = pto.get_subblock_idx() + sub_bnum = pto.get_subblock_num() + num_blocks = pto.get_block_num() + + vid = s.index_cast(cid * sub_bnum + sub_bid) + num_cores = s.index_cast(num_blocks * sub_bnum) + + rows_per_core = s.ceil_div(batch, num_cores) + row_start = vid * rows_per_core + row_end = s.min_u(row_start + rows_per_core, batch) + num_rows = row_end - row_start + + total_elems = batch * n_cols + tv_x = pto.as_tensor( + tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1] + ) + tv_y = pto.as_tensor( + tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1] + ) + + with pto.if_context(num_rows > c0): + tb_x = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols) + tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols) + + for row_i in pto.range(c0, num_rows, c1): + gm_offset = (row_start + row_i) * n_cols + + sv_x = pto.slice_view( + subtensor_type, + source=tv_x, + offsets=[gm_offset], + sizes=[n_cols], + ) + sv_y = pto.slice_view( + subtensor_type, + source=tv_y, + offsets=[gm_offset], + sizes=[n_cols], + ) + + pto.load(sv_x, tb_x) + + # Derive ones: exp(x - x) = exp(0) = 1.0 + tile.sub(tb_x, tb_x, tb_tmp2) # tmp2 = 0.0 + tile.exp(tb_tmp2, tb_ones) # ones = 1.0 + + # sigmoid(x) = 1 / (1 + exp(-x)) + tile.sub(tb_tmp2, tb_x, tb_tmp1) # tmp1 = -x (0 - x) + tile.exp(tb_tmp1, tb_tmp1) # tmp1 = exp(-x) + tile.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = 1 + exp(-x) + tile.div(tb_ones, tb_tmp1, tb_tmp1) # tmp1 = 1 / (1 + exp(-x)) + + pto.store(tb_tmp1, sv_y) + + _ = fn_name + return _kernel + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--fn-name", + default="sigmoid_fp16", + help="Generated kernel function name.", + ) + args = parser.parse_args() + print(build_sigmoid(fn_name=args.fn_name)) diff --git a/openenv/agent_search.py b/openenv/agent_search.py index e5e9591d..199e2eb6 100644 --- a/openenv/agent_search.py +++ b/openenv/agent_search.py @@ -18,19 +18,19 @@ """ import argparse +import os import shutil import sys -import time import tomllib import urllib.request from pathlib import Path -import anthropic - ROOT = Path(__file__).parent.parent sys.path.insert(0, str(ROOT)) from openenv import KernelSearchEnv, KernelAction +import openenv.anthropic_provider as _anthropic_provider +import openenv.gemini_provider as _gemini_provider # --------------------------------------------------------------------------- # Config loading @@ -103,6 +103,8 @@ def resolve_cmd(cmd: list[str]) -> list[str]: _parser.add_argument("--max-turns", type=int, default=30) _parser.add_argument("--max-tokens", type=int, default=None, help="Max tokens per API call (overrides config; default: 8192)") +_parser.add_argument("--model", default=None, + help="Model to use, e.g. claude-opus-4-6 (default) or gemini-2.5-flash") _args = _parser.parse_args() CFG = load_config(_args.config) @@ -404,7 +406,14 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str: - One edit at a time; build and verify after each change. - If a build or test fails, revert the change with another edit_file call and try a different approach. - - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked. + - Once you achieve a confirmed speedup > 1.0x, write a file called + `optimization_note.md` using write_file. It should explain: + * What changes were made compared to the baseline + * Why each change improves performance + * The final speedup achieved + Then stop. + - If you exhaust all ideas without a speedup, still write `optimization_note.md` + summarising what was tried and why it did not help. """) def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str: @@ -488,12 +497,27 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str else _build_system_prompt(CFG, EXAMPLE_DIR) ) +# --------------------------------------------------------------------------- +# Model dispatch +# --------------------------------------------------------------------------- + +def _is_gemini(model: str) -> bool: + return model.startswith("gemini") + + +def _call_model(messages: list[dict], system: str, model: str, max_tokens: int) -> tuple[list[dict], str]: + if _is_gemini(model): + return _gemini_provider.call(messages, system, model, max_tokens, TOOLS) + return _anthropic_provider.call(messages, system, model, max_tokens, TOOLS) + + # --------------------------------------------------------------------------- # Agentic loop # --------------------------------------------------------------------------- def run_agent(max_turns: int = 30) -> None: - mode = CFG.get("mode", "optimize") - client = anthropic.Anthropic() + mode = CFG.get("mode", "optimize") + model = _args.model or CFG.get("model", "claude-opus-4-6") + print(f"Model: {model}") if mode == "create": name = CFG.get("kernel_name", "kernel") @@ -525,59 +549,50 @@ def run_agent(max_turns: int = 30) -> None: print(f"{'─'*60}") max_tokens = _args.max_tokens or CFG.get("max_tokens", 8192) - for attempt in range(5): - try: - with client.messages.stream( - model="claude-opus-4-6", - max_tokens=max_tokens, - thinking={"type": "adaptive"}, - system=SYSTEM, - tools=TOOLS, - messages=messages, - ) as stream: - response = stream.get_final_message() - break - except anthropic.RateLimitError as e: - wait = 60 * (attempt + 1) - print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})") - time.sleep(wait) - else: - raise RuntimeError("Rate limit retries exhausted") - - # Print any text Claude produced - for block in response.content: - if block.type == "text" and block.text.strip(): - print(f"\n[Claude] {block.text.strip()}") - - # Append assistant turn — strip trailing thinking blocks (API rejects them as final block) - content = list(response.content) - while content and getattr(content[-1], "type", None) == "thinking": - content.pop() - if content: + content_dicts, stop_reason = _call_model(messages, SYSTEM, model, max_tokens) + + # Print any text the model produced + for block in content_dicts: + if block["type"] == "text" and block["text"].strip(): + print(f"\n[Agent] {block['text'].strip()}") + + # Append assistant turn — strip trailing thinking blocks (Anthropic API rejects + # them as the final block). If the entire response is thinking (tokens exhausted + # before any output), inject synthetic turns to advance context. + content = list(content_dicts) + has_non_thinking = any(b["type"] != "thinking" for b in content) + if has_non_thinking: + while content and content[-1]["type"] == "thinking": + content.pop() messages.append({"role": "assistant", "content": content}) + elif content: + # All-thinking: inject a synthetic round-trip to unblock the next call. + messages.append({"role": "assistant", "content": [{"type": "text", "text": "I need to use the tools to proceed."}]}) + messages.append({"role": "user", "content": "Please proceed and use the available tools."}) # Done if no tool calls - if response.stop_reason == "end_turn": - print("\n[Agent] Claude finished.") + if stop_reason == "end_turn": + print("\n[Agent] Finished.") break # Execute tool calls tool_results = [] - for block in response.content: - if block.type != "tool_use": + for block in content_dicts: + if block["type"] != "tool_use": continue - print(f"\n → {block.name}({_fmt_input(block.input)})") - result = execute_tool(block.name, block.input) + print(f"\n → {block['name']}({_fmt_input(block['input'])})") + result = execute_tool(block["name"], block["input"]) print(f" ← {result[:300]}" + ("…" if len(result) > 300 else "")) tool_results.append({ - "type": "tool_result", - "tool_use_id": block.id, - "content": result, + "type": "tool_result", + "tool_use_id": block["id"], + "tool_name": block["name"], # kept for Gemini message translation + "content": result, }) # Track test results for create-mode exit condition - if block.name == "run_tests" and result.startswith("PASS"): + if block["name"] == "run_tests" and result.startswith("PASS"): tests_passed = True if tool_results: @@ -588,8 +603,10 @@ def run_agent(max_turns: int = 30) -> None: break if mode == "optimize" and env.best_speedup is not None and env.best_speedup > 1.0: - print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.") - break + note_path = EXAMPLE_DIR / "optimization_note.md" + if note_path.exists(): + print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x and optimization_note.md written — stopping.") + break # Final summary print(f"\n{'='*60}") diff --git a/openenv/anthropic_provider.py b/openenv/anthropic_provider.py new file mode 100644 index 00000000..35765fc9 --- /dev/null +++ b/openenv/anthropic_provider.py @@ -0,0 +1,71 @@ +"""Anthropic/Claude provider for agent_search.""" + +import time +import anthropic + + +def _sanitize_messages(messages: list[dict]) -> list[dict]: + """Strip provider-specific fields that Anthropic rejects (e.g. tool_name in tool_result).""" + clean = [] + for msg in messages: + content = msg["content"] + if isinstance(content, list): + new_content = [] + for block in content: + if isinstance(block, dict) and block.get("type") == "tool_result": + block = {k: v for k, v in block.items() if k != "tool_name"} + new_content.append(block) + clean.append({**msg, "content": new_content}) + else: + clean.append(msg) + return clean + + +def call( + messages: list[dict], + system: str, + model: str, + max_tokens: int, + tools: list[dict], +) -> tuple[list[dict], str]: + """Call Anthropic and return (content_dicts, stop_reason).""" + client = anthropic.Anthropic() + + for attempt in range(5): + try: + with client.messages.stream( + model=model, + max_tokens=max_tokens, + thinking={"type": "adaptive"}, + system=system, + tools=tools, + messages=_sanitize_messages(messages), + ) as stream: + response = stream.get_final_message() + break + except anthropic.RateLimitError as e: + wait = 60 * (attempt + 1) + print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})") + time.sleep(wait) + else: + raise RuntimeError("Rate limit retries exhausted") + + content_dicts: list[dict] = [] + for block in response.content: + if block.type == "thinking": + entry: dict = {"type": "thinking", "thinking": getattr(block, "thinking", "")} + sig = getattr(block, "signature", None) + if sig: + entry["signature"] = sig + content_dicts.append(entry) + elif block.type == "text": + content_dicts.append({"type": "text", "text": block.text}) + elif block.type == "tool_use": + content_dicts.append({ + "type": "tool_use", + "id": block.id, + "name": block.name, + "input": block.input, + }) + + return content_dicts, response.stop_reason diff --git a/openenv/gemini_provider.py b/openenv/gemini_provider.py new file mode 100644 index 00000000..9714a424 --- /dev/null +++ b/openenv/gemini_provider.py @@ -0,0 +1,162 @@ +"""Google Gemini provider for agent_search.""" + +import os +import time +import uuid + + +def _tools_for_gemini(tools: list[dict]) -> list[dict]: + """Convert Anthropic-style tool defs to Gemini function declarations. + + Gemini generates MALFORMED_FUNCTION_CALL when a tool declaration has an + empty-properties schema (e.g. build/run_tests/benchmark). Omit the + parameters key entirely for no-argument tools so Gemini knows they take none. + """ + decls = [] + for t in tools: + decl = {"name": t["name"], "description": t["description"]} + schema = t.get("input_schema", {}) + if schema.get("properties"): + decl["parameters"] = schema + decls.append(decl) + return [{"function_declarations": decls}] + + +def _messages_to_contents(messages: list[dict]) -> list[dict]: + """Translate shared message history to Gemini contents format.""" + contents = [] + for msg in messages: + role = "model" if msg["role"] == "assistant" else "user" + raw = msg["content"] + if isinstance(raw, str): + contents.append({"role": role, "parts": [{"text": raw}]}) + continue + parts: list[dict] = [] + for block in raw: + btype = block.get("type") if isinstance(block, dict) else getattr(block, "type", None) + if btype == "text": + text = block["text"] if isinstance(block, dict) else block.text + if text: + parts.append({"text": text}) + elif btype == "tool_use": + name = block["name"] if isinstance(block, dict) else block.name + inp = block["input"] if isinstance(block, dict) else block.input + fc_dict: dict = {"name": name, "args": inp or {}} + # Echo thought_signature back — required by Gemini thinking models. + sig = block.get("thought_signature") if isinstance(block, dict) else getattr(block, "thought_signature", None) + if sig is not None: + fc_dict["thought_signature"] = sig + parts.append({"function_call": fc_dict}) + elif btype == "tool_result": + fn_name = block.get("tool_name", block.get("tool_use_id", "tool")) + result = block.get("content", "") + parts.append({"function_response": {"name": fn_name, "response": {"result": result}}}) + elif btype == "gemini_thought": + text = block.get("text", "") if isinstance(block, dict) else getattr(block, "text", "") + parts.append({"thought": True, "text": text}) + # Anthropic-style thinking blocks are not forwarded to Gemini + if parts: + contents.append({"role": role, "parts": parts}) + return contents + + +def call( + messages: list[dict], + system: str, + model: str, + max_tokens: int, + tools: list[dict], +) -> tuple[list[dict], str]: + """Call Gemini and return (content_dicts, stop_reason).""" + try: + from google import genai as _genai + from google.genai import types as _gtypes + except ImportError: + raise RuntimeError("google-genai not installed — run: pip install google-genai") + + api_key = os.environ.get("GOOGLE_API_KEY") + if not api_key: + raise RuntimeError("GOOGLE_API_KEY environment variable not set") + + gclient = _genai.Client(api_key=api_key) + + max_retries = 5 + base_delay = 30.0 + for attempt in range(max_retries): + try: + response = gclient.models.generate_content( + model=model, + contents=_messages_to_contents(messages), + config=_gtypes.GenerateContentConfig( + system_instruction=system, + tools=_tools_for_gemini(tools), + max_output_tokens=max_tokens, + ), + ) + break + except Exception as _e: + err_str = str(_e) + if "429" not in err_str and "RESOURCE_EXHAUSTED" not in err_str: + raise + if "PerDay" in err_str or "per_day" in err_str.lower(): + raise RuntimeError( + "Gemini daily quota exhausted. " + "Upgrade your plan or wait until quota resets (midnight Pacific).\n" + f" Original error: {_e}" + ) from None + if attempt == max_retries - 1: + raise + import re as _re + delay_match = _re.search(r"retryDelay.*?(\d+)s", err_str) + delay = float(delay_match.group(1)) if delay_match else base_delay * (2 ** attempt) + print(f"\n[Gemini] Rate limited (429), retrying in {delay:.0f}s (attempt {attempt+1}/{max_retries})...") + time.sleep(delay) + + content_dicts: list[dict] = [] + stop_reason = "end_turn" + candidate = response.candidates[0] if response.candidates else None + + if candidate is None: + fb = getattr(response, "prompt_feedback", None) + raise RuntimeError( + f"Gemini returned no candidates — prompt may have been blocked.\n" + f" prompt_feedback: {fb}" + ) + + if candidate.content is None: + finish = str(getattr(candidate, "finish_reason", "unknown")) + if "MALFORMED_FUNCTION_CALL" in finish: + print(f"\n[Gemini] Warning: malformed function call ({finish}), nudging model to retry.") + return [{"type": "text", "text": "My previous tool call was malformed. I will retry with correct syntax."}], "tool_use" + raise RuntimeError( + f"Gemini candidate has no content (finish_reason={finish!r}).\n" + f" The response was likely filtered. Try a different model or simplify the system prompt." + ) + + for part in (candidate.content.parts or []): + if getattr(part, "thought", False): + content_dicts.append({"type": "gemini_thought", "text": getattr(part, "text", "") or ""}) + continue + fc = getattr(part, "function_call", None) + if fc and getattr(fc, "name", None): + entry: dict = { + "type": "tool_use", + "id": f"gemini-{uuid.uuid4().hex[:8]}", + "name": fc.name, + "input": dict(fc.args) if fc.args else {}, + } + sig = getattr(fc, "thought_signature", None) + if sig is not None: + entry["thought_signature"] = sig + content_dicts.append(entry) + stop_reason = "tool_use" + elif getattr(part, "text", None): + content_dicts.append({"type": "text", "text": part.text}) + + finish = str(getattr(candidate, "finish_reason", "")) + if "MAX_TOKENS" in finish: + stop_reason = "max_tokens" + elif "STOP" in finish and stop_reason != "tool_use": + stop_reason = "end_turn" + + return content_dicts, stop_reason diff --git a/openenv/rms_norm_config.toml b/openenv/rms_norm_config.toml new file mode 100644 index 00000000..91be9e40 --- /dev/null +++ b/openenv/rms_norm_config.toml @@ -0,0 +1,66 @@ +mode = "create" + +kernel_dir = "examples/aot/geglu_dynamic_multicore" # reference for API/build patterns +kernel_name = "rms_norm" +create_file = "rms_norm_builder.py" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_rms_norm.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +create_prompt = """ +Implement RMS Layer Normalisation: y = x * rsqrt(mean(x^2) + eps) * weight +PyTorch reference: torch.nn.functional.rms_norm(x, (n_cols,), weight, eps=1e-6) + +Inputs: + x — fp32 tensor (batch, n_cols) + weight — fp32 tensor (n_cols,) — per-element learnable scale (gamma) +Output: + out — fp32 tensor (batch, n_cols) — normalised and scaled values + +The kernel signature in caller.cpp should be: + extern "C" void call_kernel(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *weight, uint8_t *out, + uint32_t batch, uint32_t n_cols) + +The .so produced by compile.sh should be named rms_norm_lib.so. + +The test script run_rms_norm.py should: + - Use batch=16, n_cols=1024 (or similar) + - Compare against torch.nn.functional.rms_norm(x, (n_cols,), weight, eps=1e-6) + - Check max absolute difference < 1e-3 (fp32 tolerance) + +The bench wrapper _bench_wrapper.py should print latency_ms=. + +Implementation notes: + - Process one row per loop iteration. + - Load weight once before the loop (it is shared across all rows). + - For each row: + 1. Load x[row, :] into a tile. + 2. Square element-wise: tmp = x * x. + 3. Compute row sum of tmp → scalar tile (row_sum). + 4. Divide scalar by n_cols → mean of squares. + 5. Add eps (store eps as a scalar constant and add it using tile.add or a scalar op). + 6. rsqrt of the scalar → rms_inv. + 7. Broadcast rms_inv across a row tile and multiply x by it. + 8. Multiply element-wise by weight. + 9. Store result to out[row, :]. + - eps = 1e-6; represent as a tile filled with that constant for tile.add, + or add it to the scalar before rsqrt. +""" + +# Reference files from geglu — agent should adapt these for rms_norm +preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"] + +# Local PTO-DSL API files (authoritative) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings" diff --git a/openenv/sigmoid_config.toml b/openenv/sigmoid_config.toml new file mode 100644 index 00000000..9acc5ff7 --- /dev/null +++ b/openenv/sigmoid_config.toml @@ -0,0 +1,50 @@ +mode = "create" + +kernel_dir = "examples/aot/geglu_dynamic_multicore" # reference for API patterns only +kernel_name = "sigmoid" +create_file = "sigmoid_builder.py" + +build_cmd = ["bash", "compile.sh"] +test_cmd = ["python", "run_sigmoid.py"] +bench_cmd = ["python", "_bench_wrapper.py"] + +create_prompt = """ +Implement Sigmoid activation: y = 1 / (1 + exp(-x)) +PyTorch reference: torch.sigmoid(x) + +Input: fp16 tensor (batch, n_cols) +Output: fp16 tensor (batch, n_cols) — separate output pointer + +The kernel signature in caller.cpp should be: + extern "C" void call_kernel(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *y, + uint32_t batch, uint32_t n_cols) + +The .so produced by compile.sh should be named sigmoid_lib.so. +The test script run_sigmoid.py should compare against torch.sigmoid(x.float()).to(x.dtype). +The bench wrapper _bench_wrapper.py should print latency_ms=. + +Implementation hint: + sigmoid(x) = 1 / (1 + exp(-x)) + -x can be computed as tile.sub(zeros, x, neg_x) where zeros = tile with 0.0. + Use exp(x - x) = 1.0 to derive a tile of ones (no scalar-tile broadcast available). + Then: tile.exp(neg_x) -> tile.add(exp_neg_x, ones) -> tile.reciprocal(...). + +Scalar-to-tile broadcast is NOT available — do not pass scalar values to tile ops. +""" + +# These are loaded from the reference kernel_dir as examples — adapt them for sigmoid +preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"] + +# Local PTO-DSL API files (authoritative, read from repo) +local_context_files = [ + "ptodsl/api/pto.py", + "ptodsl/api/tile.py", + "ptodsl/api/scalar.py", + "ptodsl/api/control_flow.py", + "ptodsl/api/synchronization.py", +] + +[[context_urls]] +url = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py" +label = "PTO Python bindings"