From d5bfce57607685c15ea106ba9e2a67792cf28598 Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Mon, 9 Mar 2026 13:55:44 +0000
Subject: [PATCH 1/6] first implementation of KernelAgent for pto-dsl with
 optimized geglu

---
 .../geglu_dynamic_multicore/_bench_wrapper.py |  71 +++
 .../geglu_dynamic_multicore_opt/.gitignore    |   3 +
 .../aot/geglu_dynamic_multicore_opt/README.md |   7 +
 .../_bench_wrapper.py                         |  71 +++
 .../bench_geglu.py                            | 128 ++++++
 .../geglu_dynamic_multicore_opt/caller.cpp    |  26 ++
 .../geglu_dynamic_multicore_opt/compile.sh    |  22 +
 .../geglu_builder.py                          | 179 ++++++++
 .../geglu_dynamic_multicore_opt/run_geglu.py  | 121 ++++++
 openenv/README.md                             | 119 +++++
 openenv/__init__.py                           |   3 +
 openenv/agent_search.py                       | 407 ++++++++++++++++++
 openenv/geglu_config.toml                     |  37 ++
 openenv/hadamard_config.toml                  |  54 +++
 openenv/kernel_opt_env.py                     | 221 ++++++++++
 15 files changed, 1469 insertions(+)
 create mode 100644 examples/aot/geglu_dynamic_multicore/_bench_wrapper.py
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/.gitignore
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/README.md
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/caller.cpp
 create mode 100755 examples/aot/geglu_dynamic_multicore_opt/compile.sh
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py
 create mode 100644 examples/aot/geglu_dynamic_multicore_opt/run_geglu.py
 create mode 100644 openenv/README.md
 create mode 100644 openenv/__init__.py
 create mode 100644 openenv/agent_search.py
 create mode 100644 openenv/geglu_config.toml
 create mode 100644 openenv/hadamard_config.toml
 create mode 100644 openenv/kernel_opt_env.py
diff --git a/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py b/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py
new file mode 100644
index 00000000..dec6bfaf
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore/_bench_wrapper.py
@@ -0,0 +1,71 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads geglu_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./geglu_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # a
+    ctypes.c_void_p,  # b
+    ctypes.c_void_p,  # c (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(a, b, c):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(a),
+        torch_to_ctypes(b),
+        torch_to_ctypes(c),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+# Separate tensors per iteration to reduce cache reuse
+as_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)]
+bs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype) for _ in range(WARMUP + ITERS)]
+c   = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(as_[i], bs_[i], c)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(as_[WARMUP + i], bs_[WARMUP + i], c)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/aot/geglu_dynamic_multicore_opt/.gitignore b/examples/aot/geglu_dynamic_multicore_opt/.gitignore
new file mode 100644
index 00000000..99be97c5
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/.gitignore
@@ -0,0 +1,3 @@
+geglu.pto
+geglu.cpp
+geglu_lib.so
diff --git a/examples/aot/geglu_dynamic_multicore_opt/README.md b/examples/aot/geglu_dynamic_multicore_opt/README.md
new file mode 100644
index 00000000..374bb9cc
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/README.md
@@ -0,0 +1,7 @@
+Usage:
+
+```bash
+bash ./compile.sh
+python ./run_geglu.py
+python ./bench_geglu.py
+```
diff --git a/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py b/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py
new file mode 100644
index 00000000..dec6bfaf
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py
@@ -0,0 +1,71 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads geglu_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./geglu_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # a
+    ctypes.c_void_p,  # b
+    ctypes.c_void_p,  # c (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(a, b, c):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(a),
+        torch_to_ctypes(b),
+        torch_to_ctypes(c),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+# Separate tensors per iteration to reduce cache reuse
+as_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)]
+bs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype) for _ in range(WARMUP + ITERS)]
+c   = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(as_[i], bs_[i], c)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(as_[WARMUP + i], bs_[WARMUP + i], c)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py b/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py
new file mode 100644
index 00000000..2c36e5a7
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py
@@ -0,0 +1,128 @@
+import argparse
+import ctypes
+
+import torch
+import torch.nn.functional as F
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # a
+        ctypes.c_void_p,  # b
+        ctypes.c_void_p,  # c (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def geglu_func(a, b, c, batch, n_cols, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(a),
+            torch_to_ctypes(b),
+            torch_to_ctypes(c),
+            batch,
+            n_cols,
+        )
+
+    return geglu_func
+
+
+def bench_geglu(
+    geglu_func, a, b, c, kernel_name="geglu_func", warmup_iters=5, benchmark_iters=50
+):
+    batch, n_cols = a.shape
+    # reads a and b, writes c
+    io_bytes = a.numel() * a.element_size() * 3
+    # Overwrite a large buffer between launches to reduce L2 cache reuse.
+    cache = torch.empty((256 * 1024 * 1024,), dtype=torch.int8, device=a.device)
+
+    def time_op(fn):
+        for _ in range(warmup_iters):
+            fn()
+        torch.npu.synchronize()
+
+        mixed_start = torch.npu.Event(enable_timing=True)
+        mixed_end = torch.npu.Event(enable_timing=True)
+        cache_start = torch.npu.Event(enable_timing=True)
+        cache_end = torch.npu.Event(enable_timing=True)
+
+        mixed_start.record()
+        for _ in range(benchmark_iters):
+            cache.zero_()
+            fn()
+        mixed_end.record()
+        torch.npu.synchronize()
+
+        cache_start.record()
+        for _ in range(benchmark_iters):
+            cache.zero_()
+        cache_end.record()
+        torch.npu.synchronize()
+
+        mixed_total_ms = mixed_start.elapsed_time(mixed_end)
+        cache_total_ms = cache_start.elapsed_time(cache_end)
+        kernel_total_ms = max(mixed_total_ms - cache_total_ms, 0.0)
+        return kernel_total_ms / benchmark_iters
+
+    custom_ms = time_op(lambda: geglu_func(a, b, c, batch, n_cols))
+    torch_ms = time_op(lambda: torch.mul(F.gelu(a, approximate="tanh"), b))
+
+    custom_bw_gbs = (io_bytes / (custom_ms / 1e3)) / 1e9
+    torch_bw_gbs = (io_bytes / (torch_ms / 1e3)) / 1e9
+
+    print(
+        f"{kernel_name}: {custom_ms:.3f} ms, "
+        f"effective bandwidth: {custom_bw_gbs:.3f} GB/s "
+        f"(IO={io_bytes / 1e6:.2f} MB)"
+    )
+    print(
+        f"torch gelu*b: {torch_ms:.3f} ms, "
+        f"effective bandwidth: {torch_bw_gbs:.3f} GB/s "
+        f"(IO={io_bytes / 1e6:.2f} MB)"
+    )
+
+
+def run_bench(lib_path, block_dim=24, batch=1024, n_cols=8192):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    geglu_func = load_lib(lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    a = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4)
+    b = torch.randn(batch, n_cols, device=device, dtype=dtype)
+    c = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+    geglu_func(a, b, c, batch, n_cols)
+    torch.npu.synchronize()
+
+    a_f32 = a.float()
+    ref = (0.5 * a_f32 * (1.0 + torch.tanh(a_f32))).to(dtype) * b
+    torch.testing.assert_close(c, ref, rtol=1e-2, atol=1e-2)
+
+    bench_geglu(geglu_func, a, b, c, kernel_name=f"geglu ({lib_path})")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lib", default="./geglu_lib.so")
+    parser.add_argument("--block-dim", type=int, default=24)
+    parser.add_argument("--batch", type=int, default=1024)
+    parser.add_argument("--n-cols", type=int, default=8192)
+    args = parser.parse_args()
+    run_bench(args.lib, block_dim=args.block_dim, batch=args.batch, n_cols=args.n_cols)
diff --git a/examples/aot/geglu_dynamic_multicore_opt/caller.cpp b/examples/aot/geglu_dynamic_multicore_opt/caller.cpp
new file mode 100644
index 00000000..85351fd4
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/caller.cpp
@@ -0,0 +1,26 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "geglu.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *a,
+    uint8_t *b,
+    uint8_t *c,
+    uint32_t batch,
+    uint32_t n_cols)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    _kernel<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(a),
+        reinterpret_cast<half *>(b),
+        reinterpret_cast<half *>(c),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n_cols));
+}
diff --git a/examples/aot/geglu_dynamic_multicore_opt/compile.sh b/examples/aot/geglu_dynamic_multicore_opt/compile.sh
new file mode 100755
index 00000000..9da3faa7
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/compile.sh
@@ -0,0 +1,22 @@
+set -e
+
+rm -f geglu.pto geglu.cpp geglu_lib.so
+
+python ./geglu_builder.py > ./geglu.pto
+ptoas --enable-insert-sync ./geglu.pto -o ./geglu.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"geglu.cpp\"" \
+    ./caller.cpp \
+    -o ./geglu_lib.so
diff --git a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py b/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py
new file mode 100644
index 00000000..4dae7890
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py
@@ -0,0 +1,179 @@
+from ptodsl import to_ir_module
+import ptodsl.language as pto
+
+const = pto.const
+
+# 32 KB of UB / sizeof(fp16) = 16384 elements per tile
+ELEMENTS_PER_TILE = 32 * 1024 // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_type = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_type": subtensor_type,
+        "tile_type": tile_type,
+    }
+
+
+def build_geglu(fn_name="geglu_fp16"):
+    """
+    Build a dynamic-batch GEGLU kernel in PTO DSL (optimized).
+
+    Computes c = gelu_approx(a) * b, where:
+        gelu_approx(a) = a * sigmoid(2a)
+        sigmoid(x)     = exp(x) / (exp(x) + 1)
+
+    Optimizations vs baseline:
+      1. Sigmoid reformulation: gelu(a) = a * sigmoid(2a) uses only
+         1 exp + 1 div per row instead of 2 exp + 2 div.
+      2. Hoisted ones: exp(a-a)=1.0 computed once before the loop.
+
+    UB tile budget (fp16, 5 tiles x 32 KB = 160 KB < 192 KB):
+        tb_a    : input row a
+        tb_b    : input row b
+        tb_ones : constant 1.0 (computed once before loop)
+        tb_tmp1 : intermediate / final output
+        tb_tmp2 : intermediate
+
+    Kernel args:
+        a_ptr   : fp16[batch * n_cols]  -- gating input
+        b_ptr   : fp16[batch * n_cols]  -- linear input
+        c_ptr   : fp16[batch * n_cols]  -- output
+        batch   : int32                 -- number of rows
+        n_cols  : int32                 -- elements per row; must be <= 16384
+    """
+
+    @to_ir_module(meta_data=meta_data)
+    def _kernel(
+        a_ptr: "ptr_type",
+        b_ptr: "ptr_type",
+        c_ptr: "ptr_type",
+        batch_i32: "index_dtype",
+        n_cols_i32: "index_dtype",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c_tile = const(ELEMENTS_PER_TILE)
+
+        batch = pto.index_cast(batch_i32)
+        n_cols = pto.index_cast(n_cols_i32)
+
+        with pto.vector_section():
+            # Guard: n_cols must be in (0, ELEMENTS_PER_TILE].
+
+            with pto.if_context(n_cols > c0):
+                with pto.if_context(c_tile >= n_cols):
+                    cid = pto.get_block_idx()
+                    sub_bid = pto.get_subblock_idx()
+                    sub_bnum = pto.get_subblock_num()
+                    num_blocks = pto.get_block_num()
+
+                    vid = pto.index_cast(cid * sub_bnum + sub_bid)  # vector core index
+                    num_cores = pto.index_cast(num_blocks * sub_bnum)  # number of vector cores
+
+                    # Distribute rows across cores (row-level parallelism).
+                    rows_per_core = pto.ceil_div(batch, num_cores)
+                    row_start = vid * rows_per_core
+                    row_end = pto.min_u(row_start + rows_per_core, batch)
+                    num_rows = row_end - row_start
+
+                    total_elems = batch * n_cols
+                    tv_a = pto.as_tensor(
+                        tensor_type, ptr=a_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_b = pto.as_tensor(
+                        tensor_type, ptr=b_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_c = pto.as_tensor(
+                        tensor_type, ptr=c_ptr, shape=[total_elems], strides=[c1]
+                    )
+
+                    with pto.if_context(num_rows > c0):
+                        # Allocate 5 UB tiles (160 KB total, well under 192 KB UB).
+                        tb_a = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_b = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols)
+
+                        # Hoist ones computation: load first row, compute exp(a-a)=1
+                        sv_a_first = pto.slice_view(
+                            subtensor_type,
+                            source=tv_a,
+                            offsets=[row_start * n_cols],
+                            sizes=[n_cols],
+                        )
+                        pto.load(sv_a_first, tb_a)
+                        pto.sub(tb_a, tb_a, tb_tmp1)  # tmp1 = 0
+                        pto.exp(tb_tmp1, tb_ones)  # ones = 1.0
+
+                        for row_i in pto.for_range(c0, num_rows, c1):
+                            gm_offset = (row_start + row_i) * n_cols
+
+                            sv_a = pto.slice_view(
+                                subtensor_type,
+                                source=tv_a,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_b = pto.slice_view(
+                                subtensor_type,
+                                source=tv_b,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_c = pto.slice_view(
+                                subtensor_type,
+                                source=tv_c,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+
+                            pto.load(sv_a, tb_a)
+                            pto.load(sv_b, tb_b)
+
+                            # gelu(a) = a * sigmoid(2a)
+                            # sigmoid(2a) = exp(2a) / (exp(2a) + 1)
+                            pto.add(tb_a, tb_a, tb_tmp1)  # tmp1 = 2a
+                            pto.exp(tb_tmp1, tb_tmp1)  # tmp1 = exp(2a)
+                            pto.add(tb_tmp1, tb_ones, tb_tmp2)  # tmp2 = exp(2a) + 1
+                            pto.div(tb_tmp1, tb_tmp2, tb_tmp1)  # tmp1 = sigmoid(2a)
+
+                            # GEGLU: c = a * sigmoid(2a) * b
+                            pto.mul(tb_a, tb_tmp1, tb_tmp1)  # tmp1 = gelu(a)
+                            pto.mul(tb_tmp1, tb_b, tb_tmp1)  # tmp1 = c
+                            pto.store(tb_tmp1, sv_c)
+
+    _ = fn_name
+    return _kernel
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fn-name",
+        default="geglu_fp16",
+        help="Generated kernel function name.",
+    )
+    args = parser.parse_args()
+    print(build_geglu(fn_name=args.fn_name))
diff --git a/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py b/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py
new file mode 100644
index 00000000..a180206c
--- /dev/null
+++ b/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py
@@ -0,0 +1,121 @@
+import argparse
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # a
+        ctypes.c_void_p,  # b
+        ctypes.c_void_p,  # c (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def geglu_func(a, b, c, batch, n_cols, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(a),
+            torch_to_ctypes(b),
+            torch_to_ctypes(c),
+            batch,
+            n_cols,
+        )
+
+    return geglu_func
+
+
+def geglu_ref(a, b):
+    """Reference GEGLU matching the PTO kernel.
+
+    Computes c = gelu_approx(a) * b, where:
+        gelu_approx(a) = 0.5 * a * (1 + tanh(a))
+        tanh(a)        = (exp(2a) - 1) / (exp(2a) + 1)
+
+    Note: This is a simplified tanh-based GELU (without the polynomial
+    inner argument used in the full approximation). It matches what the
+    PTO kernel computes using only tile-tile operations.
+    """
+    a_f32 = a.float()
+    gelu_a = 0.5 * a_f32 * (1.0 + torch.tanh(a_f32))
+    return gelu_a.to(a.dtype) * b
+
+
+def test_geglu(lib_path, block_dim=24):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    geglu = load_lib(lib_path=lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 4, 22, 65]
+    n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n_cols in n_cols_list:
+            # Use small range to stay within fp16 exp range (avoid overflow).
+            a = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4)
+            b = torch.randn(batch, n_cols, device=device, dtype=dtype)
+            c = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+            y_ref = geglu_ref(a, b)
+            geglu(a, b, c, batch, n_cols)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(c, y_ref, rtol=1e-2, atol=1e-2)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n_cols, status, detail))
+
+    print(f"\ndetailed summary for {lib_path}:")
+    for batch, n_cols, status, detail in results:
+        msg = f"  batch={batch}, n_cols={n_cols}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        default="./geglu_lib.so",
+        help="Path to the shared library generated by compile.sh.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+    test_geglu(args.lib, block_dim=args.block_dim)
diff --git a/openenv/README.md b/openenv/README.md
new file mode 100644
index 00000000..b684a9bf
--- /dev/null
+++ b/openenv/README.md
@@ -0,0 +1,119 @@
+# openenv — Agentic Kernel Optimizer
+
+Uses Claude Opus with OpenEnv to iteratively edit, build, test, and benchmark PTO-DSL kernels until a faster-than-baseline implementation is found.
+
+## Quick start
+
+```bash
+ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/hadamard_config.toml
+```
+
+The agent will:
+1. Copy the kernel directory to a `_opt/` working directory (originals untouched)
+2. Preload source files and reference docs into its context
+3. Iteratively edit → build → test → benchmark in a loop
+4. Stop when it achieves a speedup > 1.0x or exhausts its turn budget
+
+The optimized result is left in the `_opt/` directory.
+
+## CLI options
+
+```
+python openenv/agent_search.py
+  --config PATH       Path to a kernel TOML config (default: openenv/hadamard_config.toml)
+  --max-turns N       Max agent turns (default: 30)
+  --max-tokens N      Max tokens per API call (default: 8192; overrides config)
+```
+
+## Config file reference
+
+Each kernel needs its own `.toml` config. See `hadamard_config.toml` or `geglu_config.toml` for complete examples.
+
+```toml
+# --- Required ---
+
+# Path to the kernel source directory (relative to repo root, or absolute)
+kernel_dir = "examples/aot/my_kernel"
+
+# Shell commands run from inside the working directory
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_my_kernel.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]   # must print "latency_ms=<float>"
+
+# Files snapshotted as baseline; restored on reset/revert
+baseline_files = ["my_kernel_builder.py"]
+
+# --- Agent context ---
+
+# Primary file the agent is told to look at first
+main_file = "my_kernel_builder.py"
+
+# Human-readable kernel name used in prompts
+kernel_name = "my_kernel"
+
+# Free-form domain notes injected into the system prompt
+tuning_notes = """
+- Key constraints, valid parameter ranges, hardware limits, etc.
+"""
+
+# Files the agent must NOT modify (enforced via system prompt)
+readonly_files = ["run_my_kernel.py", "caller.cpp"]
+
+# Files embedded verbatim in the system prompt at startup (saves read_file turns)
+preload_files = ["my_kernel_builder.py", "compile.sh", "caller.cpp"]
+
+# Remote docs fetched at startup and embedded in the system prompt.
+# GitHub blob URLs are auto-converted to raw.githubusercontent.com.
+[[context_urls]]
+url   = "https://github.com/org/repo/blob/main/some_reference.td"
+label = "Human-readable label shown in prompt"
+
+# --- Optional ---
+
+# Working directory for edits. Defaults to "{kernel_dir}_opt".
+# work_dir = "examples/aot/my_kernel_opt"
+
+# Max tokens per API call (also settable via --max-tokens CLI flag)
+# max_tokens = 8192
+```
+
+## Adding a new kernel
+
+1. Make sure the kernel directory has:
+   - A build script (e.g. `compile.sh`)
+   - A test script (e.g. `run_my_kernel.py`) that exits 0 on pass
+   - A bench wrapper `_bench_wrapper.py` that prints `latency_ms=<float>` to stdout
+
+2. Create a config file (copy `hadamard_config.toml` and adjust paths/notes)
+
+3. Run:
+   ```bash
+   ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/my_kernel_config.toml
+   ```
+
+### Writing `_bench_wrapper.py`
+
+The bench wrapper is a plain Python script executed as a subprocess. It must print exactly one line of the form:
+
+```
+latency_ms=0.1234
+```
+
+See `examples/aot/fast_hadamard/_bench_wrapper.py` or `examples/aot/geglu_dynamic_multicore/_bench_wrapper.py` for complete examples. Typical structure:
+
+```python
+# load lib, allocate tensors
+# warmup loop
+# timed loop with per-iteration events
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
+```
+
+## Files
+
+| File | Purpose |
+|---|---|
+| `agent_search.py` | Main entry point — agentic loop, config loading, tool executor |
+| `kernel_opt_env.py` | `KernelSearchEnv` — edit/build/test/benchmark environment |
+| `hadamard_config.toml` | Config for `examples/aot/fast_hadamard` |
+| `geglu_config.toml` | Config for `examples/aot/geglu_dynamic_multicore` |
diff --git a/openenv/__init__.py b/openenv/__init__.py
new file mode 100644
index 00000000..634f885e
--- /dev/null
+++ b/openenv/__init__.py
@@ -0,0 +1,3 @@
+from .kernel_opt_env import KernelAction, KernelObservation, KernelSearchEnv
+
+__all__ = ["KernelAction", "KernelObservation", "KernelSearchEnv"]
diff --git a/openenv/agent_search.py b/openenv/agent_search.py
new file mode 100644
index 00000000..e3e2d05e
--- /dev/null
+++ b/openenv/agent_search.py
@@ -0,0 +1,407 @@
+"""
+Agentic kernel optimizer.
+
+Uses Claude Opus with tool use to iteratively edit, build, test, and benchmark
+PTO-DSL kernels until a faster-than-baseline implementation is found.
+
+Usage:
+    ANTHROPIC_API_KEY=sk-... python openenv/agent_search.py --config openenv/kernel_config.toml
+
+The agent is given five tools that map 1-to-1 onto KernelSearchEnv actions:
+    read_file    – inspect any file in the kernel directory
+    edit_file    – apply a targeted text replacement to a source file
+    build        – compile the current source to a shared library
+    run_tests    – check correctness of the compiled kernel
+    benchmark    – measure latency and compute speedup vs baseline
+
+Claude decides what to change, builds, verifies, and benchmarks in a loop.
+"""
+
+import argparse
+import shutil
+import sys
+import time
+import tomllib
+import urllib.request
+from pathlib import Path
+
+import anthropic
+
+ROOT = Path(__file__).parent.parent
+sys.path.insert(0, str(ROOT))
+
+from openenv import KernelSearchEnv, KernelAction
+
+# ---------------------------------------------------------------------------
+# Config loading
+# ---------------------------------------------------------------------------
+
+def load_config(config_path: str) -> dict:
+    with open(config_path, "rb") as f:
+        return tomllib.load(f)
+
+
+def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]:
+    kernel_dir = Path(cfg["kernel_dir"])
+    if not kernel_dir.is_absolute():
+        kernel_dir = ROOT / kernel_dir
+
+    # Resolve working directory (copy of originals — never touch kernel_dir itself)
+    if "work_dir" in cfg:
+        work_dir = Path(cfg["work_dir"])
+        if not work_dir.is_absolute():
+            work_dir = ROOT / work_dir
+    else:
+        work_dir = kernel_dir.parent / (kernel_dir.name + "_opt")
+
+    if work_dir.exists():
+        shutil.rmtree(work_dir)
+    shutil.copytree(kernel_dir, work_dir)
+    print(f"Working directory: {work_dir}  (originals in {kernel_dir} are untouched)")
+
+    baseline_files = {
+        name: (kernel_dir / name).read_text(encoding="utf-8")
+        for name in cfg["baseline_files"]
+    }
+
+    def resolve_cmd(cmd: list[str]) -> list[str]:
+        # Replace bare "python" with the current interpreter
+        return [sys.executable if c == "python" else c for c in cmd]
+
+    env = KernelSearchEnv(
+        repo_path=str(work_dir),
+        build_cmd=resolve_cmd(cfg["build_cmd"]),
+        test_cmd=resolve_cmd(cfg["test_cmd"]),
+        bench_cmd=resolve_cmd(cfg["bench_cmd"]),
+        baseline_files=baseline_files,
+    )
+    return env, work_dir
+
+
+# ---------------------------------------------------------------------------
+# Parse args and initialise globals
+# ---------------------------------------------------------------------------
+
+_parser = argparse.ArgumentParser(description="Agentic kernel optimizer")
+_parser.add_argument(
+    "--config", default=str(Path(__file__).parent / "kernel_config.toml"),
+    help="Path to kernel TOML config (default: openenv/kernel_config.toml)",
+)
+_parser.add_argument("--max-turns", type=int, default=30)
+_parser.add_argument("--max-tokens", type=int, default=None,
+    help="Max tokens per API call (overrides config; default: 8192)")
+_args = _parser.parse_args()
+
+CFG = load_config(_args.config)
+EXAMPLE_DIR: Path
+env: KernelSearchEnv
+env, EXAMPLE_DIR = build_env(CFG)
+
+# ---------------------------------------------------------------------------
+# Tool definitions
+# ---------------------------------------------------------------------------
+TOOLS = [
+    {
+        "name": "read_file",
+        "description": (
+            "Read a source file from the kernel directory. "
+            "Use this first to understand the current implementation before proposing changes."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "path": {
+                    "type": "string",
+                    "description": "Path relative to the kernel directory, e.g. 'add_builder.py'",
+                }
+            },
+            "required": ["path"],
+        },
+    },
+    {
+        "name": "edit_file",
+        "description": (
+            "Replace a unique snippet of text in a source file with new text. "
+            "'old' must match exactly (including whitespace). "
+            "Make one focused change at a time."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "path": {"type": "string", "description": "Relative file path"},
+                "old":  {"type": "string", "description": "Exact text to find and replace"},
+                "new":  {"type": "string", "description": "Replacement text"},
+            },
+            "required": ["path", "old", "new"],
+        },
+    },
+    {
+        "name": "build",
+        "description": "Compile the current source files into a shared library (.so). Must be called after every edit before testing or benchmarking.",
+        "input_schema": {"type": "object", "properties": {}, "required": []},
+    },
+    {
+        "name": "run_tests",
+        "description": "Run the correctness test suite. Returns pass/fail and any error output. Always call this after build before benchmark.",
+        "input_schema": {"type": "object", "properties": {}, "required": []},
+    },
+    {
+        "name": "benchmark",
+        "description": "Measure the kernel latency and compute speedup vs the baseline. Returns latency_ms and speedup_vs_baseline.",
+        "input_schema": {"type": "object", "properties": {}, "required": []},
+    },
+]
+
+# ---------------------------------------------------------------------------
+# Tool executor  (bridges Claude tool calls → KernelSearchEnv)
+# ---------------------------------------------------------------------------
+def execute_tool(name: str, tool_input: dict) -> str:
+    if name == "read_file":
+        path = EXAMPLE_DIR / tool_input["path"]
+        try:
+            return path.read_text(encoding="utf-8")
+        except FileNotFoundError:
+            return f"Error: file not found: {tool_input['path']}"
+
+    if name == "edit_file":
+        for key in ("path", "old", "new"):
+            if key not in tool_input:
+                return f"Error: edit_file requires '{key}' parameter."
+        obs = env.step(KernelAction("edit", {
+            "path": tool_input["path"],
+            "old":  tool_input["old"],
+            "new":  tool_input["new"],
+        }))
+        return obs.summary
+
+    if name == "build":
+        obs = env.step(KernelAction("build", {}))
+        return obs.summary
+
+    if name == "run_tests":
+        obs = env.step(KernelAction("test", {}))
+        status = "PASS" if obs.passed_tests else "FAIL"
+        return f"{status}\n{obs.summary}"
+
+    if name == "benchmark":
+        obs = env.step(KernelAction("benchmark", {}))
+        if obs.latency_ms is None:
+            return f"Benchmark failed: {obs.summary}"
+        return (
+            f"latency_ms={obs.latency_ms:.4f}  "
+            f"speedup={obs.speedup_vs_baseline:.4f}x vs baseline ({env.baseline_ms:.4f} ms)\n"
+            f"Best so far: {env.best_ms:.4f} ms ({env.best_speedup:.4f}x)"
+        )
+
+    return f"Unknown tool: {name}"
+
+# ---------------------------------------------------------------------------
+# System prompt  (built from config)
+# ---------------------------------------------------------------------------
+import textwrap as _textwrap
+
+
+def _github_to_raw(url: str) -> str:
+    """Convert a github.com blob URL to raw.githubusercontent.com."""
+    if "github.com" in url and "/blob/" in url:
+        url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
+    return url
+
+
+def _fetch_url(url: str, max_chars: int = 40_000) -> str:
+    url = _github_to_raw(url)
+    try:
+        with urllib.request.urlopen(url, timeout=15) as resp:
+            content = resp.read().decode("utf-8", errors="replace")
+        if len(content) > max_chars:
+            content = content[:max_chars] + f"\n... (truncated at {max_chars} chars)"
+        return content
+    except Exception as exc:
+        return f"(fetch failed: {exc})"
+
+
+def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
+    name      = cfg.get("kernel_name", "kernel")
+    main_file = cfg.get("main_file", "builder.py")
+    tuning    = cfg.get("tuning_notes", "").strip()
+    readonly  = cfg.get("readonly_files", [])
+
+    readonly_rule = (
+        f"- Never modify these files (test harness): {', '.join(readonly)}."
+        if readonly else ""
+    )
+
+    files = sorted(f.name for f in kernel_dir.iterdir() if f.is_file())
+    file_listing = ", ".join(files)
+
+    # Embed preload_files verbatim so the agent doesn't waste turns reading them
+    preload_section = ""
+    for fname in cfg.get("preload_files", []):
+        fpath = kernel_dir / fname
+        try:
+            content = fpath.read_text(encoding="utf-8")
+            preload_section += f"\n### {fname}\n```\n{content}\n```\n"
+        except FileNotFoundError:
+            preload_section += f"\n### {fname}\n(file not found)\n"
+
+    if preload_section:
+        preload_section = "\n## Pre-loaded files (do NOT re-read these)\n" + preload_section
+
+    # Fetch remote reference docs and embed them
+    url_section = ""
+    for entry in cfg.get("context_urls", []):
+        if isinstance(entry, dict):
+            url, label = entry["url"], entry.get("label", entry["url"])
+        else:
+            url, label = entry, entry
+        print(f"  Fetching context: {label} …")
+        content = _fetch_url(url)
+        url_section += f"\n### {label}\n```\n{content}\n```\n"
+    if url_section:
+        url_section = "\n## Reference documentation (read-only, do not edit)\n" + url_section
+
+    return _textwrap.dedent(f"""\
+        You are an expert NPU kernel engineer specialising in Ascend/PTO-DSL performance tuning.
+
+        Your task is to find a version of the {name} kernel that is FASTER than the baseline.
+
+        The kernel directory contains exactly these files: {file_listing}
+        Do NOT attempt to read any other filenames — they do not exist.
+        {preload_section}{url_section}
+        ## Your workflow
+        1. Propose one targeted change based on the pre-loaded files above
+        2. edit_file(...)
+        3. build()  — check for compile errors
+        4. run_tests()  — ensure correctness
+        5. benchmark()  — measure speedup
+
+        Repeat until speedup > 1.0 (faster than baseline) or you have exhausted ideas.
+
+        ## Key tuning notes
+        {tuning}
+
+        ## Rules
+        {readonly_rule}
+        - One edit at a time; build and verify after each change.
+        - If a build or test fails, revert the change with another edit_file call
+          and try a different approach.
+        - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked.
+    """)
+
+SYSTEM = _build_system_prompt(CFG, EXAMPLE_DIR)
+
+# ---------------------------------------------------------------------------
+# Agentic loop
+# ---------------------------------------------------------------------------
+def run_agent(max_turns: int = 30) -> None:
+    # Establish baseline
+    print("Establishing baseline …")
+    obs = env.reset()
+    print(f"Baseline: {obs.latency_ms:.4f} ms\n")
+
+    client = anthropic.Anthropic()
+    preloaded = CFG.get("preload_files", [])
+    if preloaded:
+        start_hint = f"The source files are already in your context above — start proposing a change directly."
+    else:
+        main_file = CFG.get("main_file", "builder.py")
+        start_hint = f"Start by reading {main_file}."
+
+    messages: list[dict] = [
+        {
+            "role": "user",
+            "content": (
+                f"The baseline kernel latency is {obs.latency_ms:.4f} ms. "
+                f"Please find a faster implementation. {start_hint}"
+            ),
+        }
+    ]
+
+    turn = 0
+    while turn < max_turns:
+        turn += 1
+        print(f"\n{'─'*60}")
+        print(f"Turn {turn}/{max_turns}")
+        print(f"{'─'*60}")
+
+        max_tokens = _args.max_tokens or CFG.get("max_tokens", 8192)
+        for attempt in range(5):
+            try:
+                with client.messages.stream(
+                    model="claude-opus-4-6",
+                    max_tokens=max_tokens,
+                    thinking={"type": "adaptive"},
+                    system=SYSTEM,
+                    tools=TOOLS,
+                    messages=messages,
+                ) as stream:
+                    response = stream.get_final_message()
+                break
+            except anthropic.RateLimitError as e:
+                wait = 60 * (attempt + 1)
+                print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})")
+                time.sleep(wait)
+        else:
+            raise RuntimeError("Rate limit retries exhausted")
+
+        # Print any text Claude produced
+        for block in response.content:
+            if block.type == "text" and block.text.strip():
+                print(f"\n[Claude] {block.text.strip()}")
+
+        # Append assistant turn — strip trailing thinking blocks (API rejects them as final block)
+        content = list(response.content)
+        while content and getattr(content[-1], "type", None) == "thinking":
+            content.pop()
+        if content:
+            messages.append({"role": "assistant", "content": content})
+
+        # Done if no tool calls
+        if response.stop_reason == "end_turn":
+            print("\n[Agent] Claude finished.")
+            break
+
+        # Execute tool calls
+        tool_results = []
+        for block in response.content:
+            if block.type != "tool_use":
+                continue
+
+            print(f"\n  → {block.name}({_fmt_input(block.input)})")
+            result = execute_tool(block.name, block.input)
+            print(f"  ← {result[:300]}" + ("…" if len(result) > 300 else ""))
+
+            tool_results.append({
+                "type": "tool_result",
+                "tool_use_id": block.id,
+                "content": result,
+            })
+
+        if tool_results:
+            messages.append({"role": "user", "content": tool_results})
+
+        # Early exit if we already have a confirmed speedup
+        if env.best_speedup is not None and env.best_speedup > 1.0:
+            print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.")
+            break
+
+    # Final summary
+    print(f"\n{'='*60}")
+    print("SEARCH COMPLETE")
+    print(f"{'='*60}")
+    print(f"  Baseline:     {env.baseline_ms:.4f} ms")
+    print(f"  Best found:   {env.best_ms:.4f} ms")
+    print(f"  Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else "  No improvement found.")
+
+
+def _fmt_input(inp: dict) -> str:
+    """Compact one-line representation of tool input for logging."""
+    parts = []
+    for k, v in inp.items():
+        s = str(v).replace("\n", "\\n")
+        parts.append(f"{k}={s[:60]!r}" if len(s) > 60 else f"{k}={s!r}")
+    return ", ".join(parts)
+
+
+if __name__ == "__main__":
+    run_agent(max_turns=_args.max_turns)
diff --git a/openenv/geglu_config.toml b/openenv/geglu_config.toml
new file mode 100644
index 00000000..125ce6cb
--- /dev/null
+++ b/openenv/geglu_config.toml
@@ -0,0 +1,37 @@
+# Kernel optimizer configuration — GEGLU dynamic multicore
+# Usage: python openenv/agent_search.py --config openenv/geglu_config.toml
+
+kernel_dir = "examples/aot/geglu_dynamic_multicore"
+
+# work_dir = "examples/aot/geglu_dynamic_multicore_opt"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_geglu.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+baseline_files = ["geglu_builder.py"]
+main_file      = "geglu_builder.py"
+kernel_name    = "geglu"
+
+tuning_notes = """
+- Computes c = gelu_approx(a) * b  (in-place tanh-GELU gate, fp16).
+- ELEMENTS_PER_TILE = 16384 (32KB UB / sizeof(fp16)). n_cols must be <= this.
+- Currently uses 5 UB tiles (160 KB / 192 KB). One extra tile is available.
+- Constants 1.0 and 2.0 are derived via exp(a-a)=1 (no scalar-tile broadcast).
+- Rows are distributed across vector cores; rows_per_core = ceil(batch / num_cores).
+- Benchmark config: batch=1024, n_cols=8192, block_dim=24 (see _bench_wrapper.py).
+- Key tuning ideas: reduce ops per row (reuse intermediates), pipeline loads with
+  compute using double-buffering, or explore different tile sizes.
+"""
+
+readonly_files = ["run_geglu.py", "caller.cpp"]
+
+preload_files = ["geglu_builder.py", "compile.sh", "caller.cpp"]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td"
+label = "PTO MLIR op definitions (PTOOps.td)"
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings (pto.py)"
diff --git a/openenv/hadamard_config.toml b/openenv/hadamard_config.toml
new file mode 100644
index 00000000..d950dd3f
--- /dev/null
+++ b/openenv/hadamard_config.toml
@@ -0,0 +1,54 @@
+# Kernel optimizer configuration
+# Pass to agent_search.py with: python agent_search.py --config kernel_config.toml
+
+# Path to the kernel directory (relative to repo root, or absolute)
+kernel_dir = "examples/aot/fast_hadamard"
+
+# Working directory where edits happen (original files are never touched).
+# Relative paths are resolved from repo root. Defaults to "{kernel_dir}_opt".
+# work_dir = "examples/aot/fast_hadamard_opt"
+
+# Shell commands (relative paths resolved from kernel_dir)
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_hadamard.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+# Files whose content is snapshotted as the baseline for reset/revert
+baseline_files = ["hadamard_builder.py"]
+
+# The first file the agent should read to understand the kernel
+main_file = "hadamard_builder.py"
+
+# Human-readable name used in prompts
+kernel_name = "fast_hadamard"
+
+# Free-form tuning notes injected into the system prompt
+tuning_notes = """
+- The kernel operates on fp16 tensors in-place: shape (batch, N), N must be a power of 2.
+- ELEMENTS_PER_TILE = 16384 (32KB UB / sizeof(fp16)). N must be <= this.
+- samples_per_load controls how many rows are processed per chunk (currently 1).
+  Increasing it may improve throughput if UB pressure allows.
+- Two ping/pong tile sets (event_id 0/1) are used to hide DMA latency.
+- The auto-sync variant relies on `ptoas --enable-insert-sync`; no manual barriers needed.
+- Benchmark config: batch=32, N=8192, block_dim=24 (see _bench_wrapper.py).
+"""
+
+# Files the agent must NOT modify (enforced via system-prompt rules)
+readonly_files = ["run_hadamard.py", "caller.cpp"]
+
+# Files embedded verbatim in the system prompt so the agent starts with full context
+# (saves turns that would otherwise be spent on read_file calls)
+preload_files = ["hadamard_builder.py", "compile.sh", "caller.cpp"]
+
+# Remote reference docs fetched at startup and embedded in the system prompt.
+# GitHub blob URLs are auto-converted to raw. gitcode.com is not fetchable.
+# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/docs/isa
+# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/include/pto/npu/a2a3
+# Reference only (not fetchable): https://gitcode.com/cann/pto-isa/blob/master/tests/npu/a2a3/src/st/testcase/
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td"
+label = "PTO MLIR op definitions (PTOOps.td)"
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings (pto.py)"
diff --git a/openenv/kernel_opt_env.py b/openenv/kernel_opt_env.py
new file mode 100644
index 00000000..503b99ac
--- /dev/null
+++ b/openenv/kernel_opt_env.py
@@ -0,0 +1,221 @@
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class KernelAction:
+    kind: str    # "edit", "build", "test", "benchmark"
+    payload: dict
+
+
+@dataclass
+class KernelObservation:
+    passed_tests: bool
+    latency_ms: float | None
+    speedup_vs_baseline: float | None
+    summary: str
+
+
+class KernelSearchEnv:
+    """
+    Optimization environment for finding fast NPU kernels.
+
+    env = KernelSearchEnv(
+        repo_path      = "path/to/kernel/dir",
+        build_cmd      = ["bash", "compile.sh"],
+        test_cmd       = ["python", "run.py"],
+        bench_cmd      = ["python", "bench.py"],   # must print "latency_ms=<float>"
+        baseline_files = {"builder.py": "<source>"},
+    )
+    obs = env.reset()
+    obs = env.step(KernelAction("edit", {"path": "builder.py", "old": "...", "new": "..."}))
+    obs = env.step(KernelAction("test", {}))
+    obs = env.step(KernelAction("benchmark", {}))
+    """
+
+    def __init__(
+        self,
+        repo_path: str,
+        test_cmd: list[str],
+        bench_cmd: list[str],
+        build_cmd: list[str] | None = None,
+        baseline_files: dict[str, str] | None = None,
+    ):
+        self._root = Path(repo_path)
+        self._test_cmd = test_cmd
+        self._bench_cmd = bench_cmd
+        self._build_cmd = build_cmd
+        self._baseline_files: dict[str, str] = baseline_files or {}
+        self._baseline_ms: float | None = None
+        self._best_ms: float | None = None
+
+    # ------------------------------------------------------------------
+    # Public interface
+    # ------------------------------------------------------------------
+
+    def reset(self) -> KernelObservation:
+        """Restore baseline sources, recompile, and benchmark."""
+        self._restore_baseline()
+        if self._build_cmd:
+            ok, out = self._run_cmd(self._build_cmd)
+            if not ok:
+                return self._error_obs(f"Baseline build failed:\n{out}")
+        latency = self._run_benchmark()
+        self._baseline_ms = latency
+        self._best_ms = latency
+        return KernelObservation(
+            passed_tests=True,
+            latency_ms=latency,
+            speedup_vs_baseline=1.0,
+            summary="Baseline ready",
+        )
+
+    def step(self, action: KernelAction) -> KernelObservation:
+        dispatch = {
+            "edit":      self._handle_edit,
+            "build":     self._handle_build,
+            "test":      self._handle_test,
+            "benchmark": self._handle_benchmark,
+        }
+        handler = dispatch.get(action.kind)
+        if handler is None:
+            raise ValueError(f"Unknown action kind {action.kind!r}. Valid: {list(dispatch)}")
+        return handler(action.payload)
+
+    # ------------------------------------------------------------------
+    # Action handlers
+    # ------------------------------------------------------------------
+
+    def _handle_edit(self, payload: dict) -> KernelObservation:
+        rel = payload.get("path")
+        if not rel:
+            return self._error_obs("edit payload must include 'path'")
+
+        target = self._root / rel
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        if "content" in payload:
+            target.write_text(payload["content"], encoding="utf-8")
+        elif "old" in payload and "new" in payload:
+            original = target.read_text(encoding="utf-8")
+            if payload["old"] not in original:
+                return self._error_obs(f"'old' string not found in {rel}; edit not applied.")
+            target.write_text(original.replace(payload["old"], payload["new"], 1), encoding="utf-8")
+        else:
+            return self._error_obs("edit payload must have 'content' or both 'old'+'new'")
+
+        return KernelObservation(
+            passed_tests=False,
+            latency_ms=None,
+            speedup_vs_baseline=None,
+            summary=f"Applied edit to {rel}. Run 'test' to verify correctness.",
+        )
+
+    def _handle_build(self, payload: dict) -> KernelObservation:
+        cmd = payload.get("cmd") or self._build_cmd
+        if not cmd:
+            return self._error_obs("No build_cmd configured and none provided in payload.")
+        ok, output = self._run_cmd(cmd)
+        return KernelObservation(
+            passed_tests=ok,
+            latency_ms=None,
+            speedup_vs_baseline=None,
+            summary=output if not ok else f"Build succeeded.\n{output}".strip(),
+        )
+
+    def _handle_test(self, payload: dict) -> KernelObservation:
+        cmd = payload.get("cmd") or self._test_cmd
+        ok, output = self._run_cmd(cmd)
+        return KernelObservation(
+            passed_tests=ok,
+            latency_ms=None,
+            speedup_vs_baseline=None,
+            summary=output if not ok else f"All tests passed.\n{output}".strip(),
+        )
+
+    def _handle_benchmark(self, payload: dict) -> KernelObservation:
+        cmd = payload.get("cmd") or self._bench_cmd
+        ok, output = self._run_cmd(cmd)
+        if not ok:
+            return KernelObservation(
+                passed_tests=False, latency_ms=None, speedup_vs_baseline=None,
+                summary=f"Benchmark failed:\n{output}",
+            )
+
+        latency = self._parse_latency(output)
+        if latency is None:
+            return self._error_obs(
+                f"Could not parse 'latency_ms=<number>' from bench output:\n{output}"
+            )
+
+        speedup = self._baseline_ms / latency if self._baseline_ms else None
+        if self._best_ms is None or latency < self._best_ms:
+            self._best_ms = latency
+
+        return KernelObservation(
+            passed_tests=True,
+            latency_ms=latency,
+            speedup_vs_baseline=speedup,
+            summary=self._bench_summary(latency, speedup),
+        )
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    def _restore_baseline(self):
+        for rel, content in self._baseline_files.items():
+            target = self._root / rel
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_text(content, encoding="utf-8")
+
+    def _run_cmd(self, cmd: list[str] | str) -> tuple[bool, str]:
+        if isinstance(cmd, str):
+            cmd = cmd.split()
+        result = subprocess.run(cmd, cwd=str(self._root), capture_output=True, text=True)
+        return result.returncode == 0, (result.stdout + result.stderr).strip()
+
+    def _run_benchmark(self) -> float | None:
+        ok, output = self._run_cmd(self._bench_cmd)
+        return self._parse_latency(output) if ok else None
+
+    @staticmethod
+    def _parse_latency(output: str) -> float | None:
+        for line in reversed(output.splitlines()):
+            if line.strip().startswith("latency_ms="):
+                try:
+                    return float(line.strip().split("=", 1)[1])
+                except ValueError:
+                    pass
+        return None
+
+    def _bench_summary(self, latency: float, speedup: float | None) -> str:
+        parts = [f"latency_ms={latency:.3f}"]
+        if speedup is not None:
+            parts.append(f"speedup={speedup:.3f}x vs baseline")
+        if self._best_ms is not None:
+            parts.append(f"best so far: {self._best_ms:.3f} ms")
+        return "  ".join(parts)
+
+    @staticmethod
+    def _error_obs(msg: str) -> KernelObservation:
+        return KernelObservation(passed_tests=False, latency_ms=None, speedup_vs_baseline=None, summary=msg)
+
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+
+    @property
+    def baseline_ms(self) -> float | None:
+        return self._baseline_ms
+
+    @property
+    def best_ms(self) -> float | None:
+        return self._best_ms
+
+    @property
+    def best_speedup(self) -> float | None:
+        if self._baseline_ms and self._best_ms:
+            return self._baseline_ms / self._best_ms
+        return None

From 4c246c9313ea662b330f6f1d697a1c92c10efa83 Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Mon, 9 Mar 2026 14:45:09 +0000
Subject: [PATCH 2/6] rebased on refactoring and moved the agent kernels to a
 new subfolder of examples

---
 .../{aot => agent}/geglu_dynamic_multicore_opt/.gitignore   | 0
 .../{aot => agent}/geglu_dynamic_multicore_opt/README.md    | 0
 .../geglu_dynamic_multicore_opt/_bench_wrapper.py           | 0
 .../geglu_dynamic_multicore_opt/bench_geglu.py              | 0
 .../{aot => agent}/geglu_dynamic_multicore_opt/caller.cpp   | 0
 .../{aot => agent}/geglu_dynamic_multicore_opt/compile.sh   | 0
 .../geglu_dynamic_multicore_opt/geglu_builder.py            | 6 +++---
 .../{aot => agent}/geglu_dynamic_multicore_opt/run_geglu.py | 0
 openenv/agent_search.py                                     | 2 +-
 9 files changed, 4 insertions(+), 4 deletions(-)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/.gitignore (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/README.md (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/_bench_wrapper.py (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/bench_geglu.py (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/caller.cpp (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/compile.sh (100%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/geglu_builder.py (98%)
 rename examples/{aot => agent}/geglu_dynamic_multicore_opt/run_geglu.py (100%)

diff --git a/examples/aot/geglu_dynamic_multicore_opt/.gitignore b/examples/agent/geglu_dynamic_multicore_opt/.gitignore
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/.gitignore
rename to examples/agent/geglu_dynamic_multicore_opt/.gitignore
diff --git a/examples/aot/geglu_dynamic_multicore_opt/README.md b/examples/agent/geglu_dynamic_multicore_opt/README.md
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/README.md
rename to examples/agent/geglu_dynamic_multicore_opt/README.md
diff --git a/examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py b/examples/agent/geglu_dynamic_multicore_opt/_bench_wrapper.py
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/_bench_wrapper.py
rename to examples/agent/geglu_dynamic_multicore_opt/_bench_wrapper.py
diff --git a/examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py b/examples/agent/geglu_dynamic_multicore_opt/bench_geglu.py
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/bench_geglu.py
rename to examples/agent/geglu_dynamic_multicore_opt/bench_geglu.py
diff --git a/examples/aot/geglu_dynamic_multicore_opt/caller.cpp b/examples/agent/geglu_dynamic_multicore_opt/caller.cpp
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/caller.cpp
rename to examples/agent/geglu_dynamic_multicore_opt/caller.cpp
diff --git a/examples/aot/geglu_dynamic_multicore_opt/compile.sh b/examples/agent/geglu_dynamic_multicore_opt/compile.sh
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/compile.sh
rename to examples/agent/geglu_dynamic_multicore_opt/compile.sh
diff --git a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py b/examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py
similarity index 98%
rename from examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py
rename to examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py
index 4dae7890..3a70cc97 100644
--- a/examples/aot/geglu_dynamic_multicore_opt/geglu_builder.py
+++ b/examples/agent/geglu_dynamic_multicore_opt/geglu_builder.py
@@ -1,7 +1,7 @@
-from ptodsl import to_ir_module
-import ptodsl.language as pto
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
 
-const = pto.const
+const = s.const
 
 # 32 KB of UB / sizeof(fp16) = 16384 elements per tile
 ELEMENTS_PER_TILE = 32 * 1024 // 2
diff --git a/examples/aot/geglu_dynamic_multicore_opt/run_geglu.py b/examples/agent/geglu_dynamic_multicore_opt/run_geglu.py
similarity index 100%
rename from examples/aot/geglu_dynamic_multicore_opt/run_geglu.py
rename to examples/agent/geglu_dynamic_multicore_opt/run_geglu.py
diff --git a/openenv/agent_search.py b/openenv/agent_search.py
index e3e2d05e..f2b3403b 100644
--- a/openenv/agent_search.py
+++ b/openenv/agent_search.py
@@ -52,7 +52,7 @@ def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]:
         if not work_dir.is_absolute():
             work_dir = ROOT / work_dir
     else:
-        work_dir = kernel_dir.parent / (kernel_dir.name + "_opt")
+        work_dir = ROOT / "examples" / "agent" / kernel_dir.name
 
     if work_dir.exists():
         shutil.rmtree(work_dir)

From cb4ddee5c4b7a6f9f0cd99d434a6702ad011cd44 Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Mon, 9 Mar 2026 15:36:19 +0000
Subject: [PATCH 3/6] added create mode, see mish and silu toml files

---
 examples/agent/mish/_bench_wrapper.py |  68 ++++++++
 examples/agent/mish/caller.cpp        |  24 +++
 examples/agent/mish/compile.sh        |  22 +++
 examples/agent/mish/mish.cpp          |  96 +++++++++++
 examples/agent/mish/mish.pto          |  60 +++++++
 examples/agent/mish/mish_builder.py   | 174 +++++++++++++++++++
 examples/agent/mish/run_mish.py       | 114 +++++++++++++
 examples/agent/silu/_bench_wrapper.py |  68 ++++++++
 examples/agent/silu/caller.cpp        |  24 +++
 examples/agent/silu/compile.sh        |  22 +++
 examples/agent/silu/run_silu.py       | 113 +++++++++++++
 examples/agent/silu/silu.cpp          |  91 ++++++++++
 examples/agent/silu/silu.pto          |  58 +++++++
 examples/agent/silu/silu_builder.py   | 154 +++++++++++++++++
 openenv/README.md                     |   6 +
 openenv/agent_search.py               | 230 +++++++++++++++++++++-----
 openenv/geglu_config.toml             |   9 +
 openenv/hadamard_config.toml          |   9 +
 openenv/hardswish_config.toml         |  48 ++++++
 openenv/kernel_opt_env.py             |   2 +-
 openenv/mish_config.toml              |  46 ++++++
 openenv/silu_config.toml              |  42 +++++
 22 files changed, 1436 insertions(+), 44 deletions(-)
 create mode 100644 examples/agent/mish/_bench_wrapper.py
 create mode 100644 examples/agent/mish/caller.cpp
 create mode 100644 examples/agent/mish/compile.sh
 create mode 100644 examples/agent/mish/mish.cpp
 create mode 100644 examples/agent/mish/mish.pto
 create mode 100644 examples/agent/mish/mish_builder.py
 create mode 100644 examples/agent/mish/run_mish.py
 create mode 100644 examples/agent/silu/_bench_wrapper.py
 create mode 100644 examples/agent/silu/caller.cpp
 create mode 100644 examples/agent/silu/compile.sh
 create mode 100644 examples/agent/silu/run_silu.py
 create mode 100644 examples/agent/silu/silu.cpp
 create mode 100644 examples/agent/silu/silu.pto
 create mode 100644 examples/agent/silu/silu_builder.py
 create mode 100644 openenv/hardswish_config.toml
 create mode 100644 openenv/mish_config.toml
 create mode 100644 openenv/silu_config.toml

diff --git a/examples/agent/mish/_bench_wrapper.py b/examples/agent/mish/_bench_wrapper.py
new file mode 100644
index 00000000..cb4af27f
--- /dev/null
+++ b/examples/agent/mish/_bench_wrapper.py
@@ -0,0 +1,68 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads mish_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./mish_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x
+    ctypes.c_void_p,  # y (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(x, y):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        torch_to_ctypes(y),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+# Separate tensors per iteration to reduce cache reuse
+xs_ = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)]
+y   = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(xs_[i], y)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs_[WARMUP + i], y)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/mish/caller.cpp b/examples/agent/mish/caller.cpp
new file mode 100644
index 00000000..47fe846c
--- /dev/null
+++ b/examples/agent/mish/caller.cpp
@@ -0,0 +1,24 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "mish.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,
+    uint8_t *y,
+    uint32_t batch,
+    uint32_t n_cols)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    _kernel<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        reinterpret_cast<half *>(y),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n_cols));
+}
diff --git a/examples/agent/mish/compile.sh b/examples/agent/mish/compile.sh
new file mode 100644
index 00000000..233c4952
--- /dev/null
+++ b/examples/agent/mish/compile.sh
@@ -0,0 +1,22 @@
+set -e
+
+rm -f mish.pto mish.cpp mish_lib.so
+
+python ./mish_builder.py > ./mish.pto
+ptoas --enable-insert-sync ./mish.pto -o ./mish.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"mish.cpp\"" \
+    ./caller.cpp \
+    -o ./mish_lib.so
diff --git a/examples/agent/mish/mish.cpp b/examples/agent/mish/mish.cpp
new file mode 100644
index 00000000..35991bc0
--- /dev/null
+++ b/examples/agent/mish/mish.cpp
@@ -0,0 +1,96 @@
+#include "pto/pto-inst.hpp"
+using namespace pto;
+__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) {
+  unsigned v5 = 1;
+  unsigned v6 = 0;
+  int32_t v7 = 16384;
+  int32_t v8 = 1;
+  int32_t v9 = 0;
+  int64_t v10 = 0;
+  int64_t v11 = 32768;
+  int64_t v12 = 65536;
+  int64_t v13 = 98304;
+  using T = float;
+
+  #if defined(__DAV_VEC__)
+  set_mask_norm();
+  set_vector_mask(-1, -1);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  if (v4 > v9) {
+    if (v4 <= v7) {
+      int64_t v14 = get_block_idx();
+      int64_t v15 = get_subblockid();
+      int64_t v16 = get_subblockdim();
+      int64_t v17 = (int64_t) v16;
+      int64_t v18 = get_block_num();
+      int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17);
+      int32_t v20 = v3 / v19;
+      int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20;
+      int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21);
+      int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21);
+      int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22);
+      int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4);
+      if (v24 > v9) {
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v26 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v26, v10);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v27 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v27, v11);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v28 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v28, v12);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v29 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v29, v13);
+        for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) {
+          int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4);
+          unsigned v32 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34);
+          unsigned v36 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38);
+          wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          TLOAD(v26, v35);
+          set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+          TSUB(v28, v26, v26);
+          pipe_barrier(PIPE_V);
+          TEXP(v27, v28);
+          pipe_barrier(PIPE_V);
+          TEXP(v28, v26);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v27, v28);
+          pipe_barrier(PIPE_V);
+          TMUL(v29, v28, v28);
+          pipe_barrier(PIPE_V);
+          TSUB(v28, v29, v27);
+          pipe_barrier(PIPE_V);
+          TADD(v29, v29, v27);
+          pipe_barrier(PIPE_V);
+          TDIV(v28, v28, v29);
+          pipe_barrier(PIPE_V);
+          TMUL(v28, v26, v28);
+          set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          pipe_barrier(PIPE_MTE3);
+          TSTORE(v39, v28);
+          set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        };
+      };
+    };
+  }
+  pipe_barrier(PIPE_ALL);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  #endif // __DAV_VEC__
+
+  return;
+}
+
diff --git a/examples/agent/mish/mish.pto b/examples/agent/mish/mish.pto
new file mode 100644
index 00000000..09e00dd9
--- /dev/null
+++ b/examples/agent/mish/mish.pto
@@ -0,0 +1,60 @@
+module {
+  func.func @_kernel(%arg0: !pto.ptr<f16>, %arg1: !pto.ptr<f16>, %arg2: i32, %arg3: i32) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16384 = arith.constant 16384 : index
+    %0 = arith.index_cast %arg2 : i32 to index
+    %1 = arith.index_cast %arg3 : i32 to index
+    pto.section.vector {
+      %2 = arith.cmpi sgt, %1, %c0 : index
+      scf.if %2 {
+        %3 = arith.cmpi sge, %c16384, %1 : index
+        scf.if %3 {
+          %4 = pto.get_block_idx
+          %5 = pto.get_subblock_idx
+          %6 = pto.get_subblock_num
+          %7 = pto.get_block_num
+          %8 = arith.muli %4, %6 : i64
+          %9 = arith.addi %8, %5 : i64
+          %10 = arith.index_cast %9 : i64 to index
+          %11 = arith.muli %7, %6 : i64
+          %12 = arith.index_cast %11 : i64 to index
+          %13 = arith.ceildivsi %0, %12 : index
+          %14 = arith.muli %10, %13 : index
+          %15 = arith.addi %14, %13 : index
+          %16 = arith.minui %15, %0 : index
+          %17 = arith.subi %16, %14 : index
+          %18 = arith.muli %0, %1 : index
+          %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %21 = arith.cmpi sgt, %17, %c0 : index
+          scf.if %21 {
+            %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            scf.for %arg4 = %c0 to %17 step %c1 {
+              %26 = arith.addi %14, %arg4 : index
+              %27 = arith.muli %26, %1 : index
+              %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%22, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%23, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tmul ins(%24, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%25, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%25, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tdiv ins(%24, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tmul ins(%22, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tstore ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.partition_tensor_view<1x16384xf16>)
+            }
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
diff --git a/examples/agent/mish/mish_builder.py b/examples/agent/mish/mish_builder.py
new file mode 100644
index 00000000..85dd381e
--- /dev/null
+++ b/examples/agent/mish/mish_builder.py
@@ -0,0 +1,174 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+# 32 KB of UB / sizeof(fp16) = 16384 elements per tile
+ELEMENTS_PER_TILE = 32 * 1024 // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_type = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_type": subtensor_type,
+        "tile_type": tile_type,
+    }
+
+
+def build_mish(fn_name="mish_fp16"):
+    """
+    Build a dynamic-batch Mish kernel in PTO DSL.
+
+    Computes y = x * tanh(softplus(x)), where:
+        softplus(x) = ln(1 + exp(x))
+        tanh(softplus(x)) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1)
+
+    This avoids ln() by using the algebraic identity:
+        exp(2*ln(u)) = u^2
+
+    Constants (1.0) are derived from the input tile itself using
+    the identity exp(x - x) = exp(0) = 1.0, which avoids the need for
+    scalar-tile broadcast operations not available in PTO DSL.
+
+    UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB):
+        tb_x    : input row x
+        tb_ones : constant 1.0 (recomputed each row via exp(x-x))
+        tb_tmp1 : intermediate / final output
+        tb_tmp2 : intermediate
+
+    Kernel args:
+        x_ptr   : fp16[batch * n_cols]  -- input
+        y_ptr   : fp16[batch * n_cols]  -- output
+        batch   : int32                 -- number of rows
+        n_cols  : int32                 -- elements per row; must be <= 16384
+    """
+
+    @to_ir_module(meta_data=meta_data)
+    def _kernel(
+        x_ptr: "ptr_type",
+        y_ptr: "ptr_type",
+        batch_i32: "index_dtype",
+        n_cols_i32: "index_dtype",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c_tile = const(ELEMENTS_PER_TILE)
+
+        batch = s.index_cast(batch_i32)
+        n_cols = s.index_cast(n_cols_i32)
+
+        with pto.vector_section():
+            # Guard: n_cols must be in (0, ELEMENTS_PER_TILE].
+            with pto.if_context(n_cols > c0):
+                with pto.if_context(c_tile >= n_cols):
+                    cid = pto.get_block_idx()
+                    sub_bid = pto.get_subblock_idx()
+                    sub_bnum = pto.get_subblock_num()
+                    num_blocks = pto.get_block_num()
+
+                    vid = s.index_cast(cid * sub_bnum + sub_bid)
+                    num_cores = s.index_cast(num_blocks * sub_bnum)
+
+                    # Distribute rows across cores (row-level parallelism).
+                    rows_per_core = s.ceil_div(batch, num_cores)
+                    row_start = vid * rows_per_core
+                    row_end = s.min_u(row_start + rows_per_core, batch)
+                    num_rows = row_end - row_start
+
+                    total_elems = batch * n_cols
+                    tv_x = pto.as_tensor(
+                        tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_y = pto.as_tensor(
+                        tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1]
+                    )
+
+                    with pto.if_context(num_rows > c0):
+                        # Allocate 4 UB tiles (128 KB total, well under 192 KB UB).
+                        tb_x = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols)
+
+                        for row_i in pto.range(c0, num_rows, c1):
+                            gm_offset = (row_start + row_i) * n_cols
+
+                            sv_x = pto.slice_view(
+                                subtensor_type,
+                                source=tv_x,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_y = pto.slice_view(
+                                subtensor_type,
+                                source=tv_y,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+
+                            pto.load(sv_x, tb_x)
+
+                            # Derive constant 1.0 from data:
+                            #   x - x = 0  =>  exp(0) = 1.0
+                            tile.sub(tb_x, tb_x, tb_tmp1)    # tmp1 = 0.0
+                            tile.exp(tb_tmp1, tb_ones)        # ones = 1.0
+
+                            # Compute mish(x) = x * tanh(softplus(x))
+                            # Using identity: tanh(ln(1+exp(x))) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1)
+
+                            # Step 1: exp(x)
+                            tile.exp(tb_x, tb_tmp1)           # tmp1 = exp(x)
+
+                            # Step 2: u = 1 + exp(x)
+                            tile.add(tb_ones, tb_tmp1, tb_tmp1)  # tmp1 = 1 + exp(x)
+
+                            # Step 3: u^2 = (1 + exp(x))^2
+                            tile.mul(tb_tmp1, tb_tmp1, tb_tmp2)  # tmp2 = (1+exp(x))^2
+
+                            # Step 4: numerator = u^2 - 1
+                            tile.sub(tb_tmp2, tb_ones, tb_tmp1)  # tmp1 = (1+exp(x))^2 - 1
+
+                            # Step 5: denominator = u^2 + 1
+                            tile.add(tb_tmp2, tb_ones, tb_tmp2)  # tmp2 = (1+exp(x))^2 + 1
+
+                            # Step 6: tanh(softplus(x)) = num / den
+                            tile.div(tb_tmp1, tb_tmp2, tb_tmp1)  # tmp1 = tanh(softplus(x))
+
+                            # Step 7: mish(x) = x * tanh(softplus(x))
+                            tile.mul(tb_x, tb_tmp1, tb_tmp1)     # tmp1 = mish(x)
+
+                            pto.store(tb_tmp1, sv_y)
+
+    _ = fn_name
+    return _kernel
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fn-name",
+        default="mish_fp16",
+        help="Generated kernel function name.",
+    )
+    args = parser.parse_args()
+    print(build_mish(fn_name=args.fn_name))
diff --git a/examples/agent/mish/run_mish.py b/examples/agent/mish/run_mish.py
new file mode 100644
index 00000000..d4ca82ad
--- /dev/null
+++ b/examples/agent/mish/run_mish.py
@@ -0,0 +1,114 @@
+import argparse
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x
+        ctypes.c_void_p,  # y (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def mish_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            torch_to_ctypes(y),
+            batch,
+            n_cols,
+        )
+
+    return mish_func
+
+
+def mish_ref(x):
+    """Reference Mish matching the PTO kernel.
+
+    Computes y = x * tanh(softplus(x)), where:
+        softplus(x) = ln(1 + exp(x))
+        tanh(softplus(x)) = ((1+exp(x))^2 - 1) / ((1+exp(x))^2 + 1)
+
+    Uses torch.nn.functional.mish on float32 for reference.
+    """
+    return torch.nn.functional.mish(x.float()).to(x.dtype)
+
+
+def test_mish(lib_path, block_dim=24):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    mish = load_lib(lib_path=lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 4, 22, 65]
+    n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n_cols in n_cols_list:
+            # Use small range to stay within fp16 exp range (avoid overflow).
+            x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4)
+            y = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+            y_ref = mish_ref(x)
+            mish(x, y, batch, n_cols)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n_cols, status, detail))
+
+    print(f"\ndetailed summary for {lib_path}:")
+    for batch, n_cols, status, detail in results:
+        msg = f"  batch={batch}, n_cols={n_cols}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        default="./mish_lib.so",
+        help="Path to the shared library generated by compile.sh.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+    test_mish(args.lib, block_dim=args.block_dim)
diff --git a/examples/agent/silu/_bench_wrapper.py b/examples/agent/silu/_bench_wrapper.py
new file mode 100644
index 00000000..a81933b1
--- /dev/null
+++ b/examples/agent/silu/_bench_wrapper.py
@@ -0,0 +1,68 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads silu_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./silu_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x
+    ctypes.c_void_p,  # y (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(x, y):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        torch_to_ctypes(y),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+# Separate tensors per iteration to reduce cache reuse
+xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)]
+y  = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(xs[i], y)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i], y)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/silu/caller.cpp b/examples/agent/silu/caller.cpp
new file mode 100644
index 00000000..3f3d74aa
--- /dev/null
+++ b/examples/agent/silu/caller.cpp
@@ -0,0 +1,24 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "silu.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,
+    uint8_t *y,
+    uint32_t batch,
+    uint32_t n_cols)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    _kernel<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        reinterpret_cast<half *>(y),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n_cols));
+}
diff --git a/examples/agent/silu/compile.sh b/examples/agent/silu/compile.sh
new file mode 100644
index 00000000..0ddb0e7d
--- /dev/null
+++ b/examples/agent/silu/compile.sh
@@ -0,0 +1,22 @@
+set -e
+
+rm -f silu.pto silu.cpp silu_lib.so
+
+python ./silu_builder.py > ./silu.pto
+ptoas --enable-insert-sync ./silu.pto -o ./silu.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"silu.cpp\"" \
+    ./caller.cpp \
+    -o ./silu_lib.so
diff --git a/examples/agent/silu/run_silu.py b/examples/agent/silu/run_silu.py
new file mode 100644
index 00000000..f71fec06
--- /dev/null
+++ b/examples/agent/silu/run_silu.py
@@ -0,0 +1,113 @@
+import argparse
+import ctypes
+
+import torch
+import torch.nn.functional as F
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x
+        ctypes.c_void_p,  # y (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def silu_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            torch_to_ctypes(y),
+            batch,
+            n_cols,
+        )
+
+    return silu_func
+
+
+def silu_ref(x):
+    """Reference SiLU matching the PTO kernel.
+
+    Computes y = x * sigmoid(x)
+
+    Uses fp32 intermediate for reference accuracy, then casts back to fp16.
+    """
+    return F.silu(x.float()).to(x.dtype)
+
+
+def test_silu(lib_path, block_dim=24):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    silu = load_lib(lib_path=lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 4, 22, 65]
+    n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n_cols in n_cols_list:
+            # Use small range to stay within fp16 exp range (avoid overflow).
+            x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4)
+            y = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+            y_ref = silu_ref(x)
+            silu(x, y, batch, n_cols)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n_cols, status, detail))
+
+    print(f"\ndetailed summary for {lib_path}:")
+    for batch, n_cols, status, detail in results:
+        msg = f"  batch={batch}, n_cols={n_cols}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        default="./silu_lib.so",
+        help="Path to the shared library generated by compile.sh.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+    test_silu(args.lib, block_dim=args.block_dim)
diff --git a/examples/agent/silu/silu.cpp b/examples/agent/silu/silu.cpp
new file mode 100644
index 00000000..703395e4
--- /dev/null
+++ b/examples/agent/silu/silu.cpp
@@ -0,0 +1,91 @@
+#include "pto/pto-inst.hpp"
+using namespace pto;
+__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) {
+  unsigned v5 = 1;
+  unsigned v6 = 0;
+  int32_t v7 = 16384;
+  int32_t v8 = 1;
+  int32_t v9 = 0;
+  int64_t v10 = 0;
+  int64_t v11 = 32768;
+  int64_t v12 = 65536;
+  int64_t v13 = 98304;
+  using T = float;
+
+  #if defined(__DAV_VEC__)
+  set_mask_norm();
+  set_vector_mask(-1, -1);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  if (v4 > v9) {
+    if (v4 <= v7) {
+      int64_t v14 = get_block_idx();
+      int64_t v15 = get_subblockid();
+      int64_t v16 = get_subblockdim();
+      int64_t v17 = (int64_t) v16;
+      int64_t v18 = get_block_num();
+      int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17);
+      int32_t v20 = v3 / v19;
+      int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20;
+      int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21);
+      int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21);
+      int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22);
+      int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4);
+      if (v24 > v9) {
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v26 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v26, v10);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v27 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v27, v11);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v28 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v28, v12);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v29 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v29, v13);
+        for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) {
+          int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4);
+          unsigned v32 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34);
+          unsigned v36 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38);
+          wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          TLOAD(v26, v35);
+          set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          TSUB(v29, v26, v26);
+          pipe_barrier(PIPE_V);
+          TEXP(v27, v29);
+          wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+          TSUB(v28, v29, v26);
+          pipe_barrier(PIPE_V);
+          TEXP(v28, v28);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v28, v27);
+          pipe_barrier(PIPE_V);
+          TDIV(v28, v27, v28);
+          pipe_barrier(PIPE_V);
+          TMUL(v28, v26, v28);
+          set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          pipe_barrier(PIPE_MTE3);
+          TSTORE(v39, v28);
+          set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        };
+      };
+    };
+  }
+  pipe_barrier(PIPE_ALL);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  #endif // __DAV_VEC__
+
+  return;
+}
+
diff --git a/examples/agent/silu/silu.pto b/examples/agent/silu/silu.pto
new file mode 100644
index 00000000..2dfd1cbf
--- /dev/null
+++ b/examples/agent/silu/silu.pto
@@ -0,0 +1,58 @@
+module {
+  func.func @_kernel(%arg0: !pto.ptr<f16>, %arg1: !pto.ptr<f16>, %arg2: i32, %arg3: i32) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16384 = arith.constant 16384 : index
+    %0 = arith.index_cast %arg2 : i32 to index
+    %1 = arith.index_cast %arg3 : i32 to index
+    pto.section.vector {
+      %2 = arith.cmpi sgt, %1, %c0 : index
+      scf.if %2 {
+        %3 = arith.cmpi sge, %c16384, %1 : index
+        scf.if %3 {
+          %4 = pto.get_block_idx
+          %5 = pto.get_subblock_idx
+          %6 = pto.get_subblock_num
+          %7 = pto.get_block_num
+          %8 = arith.muli %4, %6 : i64
+          %9 = arith.addi %8, %5 : i64
+          %10 = arith.index_cast %9 : i64 to index
+          %11 = arith.muli %7, %6 : i64
+          %12 = arith.index_cast %11 : i64 to index
+          %13 = arith.ceildivsi %0, %12 : index
+          %14 = arith.muli %10, %13 : index
+          %15 = arith.addi %14, %13 : index
+          %16 = arith.minui %15, %0 : index
+          %17 = arith.subi %16, %14 : index
+          %18 = arith.muli %0, %1 : index
+          %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %21 = arith.cmpi sgt, %17, %c0 : index
+          scf.if %21 {
+            %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            scf.for %arg4 = %c0 to %17 step %c1 {
+              %26 = arith.addi %14, %arg4 : index
+              %27 = arith.muli %26, %1 : index
+              %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%22, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%25, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%24, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tdiv ins(%23, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tmul ins(%22, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tstore ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.partition_tensor_view<1x16384xf16>)
+            }
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
diff --git a/examples/agent/silu/silu_builder.py b/examples/agent/silu/silu_builder.py
new file mode 100644
index 00000000..8613588c
--- /dev/null
+++ b/examples/agent/silu/silu_builder.py
@@ -0,0 +1,154 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+# 32 KB of UB / sizeof(fp16) = 16384 elements per tile
+ELEMENTS_PER_TILE = 32 * 1024 // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_type = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_type": subtensor_type,
+        "tile_type": tile_type,
+    }
+
+
+def build_silu(fn_name="silu_fp16"):
+    """
+    Build a dynamic-batch SiLU (Swish) kernel in PTO DSL.
+
+    Computes y = x * sigmoid(x), where:
+        sigmoid(x) = 1 / (1 + exp(-x))
+
+    Constants (1.0) are derived from the input tile itself using
+    the identity exp(x - x) = exp(0) = 1.0, which avoids the need for
+    scalar-tile broadcast operations not available in PTO DSL.
+
+    UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB):
+        tb_x    : input row x
+        tb_ones : constant 1.0 (recomputed each row via exp(x-x))
+        tb_tmp1 : intermediate / final output
+        tb_tmp2 : intermediate (for zero)
+
+    Kernel args:
+        x_ptr   : fp16[batch * n_cols]  -- input
+        y_ptr   : fp16[batch * n_cols]  -- output
+        batch   : int32                 -- number of rows
+        n_cols  : int32                 -- elements per row; must be <= 16384
+    """
+
+    @to_ir_module(meta_data=meta_data)
+    def _kernel(
+        x_ptr: "ptr_type",
+        y_ptr: "ptr_type",
+        batch_i32: "index_dtype",
+        n_cols_i32: "index_dtype",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c_tile = const(ELEMENTS_PER_TILE)
+
+        batch = s.index_cast(batch_i32)
+        n_cols = s.index_cast(n_cols_i32)
+
+        with pto.vector_section():
+            # Guard: n_cols must be in (0, ELEMENTS_PER_TILE].
+            with pto.if_context(n_cols > c0):
+                with pto.if_context(c_tile >= n_cols):
+                    cid = pto.get_block_idx()
+                    sub_bid = pto.get_subblock_idx()
+                    sub_bnum = pto.get_subblock_num()
+                    num_blocks = pto.get_block_num()
+
+                    vid = s.index_cast(cid * sub_bnum + sub_bid)
+                    num_cores = s.index_cast(num_blocks * sub_bnum)
+
+                    # Distribute rows across cores (row-level parallelism).
+                    rows_per_core = s.ceil_div(batch, num_cores)
+                    row_start = vid * rows_per_core
+                    row_end = s.min_u(row_start + rows_per_core, batch)
+                    num_rows = row_end - row_start
+
+                    total_elems = batch * n_cols
+                    tv_x = pto.as_tensor(
+                        tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_y = pto.as_tensor(
+                        tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1]
+                    )
+
+                    with pto.if_context(num_rows > c0):
+                        # Allocate 4 UB tiles (128 KB total, well under 192 KB UB).
+                        tb_x = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols)
+
+                        for row_i in pto.range(c0, num_rows, c1):
+                            gm_offset = (row_start + row_i) * n_cols
+
+                            sv_x = pto.slice_view(
+                                subtensor_type,
+                                source=tv_x,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_y = pto.slice_view(
+                                subtensor_type,
+                                source=tv_y,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+
+                            pto.load(sv_x, tb_x)
+
+                            # Derive constants from data:
+                            #   x - x = 0  =>  exp(0) = 1.0
+                            tile.sub(tb_x, tb_x, tb_tmp2)    # tmp2 = 0.0
+                            tile.exp(tb_tmp2, tb_ones)        # ones = 1.0
+
+                            # sigmoid(x) = 1 / (1 + exp(-x))
+                            tile.sub(tb_tmp2, tb_x, tb_tmp1)  # tmp1 = 0 - x = -x
+                            tile.exp(tb_tmp1, tb_tmp1)         # tmp1 = exp(-x)
+                            tile.add(tb_tmp1, tb_ones, tb_tmp1) # tmp1 = 1 + exp(-x)
+                            tile.div(tb_ones, tb_tmp1, tb_tmp1) # tmp1 = sigmoid(x)
+
+                            # silu(x) = x * sigmoid(x)
+                            tile.mul(tb_x, tb_tmp1, tb_tmp1)  # tmp1 = x * sigmoid(x)
+                            pto.store(tb_tmp1, sv_y)
+
+    _ = fn_name
+    return _kernel
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fn-name",
+        default="silu_fp16",
+        help="Generated kernel function name.",
+    )
+    args = parser.parse_args()
+    print(build_silu(fn_name=args.fn_name))
diff --git a/openenv/README.md b/openenv/README.md
index b684a9bf..6f38c2fe 100644
--- a/openenv/README.md
+++ b/openenv/README.md
@@ -2,6 +2,12 @@
 
 Uses Claude Opus with OpenEnv to iteratively edit, build, test, and benchmark PTO-DSL kernels until a faster-than-baseline implementation is found.
 
+## Installation
+
+```bash
+pip install openenv-core   
+```
+
 ## Quick start
 
 ```bash
diff --git a/openenv/agent_search.py b/openenv/agent_search.py
index f2b3403b..9552db2e 100644
--- a/openenv/agent_search.py
+++ b/openenv/agent_search.py
@@ -41,41 +41,54 @@ def load_config(config_path: str) -> dict:
         return tomllib.load(f)
 
 
-def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path]:
-    kernel_dir = Path(cfg["kernel_dir"])
-    if not kernel_dir.is_absolute():
-        kernel_dir = ROOT / kernel_dir
+def build_env(cfg: dict) -> tuple["KernelSearchEnv", Path, Path]:
+    """Returns (env, work_dir, ref_dir).
 
-    # Resolve working directory (copy of originals — never touch kernel_dir itself)
+    ref_dir is kernel_dir (the reference/skeleton).  For create mode work_dir
+    starts empty; for optimize mode it is a full copy of ref_dir.
+    """
+    ref_dir = Path(cfg["kernel_dir"])
+    if not ref_dir.is_absolute():
+        ref_dir = ROOT / ref_dir
+
+    # Resolve working directory
     if "work_dir" in cfg:
         work_dir = Path(cfg["work_dir"])
         if not work_dir.is_absolute():
             work_dir = ROOT / work_dir
     else:
-        work_dir = ROOT / "examples" / "agent" / kernel_dir.name
+        default_name = cfg.get("kernel_name", ref_dir.name)
+        work_dir = ROOT / "examples" / "agent" / default_name
 
     if work_dir.exists():
         shutil.rmtree(work_dir)
-    shutil.copytree(kernel_dir, work_dir)
-    print(f"Working directory: {work_dir}  (originals in {kernel_dir} are untouched)")
+
+    if cfg.get("mode") == "create":
+        # Start with an empty directory — agent creates all files from scratch
+        work_dir.mkdir(parents=True)
+        print(f"Working directory: {work_dir}  (empty — agent will create all files)")
+    else:
+        shutil.copytree(ref_dir, work_dir)
+        print(f"Working directory: {work_dir}  (originals in {ref_dir} are untouched)")
 
     baseline_files = {
-        name: (kernel_dir / name).read_text(encoding="utf-8")
-        for name in cfg["baseline_files"]
+        name: (ref_dir / name).read_text(encoding="utf-8")
+        for name in cfg.get("baseline_files", [])
     }
 
     def resolve_cmd(cmd: list[str]) -> list[str]:
-        # Replace bare "python" with the current interpreter
         return [sys.executable if c == "python" else c for c in cmd]
 
+    bench_cmd = resolve_cmd(cfg["bench_cmd"]) if "bench_cmd" in cfg else None
+
     env = KernelSearchEnv(
         repo_path=str(work_dir),
         build_cmd=resolve_cmd(cfg["build_cmd"]),
         test_cmd=resolve_cmd(cfg["test_cmd"]),
-        bench_cmd=resolve_cmd(cfg["bench_cmd"]),
+        bench_cmd=bench_cmd,
         baseline_files=baseline_files,
     )
-    return env, work_dir
+    return env, work_dir, ref_dir
 
 
 # ---------------------------------------------------------------------------
@@ -94,8 +107,9 @@ def resolve_cmd(cmd: list[str]) -> list[str]:
 
 CFG = load_config(_args.config)
 EXAMPLE_DIR: Path
+REF_DIR: Path
 env: KernelSearchEnv
-env, EXAMPLE_DIR = build_env(CFG)
+env, EXAMPLE_DIR, REF_DIR = build_env(CFG)
 
 # ---------------------------------------------------------------------------
 # Tool definitions
@@ -150,6 +164,21 @@ def resolve_cmd(cmd: list[str]) -> list[str]:
         "description": "Measure the kernel latency and compute speedup vs the baseline. Returns latency_ms and speedup_vs_baseline.",
         "input_schema": {"type": "object", "properties": {}, "required": []},
     },
+    {
+        "name": "write_file",
+        "description": (
+            "Write the complete content of a file, creating it if it doesn't exist. "
+            "Use this to write a new kernel file from scratch."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "path":    {"type": "string", "description": "Relative file path"},
+                "content": {"type": "string", "description": "Full file content"},
+            },
+            "required": ["path", "content"],
+        },
+    },
 ]
 
 # ---------------------------------------------------------------------------
@@ -193,6 +222,16 @@ def execute_tool(name: str, tool_input: dict) -> str:
             f"Best so far: {env.best_ms:.4f} ms ({env.best_speedup:.4f}x)"
         )
 
+    if name == "write_file":
+        for key in ("path", "content"):
+            if key not in tool_input:
+                return f"Error: write_file requires '{key}' parameter."
+        obs = env.step(KernelAction("edit", {
+            "path":    tool_input["path"],
+            "content": tool_input["content"],
+        }))
+        return obs.summary
+
     return f"Unknown tool: {name}"
 
 # ---------------------------------------------------------------------------
@@ -247,6 +286,18 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
     if preload_section:
         preload_section = "\n## Pre-loaded files (do NOT re-read these)\n" + preload_section
 
+    # Embed local reference files (repo-relative paths)
+    local_section = ""
+    for fpath_str in cfg.get("local_context_files", []):
+        fpath = ROOT / fpath_str
+        try:
+            content = fpath.read_text(encoding="utf-8")
+            local_section += f"\n### {fpath.name}\n```python\n{content}\n```\n"
+        except FileNotFoundError:
+            local_section += f"\n### {fpath_str}\n(file not found)\n"
+    if local_section:
+        local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section
+
     # Fetch remote reference docs and embed them
     url_section = ""
     for entry in cfg.get("context_urls", []):
@@ -267,7 +318,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
 
         The kernel directory contains exactly these files: {file_listing}
         Do NOT attempt to read any other filenames — they do not exist.
-        {preload_section}{url_section}
+        {preload_section}{local_section}{url_section}
         ## Your workflow
         1. Propose one targeted change based on the pre-loaded files above
         2. edit_file(...)
@@ -288,35 +339,116 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
         - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked.
     """)
 
-SYSTEM = _build_system_prompt(CFG, EXAMPLE_DIR)
+def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str:
+    name        = cfg.get("kernel_name", "kernel")
+    create_file = cfg.get("create_file", "builder.py")
+    description = cfg.get("create_prompt", "").strip()
+
+    files = sorted(f.name for f in work_dir.iterdir() if f.is_file())
+    file_listing = ", ".join(files) if files else "(empty — you must create all files)"
+
+    # Preload_files come from ref_dir (the reference skeleton), not the empty work_dir
+    preload_section = ""
+    for fname in cfg.get("preload_files", []):
+        fpath = ref_dir / fname
+        try:
+            content = fpath.read_text(encoding="utf-8")
+            preload_section += f"\n### {fname} (from reference kernel — adapt for {name})\n```\n{content}\n```\n"
+        except FileNotFoundError:
+            preload_section += f"\n### {fname}\n(file not found in reference)\n"
+    if preload_section:
+        preload_section = "\n## Reference files (adapt these for the new kernel — do NOT copy verbatim)\n" + preload_section
+
+    # Embed local reference files (repo-relative paths)
+    local_section = ""
+    for fpath_str in cfg.get("local_context_files", []):
+        fpath = ROOT / fpath_str
+        try:
+            content = fpath.read_text(encoding="utf-8")
+            local_section += f"\n### {fpath.name}\n```python\n{content}\n```\n"
+        except FileNotFoundError:
+            local_section += f"\n### {fpath_str}\n(file not found)\n"
+    if local_section:
+        local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section
+
+    url_section = ""
+    for entry in cfg.get("context_urls", []):
+        if isinstance(entry, dict):
+            url, label = entry["url"], entry.get("label", entry["url"])
+        else:
+            url, label = entry, entry
+        print(f"  Fetching context: {label} …")
+        content = _fetch_url(url)
+        url_section += f"\n### {label}\n```\n{content}\n```\n"
+    if url_section:
+        url_section = "\n## Reference documentation\n" + url_section
+
+    return _textwrap.dedent(f"""\
+        You are an expert NPU kernel engineer specialising in Ascend/PTO-DSL.
+
+        Your task is to implement the {name} kernel entirely from scratch.
+        The working directory starts with: {file_listing}
+        You must create ALL required files yourself using write_file.
+
+        {preload_section}{local_section}{url_section}
+        ## Kernel to implement
+        {description}
+
+        ## Files you must create
+        - `{create_file}`  — PTO-DSL kernel builder (the main implementation)
+        - `compile.sh`     — build script (adapt from reference above; use {name}-specific names)
+        - `caller.cpp`     — C++ entry point (adapt signature to match the {name} kernel exactly)
+        - `run_{name}.py`  — correctness test script (adapt from reference; use `{name}_lib.so`)
+
+        ## Your workflow
+        1. Write all required files using write_file(path="...", content="...")
+        2. build()  — check for compile errors; fix with write_file/edit_file as needed
+        3. run_tests()  — verify correctness; fix failures and rebuild
+        4. Once tests pass, call benchmark() to measure latency
+
+        ## Rules
+        - Study the reference files above carefully to understand the PTO-DSL API and build system.
+        - The compile.sh and caller.cpp you create must be consistent with each other and with `{create_file}`.
+        - If build or tests fail, fix the issue — do not give up.
+        - Stop once tests pass and you have a benchmark result.
+    """)
+
+
+SYSTEM = (
+    _build_create_system_prompt(CFG, EXAMPLE_DIR, REF_DIR)
+    if CFG.get("mode") == "create"
+    else _build_system_prompt(CFG, EXAMPLE_DIR)
+)
 
 # ---------------------------------------------------------------------------
 # Agentic loop
 # ---------------------------------------------------------------------------
 def run_agent(max_turns: int = 30) -> None:
-    # Establish baseline
-    print("Establishing baseline …")
-    obs = env.reset()
-    print(f"Baseline: {obs.latency_ms:.4f} ms\n")
-
+    mode = CFG.get("mode", "optimize")
     client = anthropic.Anthropic()
-    preloaded = CFG.get("preload_files", [])
-    if preloaded:
-        start_hint = f"The source files are already in your context above — start proposing a change directly."
+
+    if mode == "create":
+        name = CFG.get("kernel_name", "kernel")
+        create_file = CFG.get("create_file", "builder.py")
+        print(f"Create mode — implementing {name} kernel from scratch …\n")
+        messages: list[dict] = [{"role": "user", "content":
+            f"Please implement the {name} kernel from scratch. "
+            f"Create all required files (compile.sh, caller.cpp, {create_file}, run_{name}.py) "
+            "using write_file. The reference files and PTO-DSL docs are in your system context above."}]
     else:
-        main_file = CFG.get("main_file", "builder.py")
-        start_hint = f"Start by reading {main_file}."
-
-    messages: list[dict] = [
-        {
-            "role": "user",
-            "content": (
-                f"The baseline kernel latency is {obs.latency_ms:.4f} ms. "
-                f"Please find a faster implementation. {start_hint}"
-            ),
-        }
-    ]
+        print("Establishing baseline …")
+        obs = env.reset()
+        print(f"Baseline: {obs.latency_ms:.4f} ms\n")
+        preloaded = CFG.get("preload_files", [])
+        start_hint = (
+            "The source files are already in your context above — start proposing a change directly."
+            if preloaded else f"Start by reading {CFG.get('main_file', 'builder.py')}."
+        )
+        messages = [{"role": "user", "content":
+            f"The baseline kernel latency is {obs.latency_ms:.4f} ms. "
+            f"Please find a faster implementation. {start_hint}"}]
 
+    tests_passed = False
     turn = 0
     while turn < max_turns:
         turn += 1
@@ -376,22 +508,34 @@ def run_agent(max_turns: int = 30) -> None:
                 "tool_use_id": block.id,
                 "content": result,
             })
+            # Track test results for create-mode exit condition
+            if block.name == "run_tests" and result.startswith("PASS"):
+                tests_passed = True
 
         if tool_results:
             messages.append({"role": "user", "content": tool_results})
 
-        # Early exit if we already have a confirmed speedup
-        if env.best_speedup is not None and env.best_speedup > 1.0:
+        if mode == "create" and tests_passed:
+            print("\n[Agent] Tests passed — kernel implemented successfully.")
+            break
+
+        if mode == "optimize" and env.best_speedup is not None and env.best_speedup > 1.0:
             print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.")
             break
 
     # Final summary
     print(f"\n{'='*60}")
-    print("SEARCH COMPLETE")
-    print(f"{'='*60}")
-    print(f"  Baseline:     {env.baseline_ms:.4f} ms")
-    print(f"  Best found:   {env.best_ms:.4f} ms")
-    print(f"  Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else "  No improvement found.")
+    if mode == "create":
+        print("CREATE COMPLETE")
+        print(f"{'='*60}")
+        print(f"  Result: {'tests passed' if tests_passed else 'did not reach passing tests'}")
+        print(f"  Output: {EXAMPLE_DIR}")
+    else:
+        print("SEARCH COMPLETE")
+        print(f"{'='*60}")
+        print(f"  Baseline:     {env.baseline_ms:.4f} ms")
+        print(f"  Best found:   {env.best_ms:.4f} ms")
+        print(f"  Best speedup: {env.best_speedup:.4f}x" if env.best_speedup else "  No improvement found.")
 
 
 def _fmt_input(inp: dict) -> str:
diff --git a/openenv/geglu_config.toml b/openenv/geglu_config.toml
index 125ce6cb..686c6db6 100644
--- a/openenv/geglu_config.toml
+++ b/openenv/geglu_config.toml
@@ -28,6 +28,15 @@ readonly_files = ["run_geglu.py", "caller.cpp"]
 
 preload_files = ["geglu_builder.py", "compile.sh", "caller.cpp"]
 
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
 [[context_urls]]
 url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td"
 label = "PTO MLIR op definitions (PTOOps.td)"
diff --git a/openenv/hadamard_config.toml b/openenv/hadamard_config.toml
index d950dd3f..8675e91a 100644
--- a/openenv/hadamard_config.toml
+++ b/openenv/hadamard_config.toml
@@ -45,6 +45,15 @@ preload_files = ["hadamard_builder.py", "compile.sh", "caller.cpp"]
 # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/docs/isa
 # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/tree/master/include/pto/npu/a2a3
 # Reference only (not fetchable): https://gitcode.com/cann/pto-isa/blob/master/tests/npu/a2a3/src/st/testcase/
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
 [[context_urls]]
 url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/include/PTO/IR/PTOOps.td"
 label = "PTO MLIR op definitions (PTOOps.td)"
diff --git a/openenv/hardswish_config.toml b/openenv/hardswish_config.toml
new file mode 100644
index 00000000..c99dfeb6
--- /dev/null
+++ b/openenv/hardswish_config.toml
@@ -0,0 +1,48 @@
+mode = "create"
+
+kernel_dir  = "examples/aot/geglu_dynamic_multicore"  # reference for API patterns only
+kernel_name = "hardswish"
+create_file = "hardswish_builder.py"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_hardswish.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+create_prompt = """
+Implement Hard-Swish activation (used in MobileNetV3):
+    y = x * relu6(x + 3) / 6
+      = x * clamp(x + 3, 0, 6) / 6
+
+PyTorch reference: torch.nn.functional.hardswish(x)
+
+Input:  fp16 tensor (batch, n_cols)
+Output: fp16 tensor (batch, n_cols) — separate output pointer
+
+The kernel signature in caller.cpp should be:
+    extern "C" void call_kernel(uint32_t blockDim, void *stream,
+                                uint8_t *x, uint8_t *y,
+                                uint32_t batch, uint32_t n_cols)
+
+The .so produced by compile.sh should be named hardswish_lib.so.
+The test script run_hardswish.py should compare against torch.nn.functional.hardswish(x.float()).to(x.dtype).
+The bench wrapper _bench_wrapper.py should print latency_ms=<value>.
+
+Implementation hint: clamp(x + 3, 0, 6) can be computed as min(max(x + 3, 0), 6).
+The division by 6 can be done as multiplication by 1/6 ≈ 0.16667.
+"""
+
+# These are loaded from the reference kernel_dir as examples — adapt them for hardswish
+preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"]
+
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings"
diff --git a/openenv/kernel_opt_env.py b/openenv/kernel_opt_env.py
index 503b99ac..a21a87fe 100644
--- a/openenv/kernel_opt_env.py
+++ b/openenv/kernel_opt_env.py
@@ -38,7 +38,7 @@ def __init__(
         self,
         repo_path: str,
         test_cmd: list[str],
-        bench_cmd: list[str],
+        bench_cmd: list[str] | None = None,
         build_cmd: list[str] | None = None,
         baseline_files: dict[str, str] | None = None,
     ):
diff --git a/openenv/mish_config.toml b/openenv/mish_config.toml
new file mode 100644
index 00000000..2f5fe8e6
--- /dev/null
+++ b/openenv/mish_config.toml
@@ -0,0 +1,46 @@
+mode = "create"
+
+kernel_dir  = "examples/aot/geglu_dynamic_multicore"  # reference for API patterns only
+kernel_name = "mish"
+create_file = "mish_builder.py"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_mish.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+create_prompt = """
+Implement Mish activation: y = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+PyTorch reference: torch.nn.functional.mish(x)
+
+Input:  fp16 tensor (batch, n_cols)
+Output: fp16 tensor (batch, n_cols) — separate output pointer
+
+The kernel signature in caller.cpp should be:
+    extern "C" void call_kernel(uint32_t blockDim, void *stream,
+                                uint8_t *x, uint8_t *y,
+                                uint32_t batch, uint32_t n_cols)
+
+The .so produced by compile.sh should be named mish_lib.so.
+The test script run_mish.py should compare against torch.nn.functional.mish(x.float()).to(x.dtype).
+The bench wrapper _bench_wrapper.py should print latency_ms=<value>.
+
+Implementation hint: compute softplus(x) = ln(1 + exp(x)) first, then tanh of that, then multiply by x.
+For numerical stability at large x, softplus(x) ≈ x (exp overflows), so clamp or use the identity
+softplus(x) = x + ln(1 + exp(-x)) for large positive x.
+"""
+
+# These are loaded from the reference kernel_dir as examples — adapt them for mish
+preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"]
+
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings"
diff --git a/openenv/silu_config.toml b/openenv/silu_config.toml
new file mode 100644
index 00000000..ddd69841
--- /dev/null
+++ b/openenv/silu_config.toml
@@ -0,0 +1,42 @@
+mode = "create"
+
+kernel_dir  = "examples/aot/geglu_dynamic_multicore"  # reference for API patterns only
+kernel_name = "silu"
+create_file = "silu_builder.py"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_silu.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+create_prompt = """
+Implement SiLU (Swish) activation: y = x * sigmoid(x)
+PyTorch reference: torch.nn.functional.silu(x)
+
+Input:  fp16 tensor (batch, n_cols)
+Output: fp16 tensor (batch, n_cols) — separate output pointer
+
+The kernel signature in caller.cpp should be:
+    extern "C" void call_kernel(uint32_t blockDim, void *stream,
+                                uint8_t *x, uint8_t *y,
+                                uint32_t batch, uint32_t n_cols)
+
+The .so produced by compile.sh should be named silu_lib.so.
+The test script run_silu.py should compare against torch.nn.functional.silu(x.float()).to(x.dtype).
+The bench wrapper _bench_wrapper.py should print latency_ms=<value>.
+"""
+
+# These are loaded from the reference kernel_dir as examples — adapt them for silu
+preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"]
+
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings"

From 81e5669f7e1833dd00f4671474e7f18146e6e608 Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Mon, 9 Mar 2026 16:00:35 +0000
Subject: [PATCH 4/6] added agent egnerated hardswish example

---
 examples/agent/hardswish/_bench_wrapper.py    |  68 +++++++
 examples/agent/hardswish/caller.cpp           |  24 +++
 examples/agent/hardswish/compile.sh           |  24 +++
 examples/agent/hardswish/hardswish.cpp        | 102 +++++++++++
 examples/agent/hardswish/hardswish.pto        |  63 +++++++
 examples/agent/hardswish/hardswish_builder.py | 168 ++++++++++++++++++
 examples/agent/hardswish/run_hardswish.py     | 107 +++++++++++
 openenv/agent_search.py                       |  74 +++++++-
 8 files changed, 627 insertions(+), 3 deletions(-)
 create mode 100644 examples/agent/hardswish/_bench_wrapper.py
 create mode 100644 examples/agent/hardswish/caller.cpp
 create mode 100755 examples/agent/hardswish/compile.sh
 create mode 100644 examples/agent/hardswish/hardswish.cpp
 create mode 100644 examples/agent/hardswish/hardswish.pto
 create mode 100644 examples/agent/hardswish/hardswish_builder.py
 create mode 100644 examples/agent/hardswish/run_hardswish.py

diff --git a/examples/agent/hardswish/_bench_wrapper.py b/examples/agent/hardswish/_bench_wrapper.py
new file mode 100644
index 00000000..0199edb2
--- /dev/null
+++ b/examples/agent/hardswish/_bench_wrapper.py
@@ -0,0 +1,68 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads hardswish_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./hardswish_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x
+    ctypes.c_void_p,  # y (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(x, y):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        torch_to_ctypes(y),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+# Separate tensors per iteration to reduce cache reuse
+xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-8, 8) for _ in range(WARMUP + ITERS)]
+y  = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(xs[i], y)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i], y)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/hardswish/caller.cpp b/examples/agent/hardswish/caller.cpp
new file mode 100644
index 00000000..4d9c6220
--- /dev/null
+++ b/examples/agent/hardswish/caller.cpp
@@ -0,0 +1,24 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "hardswish.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,
+    uint8_t *y,
+    uint32_t batch,
+    uint32_t n_cols)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    _kernel<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        reinterpret_cast<half *>(y),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n_cols));
+}
diff --git a/examples/agent/hardswish/compile.sh b/examples/agent/hardswish/compile.sh
new file mode 100755
index 00000000..3c33ccaf
--- /dev/null
+++ b/examples/agent/hardswish/compile.sh
@@ -0,0 +1,24 @@
+set -e
+
+rm -f hardswish.pto hardswish.cpp hardswish_lib.so
+
+python ./hardswish_builder.py > ./hardswish.pto
+ptoas --enable-insert-sync ./hardswish.pto -o ./hardswish.cpp
+
+PTO_LIB_PATH=/sources/pto-isa
+bisheng \
+    -I${PTO_LIB_PATH}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"hardswish.cpp\"" \
+    ./caller.cpp \
+    -o ./hardswish_lib.so
+
diff --git a/examples/agent/hardswish/hardswish.cpp b/examples/agent/hardswish/hardswish.cpp
new file mode 100644
index 00000000..d989a1b9
--- /dev/null
+++ b/examples/agent/hardswish/hardswish.cpp
@@ -0,0 +1,102 @@
+#include "pto/pto-inst.hpp"
+using namespace pto;
+__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) {
+  unsigned v5 = 1;
+  unsigned v6 = 0;
+  int32_t v7 = 16384;
+  int32_t v8 = 1;
+  int32_t v9 = 0;
+  int64_t v10 = 0;
+  int64_t v11 = 32768;
+  int64_t v12 = 65536;
+  int64_t v13 = 98304;
+  using T = float;
+
+  #if defined(__DAV_VEC__)
+  set_mask_norm();
+  set_vector_mask(-1, -1);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  if (v4 > v9) {
+    if (v4 <= v7) {
+      int64_t v14 = get_block_idx();
+      int64_t v15 = get_subblockid();
+      int64_t v16 = get_subblockdim();
+      int64_t v17 = (int64_t) v16;
+      int64_t v18 = get_block_num();
+      int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17);
+      int32_t v20 = v3 / v19;
+      int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20;
+      int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21);
+      int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21);
+      int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22);
+      int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4);
+      if (v24 > v9) {
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v26 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v26, v10);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v27 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v27, v11);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v28 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v28, v12);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v29 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v29, v13);
+        for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) {
+          int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4);
+          unsigned v32 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34);
+          unsigned v36 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38);
+          wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          TLOAD(v26, v35);
+          set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+          TSUB(v27, v26, v26);
+          pipe_barrier(PIPE_V);
+          TEXP(v27, v27);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v27, v27);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v28, v27);
+          pipe_barrier(PIPE_V);
+          TADD(v29, v28, v28);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v26, v28);
+          pipe_barrier(PIPE_V);
+          TRELU(v28, v28);
+          pipe_barrier(PIPE_V);
+          TSUB(v27, v28, v29);
+          pipe_barrier(PIPE_V);
+          TRELU(v27, v27);
+          pipe_barrier(PIPE_V);
+          TSUB(v27, v28, v27);
+          pipe_barrier(PIPE_V);
+          TMUL(v27, v26, v27);
+          set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          pipe_barrier(PIPE_V);
+          TDIV(v27, v27, v29);
+          set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          pipe_barrier(PIPE_MTE3);
+          TSTORE(v39, v27);
+          set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        };
+      };
+    };
+  }
+  pipe_barrier(PIPE_ALL);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  #endif // __DAV_VEC__
+
+  return;
+}
+
diff --git a/examples/agent/hardswish/hardswish.pto b/examples/agent/hardswish/hardswish.pto
new file mode 100644
index 00000000..2c60a7f2
--- /dev/null
+++ b/examples/agent/hardswish/hardswish.pto
@@ -0,0 +1,63 @@
+module {
+  func.func @_kernel(%arg0: !pto.ptr<f16>, %arg1: !pto.ptr<f16>, %arg2: i32, %arg3: i32) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16384 = arith.constant 16384 : index
+    %0 = arith.index_cast %arg2 : i32 to index
+    %1 = arith.index_cast %arg3 : i32 to index
+    pto.section.vector {
+      %2 = arith.cmpi sgt, %1, %c0 : index
+      scf.if %2 {
+        %3 = arith.cmpi sge, %c16384, %1 : index
+        scf.if %3 {
+          %4 = pto.get_block_idx
+          %5 = pto.get_subblock_idx
+          %6 = pto.get_subblock_num
+          %7 = pto.get_block_num
+          %8 = arith.muli %4, %6 : i64
+          %9 = arith.addi %8, %5 : i64
+          %10 = arith.index_cast %9 : i64 to index
+          %11 = arith.muli %7, %6 : i64
+          %12 = arith.index_cast %11 : i64 to index
+          %13 = arith.ceildivsi %0, %12 : index
+          %14 = arith.muli %10, %13 : index
+          %15 = arith.addi %14, %13 : index
+          %16 = arith.minui %15, %0 : index
+          %17 = arith.subi %16, %14 : index
+          %18 = arith.muli %0, %1 : index
+          %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %21 = arith.cmpi sgt, %17, %c0 : index
+          scf.if %21 {
+            %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            scf.for %arg4 = %c0 to %17 step %c1 {
+              %26 = arith.addi %14, %arg4 : index
+              %27 = arith.muli %26, %1 : index
+              %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%22, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%23, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%24, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%24, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%22, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.trelu ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%24, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.trelu ins(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%24, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tmul ins(%22, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tdiv ins(%23, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tstore ins(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.partition_tensor_view<1x16384xf16>)
+            }
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
diff --git a/examples/agent/hardswish/hardswish_builder.py b/examples/agent/hardswish/hardswish_builder.py
new file mode 100644
index 00000000..248581bd
--- /dev/null
+++ b/examples/agent/hardswish/hardswish_builder.py
@@ -0,0 +1,168 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+# 32 KB of UB / sizeof(fp16) = 16384 elements per tile
+ELEMENTS_PER_TILE = 32 * 1024 // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_type = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_type": subtensor_type,
+        "tile_type": tile_type,
+    }
+
+
+def build_hardswish(fn_name="hardswish_fp16"):
+    """
+    Build a dynamic-batch Hard-Swish kernel in PTO DSL.
+
+    Computes y = x * clamp(x + 3, 0, 6) / 6, where:
+        clamp(v, 0, 6) = relu(v) - relu(v - 6)
+
+    Constants (1.0, 3.0, 6.0) are derived from the input tile itself using
+    the identity exp(a - a) = exp(0) = 1.0, which avoids the need for
+    scalar-tile broadcast operations not available in PTO DSL.
+
+    UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB):
+        tb_x   : input row x
+        tb_t1  : intermediate
+        tb_t2  : intermediate
+        tb_t3  : holds 6.0, then intermediate
+
+    Kernel args:
+        x_ptr   : fp16[batch * n_cols]  -- input
+        y_ptr   : fp16[batch * n_cols]  -- output
+        batch   : int32                 -- number of rows
+        n_cols  : int32                 -- elements per row; must be <= 16384
+    """
+
+    @to_ir_module(meta_data=meta_data)
+    def _kernel(
+        x_ptr: "ptr_type",
+        y_ptr: "ptr_type",
+        batch_i32: "index_dtype",
+        n_cols_i32: "index_dtype",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c_tile = const(ELEMENTS_PER_TILE)
+
+        batch = s.index_cast(batch_i32)
+        n_cols = s.index_cast(n_cols_i32)
+
+        with pto.vector_section():
+            # Guard: n_cols must be in (0, ELEMENTS_PER_TILE].
+            with pto.if_context(n_cols > c0):
+                with pto.if_context(c_tile >= n_cols):
+                    cid = pto.get_block_idx()
+                    sub_bid = pto.get_subblock_idx()
+                    sub_bnum = pto.get_subblock_num()
+                    num_blocks = pto.get_block_num()
+
+                    vid = s.index_cast(cid * sub_bnum + sub_bid)
+                    num_cores = s.index_cast(num_blocks * sub_bnum)
+
+                    # Distribute rows across cores (row-level parallelism).
+                    rows_per_core = s.ceil_div(batch, num_cores)
+                    row_start = vid * rows_per_core
+                    row_end = s.min_u(row_start + rows_per_core, batch)
+                    num_rows = row_end - row_start
+
+                    total_elems = batch * n_cols
+                    tv_x = pto.as_tensor(
+                        tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_y = pto.as_tensor(
+                        tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1]
+                    )
+
+                    with pto.if_context(num_rows > c0):
+                        # Allocate 4 UB tiles (128 KB total, well under 192 KB UB).
+                        tb_x = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_t1 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_t2 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_t3 = pto.alloc_tile(tile_type, valid_col=n_cols)
+
+                        for row_i in pto.range(c0, num_rows, c1):
+                            gm_offset = (row_start + row_i) * n_cols
+
+                            sv_x = pto.slice_view(
+                                subtensor_type,
+                                source=tv_x,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_y = pto.slice_view(
+                                subtensor_type,
+                                source=tv_y,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+
+                            # Load input
+                            pto.load(sv_x, tb_x)
+
+                            # Derive constants from data:
+                            #   x - x = 0  =>  exp(0) = 1.0
+                            tile.sub(tb_x, tb_x, tb_t1)    # tb_t1 = 0.0
+                            tile.exp(tb_t1, tb_t1)          # tb_t1 = 1.0 (ones)
+
+                            # Build 3.0:
+                            tile.add(tb_t1, tb_t1, tb_t2)   # tb_t2 = 2.0
+                            tile.add(tb_t2, tb_t1, tb_t2)   # tb_t2 = 3.0
+
+                            # Build 6.0:
+                            tile.add(tb_t2, tb_t2, tb_t3)   # tb_t3 = 6.0
+
+                            # Compute x + 3:
+                            tile.add(tb_x, tb_t2, tb_t2)    # tb_t2 = x + 3
+
+                            # clamp(x+3, 0, 6) = relu(x+3) - relu(relu(x+3) - 6)
+                            tile.relu(tb_t2, tb_t2)          # tb_t2 = relu(x + 3) = max(x+3, 0)
+                            tile.sub(tb_t2, tb_t3, tb_t1)    # tb_t1 = max(x+3, 0) - 6
+                            tile.relu(tb_t1, tb_t1)           # tb_t1 = relu(max(x+3,0) - 6)
+                            tile.sub(tb_t2, tb_t1, tb_t1)    # tb_t1 = clamp(x+3, 0, 6)
+
+                            # y = x * clamp(x+3, 0, 6) / 6
+                            tile.mul(tb_x, tb_t1, tb_t1)     # tb_t1 = x * clamp(x+3, 0, 6)
+                            tile.div(tb_t1, tb_t3, tb_t1)    # tb_t1 = y
+
+                            pto.store(tb_t1, sv_y)
+
+    _ = fn_name
+    return _kernel
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fn-name",
+        default="hardswish_fp16",
+        help="Generated kernel function name.",
+    )
+    args = parser.parse_args()
+    print(build_hardswish(fn_name=args.fn_name))
+
diff --git a/examples/agent/hardswish/run_hardswish.py b/examples/agent/hardswish/run_hardswish.py
new file mode 100644
index 00000000..34dee006
--- /dev/null
+++ b/examples/agent/hardswish/run_hardswish.py
@@ -0,0 +1,107 @@
+import argparse
+import ctypes
+
+import torch
+import torch.nn.functional as F
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x
+        ctypes.c_void_p,  # y (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def hardswish_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            torch_to_ctypes(y),
+            batch,
+            n_cols,
+        )
+
+    return hardswish_func
+
+
+def hardswish_ref(x):
+    """Reference Hard-Swish: y = x * clamp(x + 3, 0, 6) / 6"""
+    return F.hardswish(x.float()).to(x.dtype)
+
+
+def test_hardswish(lib_path, block_dim=24):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    hardswish = load_lib(lib_path=lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 4, 22, 65]
+    n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n_cols in n_cols_list:
+            x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-8, 8)
+            y = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+            y_ref = hardswish_ref(x)
+            hardswish(x, y, batch, n_cols)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n_cols, status, detail))
+
+    print(f"\ndetailed summary for {lib_path}:")
+    for batch, n_cols, status, detail in results:
+        msg = f"  batch={batch}, n_cols={n_cols}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        default="./hardswish_lib.so",
+        help="Path to the shared library generated by compile.sh.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+    test_hardswish(args.lib, block_dim=args.block_dim)
diff --git a/openenv/agent_search.py b/openenv/agent_search.py
index 9552db2e..e5e9591d 100644
--- a/openenv/agent_search.py
+++ b/openenv/agent_search.py
@@ -237,6 +237,73 @@ def execute_tool(name: str, tool_input: dict) -> str:
 # ---------------------------------------------------------------------------
 # System prompt  (built from config)
 # ---------------------------------------------------------------------------
+
+_API_CHEATSHEET = """\
+## PTO-DSL API cheatsheet — use these, do NOT reimplement them
+
+```python
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+# --- Kernel entry point ---
+@to_ir_module
+def _kernel(x: pto.PtrType, y: pto.PtrType, batch: pto.int32, n_cols: pto.int32):
+    ...
+
+# --- Tile ops (all operate on allocated tile buffers, no return value) ---
+tile.mov(src, dst)              # copy tile
+tile.add(lhs, rhs, out)         # element-wise add
+tile.sub(lhs, rhs, out)         # element-wise subtract
+tile.mul(lhs, rhs, out)         # element-wise multiply
+tile.div(lhs, rhs, out)         # element-wise divide
+tile.exp(inp, out)              # element-wise e^x
+tile.log(inp, out)              # element-wise ln(x)
+tile.relu(inp, out)             # element-wise max(x, 0)
+tile.abs(inp, out)              # element-wise |x|
+tile.sqrt(inp, out)             # element-wise sqrt(x)
+tile.rsqrt(inp, out)            # element-wise 1/sqrt(x)
+tile.reciprocal(inp, out)       # element-wise 1/x
+tile.gather(src, out, indices)  # gather rows
+
+# NOT available as a single op — implement from primitives:
+#   tanh(x)    = 1 - 2/(exp(2x)+1)   [use tile.exp, tile.add, tile.div, tile.sub]
+#   sigmoid(x) = 1/(1+exp(-x))        [negate x, tile.exp, then tile.reciprocal+add]
+#   clamp(x,0,6) = relu(x) - relu(x-6)
+#   min(a,b)   = 0.5*(a+b) - 0.5*abs(a-b)
+#   max(a,b)   = relu(a-b) + b
+
+# --- Memory & layout ---
+pto.get_block_idx()             # current core index (scalar Value)
+pto.get_block_num()             # total number of cores (scalar Value)
+pto.as_tensor(type, ptr=, shape=, strides=)   # wrap pointer as tensor view
+pto.slice_view(type, source=, offsets=, sizes=)  # sub-view of a tensor
+pto.alloc_tile(tile_type)       # allocate UB tile buffer
+pto.load(source, dest)          # DMA: tensor view → tile buffer
+pto.store(source, dest)         # DMA: tile buffer → tensor view
+
+# --- Sections (context managers) ---
+with pto.vector_section():      # all tile ops must be inside here
+    ...
+
+# --- Scalar arithmetic (s.xxx returns Value) ---
+s.const(value)                  # integer constant
+s.ceil_div(a, b)                # ceil(a/b)
+s.index_cast(v, type)           # type cast
+# Scalar Value supports: +, -, *, //, %, <, >, ==, !=
+
+# --- Control flow ---
+with pto.range(start, stop, step) as i:   # for loop
+    ...
+with pto.if_context(cond):       # if branch
+    ...
+
+# --- Synchronization (when NOT using --enable-insert-sync) ---
+pto.record_event(record_op, wait_op, event_id)
+pto.wait_event(record_op, wait_op, event_id)
+pto.record_wait_pair(record_op, wait_op, event_id)
+pto.barrier(sync_op)
+```
+"""
 import textwrap as _textwrap
 
 
@@ -296,7 +363,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
         except FileNotFoundError:
             local_section += f"\n### {fpath_str}\n(file not found)\n"
     if local_section:
-        local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section
+        local_section = "\n## PTO-DSL API source files (these are the exact functions you import and call — do NOT reimplement them)\n" + local_section
 
     # Fetch remote reference docs and embed them
     url_section = ""
@@ -318,6 +385,7 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
 
         The kernel directory contains exactly these files: {file_listing}
         Do NOT attempt to read any other filenames — they do not exist.
+        {_API_CHEATSHEET}
         {preload_section}{local_section}{url_section}
         ## Your workflow
         1. Propose one targeted change based on the pre-loaded files above
@@ -369,7 +437,7 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str
         except FileNotFoundError:
             local_section += f"\n### {fpath_str}\n(file not found)\n"
     if local_section:
-        local_section = "\n## PTO-DSL API (local, authoritative)\n" + local_section
+        local_section = "\n## PTO-DSL API source files (these are the exact functions you import and call — do NOT reimplement them)\n" + local_section
 
     url_section = ""
     for entry in cfg.get("context_urls", []):
@@ -389,7 +457,7 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str
         Your task is to implement the {name} kernel entirely from scratch.
         The working directory starts with: {file_listing}
         You must create ALL required files yourself using write_file.
-
+        {_API_CHEATSHEET}
         {preload_section}{local_section}{url_section}
         ## Kernel to implement
         {description}

From ab494e74bf49eb3884abdcdd6524d06c08ff2eed Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Tue, 10 Mar 2026 09:28:53 +0000
Subject: [PATCH 5/6] agent fast hadamard implementation beats baseline

---
 examples/agent/fast_hadamard/.gitignore       |   9 +
 examples/agent/fast_hadamard/README.md        |   8 +
 .../agent/fast_hadamard/_bench_wrapper.py     |  69 +++++
 examples/agent/fast_hadamard/caller.cpp       |  28 ++
 examples/agent/fast_hadamard/compile.sh       |  46 ++++
 .../agent/fast_hadamard/hadamard_builder.py   | 249 ++++++++++++++++++
 examples/agent/fast_hadamard/plot_perf.py     |  72 +++++
 examples/agent/fast_hadamard/run_hadamard.py  | 208 +++++++++++++++
 examples/aot/fast_hadamard/_bench_wrapper.py  |  69 +++++
 9 files changed, 758 insertions(+)
 create mode 100644 examples/agent/fast_hadamard/.gitignore
 create mode 100644 examples/agent/fast_hadamard/README.md
 create mode 100644 examples/agent/fast_hadamard/_bench_wrapper.py
 create mode 100644 examples/agent/fast_hadamard/caller.cpp
 create mode 100644 examples/agent/fast_hadamard/compile.sh
 create mode 100644 examples/agent/fast_hadamard/hadamard_builder.py
 create mode 100644 examples/agent/fast_hadamard/plot_perf.py
 create mode 100644 examples/agent/fast_hadamard/run_hadamard.py
 create mode 100644 examples/aot/fast_hadamard/_bench_wrapper.py

diff --git a/examples/agent/fast_hadamard/.gitignore b/examples/agent/fast_hadamard/.gitignore
new file mode 100644
index 00000000..663e5a84
--- /dev/null
+++ b/examples/agent/fast_hadamard/.gitignore
@@ -0,0 +1,9 @@
+hadamard_no_sync.pto
+hadamard_manual_sync.pto
+hadamard_auto_sync.cpp
+hadamard_manual_sync.cpp
+hadamard_auto_sync.pto
+hadamard_auto_sync_lib.so
+hadamard_manual_sync_lib.so
+
+perf_data*
diff --git a/examples/agent/fast_hadamard/README.md b/examples/agent/fast_hadamard/README.md
new file mode 100644
index 00000000..6b19cee9
--- /dev/null
+++ b/examples/agent/fast_hadamard/README.md
@@ -0,0 +1,8 @@
+Usage:
+
+```bash
+bash ./compile.sh  # generate PTO/CPP and build both auto/manual sync libs
+python ./run_hadamard.py  # test auto-sync lib (default)
+python ./run_hadamard.py --manual-sync  # test manual-sync lib
+python ./plot_perf.py  # optionally visualization
+```
diff --git a/examples/agent/fast_hadamard/_bench_wrapper.py b/examples/agent/fast_hadamard/_bench_wrapper.py
new file mode 100644
index 00000000..2a8ad08d
--- /dev/null
+++ b/examples/agent/fast_hadamard/_bench_wrapper.py
@@ -0,0 +1,69 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads hadamard_auto_sync_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+import math
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 32
+N         = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+LOG2_N = int(math.log2(N))
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./hadamard_auto_sync_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x (in-place)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n
+    ctypes.c_uint32,  # log2_n
+]
+lib.call_kernel.restype = None
+
+
+def run(x):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        BATCH,
+        N,
+        LOG2_N,
+    )
+
+
+# Allocate separate tensors to avoid cache reuse
+xs = [torch.randn(BATCH, N, device=device, dtype=torch.float16) for _ in range(WARMUP + ITERS)]
+
+for i in range(WARMUP):
+    run(xs[i])
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i])
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/fast_hadamard/caller.cpp b/examples/agent/fast_hadamard/caller.cpp
new file mode 100644
index 00000000..1ddaff6a
--- /dev/null
+++ b/examples/agent/fast_hadamard/caller.cpp
@@ -0,0 +1,28 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "hadamard_auto_sync.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef KERNEL_FN
+#define KERNEL_FN fast_hadamard_autosync
+#endif
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,
+    uint32_t batch,
+    uint32_t n,
+    uint32_t log2_n)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    KERNEL_FN<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n),
+        static_cast<int32_t>(log2_n));
+}
diff --git a/examples/agent/fast_hadamard/compile.sh b/examples/agent/fast_hadamard/compile.sh
new file mode 100644
index 00000000..a95f6148
--- /dev/null
+++ b/examples/agent/fast_hadamard/compile.sh
@@ -0,0 +1,46 @@
+set -e
+
+rm -f \
+    hadamard_auto_sync.pto hadamard_manual_sync.pto \
+    hadamard_auto_sync.cpp hadamard_manual_sync.cpp \
+    hadamard_auto_sync_lib.so hadamard_manual_sync_lib.so
+
+# Auto-sync path: rely on ptoas synchronization insertion.
+python ./hadamard_builder.py > ./hadamard_auto_sync.pto
+ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp
+
+# Manual-sync path: explicit record/wait events from builder.
+python ./hadamard_builder.py --manual-sync > ./hadamard_manual_sync.pto
+ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    ./caller.cpp \
+    -o ./hadamard_auto_sync_lib.so
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"hadamard_manual_sync.cpp\"" \
+    -DKERNEL_FN=fast_hadamard_manualsync \
+    ./caller.cpp \
+    -o ./hadamard_manual_sync_lib.so
diff --git a/examples/agent/fast_hadamard/hadamard_builder.py b/examples/agent/fast_hadamard/hadamard_builder.py
new file mode 100644
index 00000000..b98a390c
--- /dev/null
+++ b/examples/agent/fast_hadamard/hadamard_builder.py
@@ -0,0 +1,249 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+ELEMENTS_PER_TILE = 32 * 1024 // 2  # 32KB UB / sizeof(fp16)
+HALF_ELEMENTS_PER_TILE = ELEMENTS_PER_TILE // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_full = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+    subtensor_half = pto.SubTensorType(shape=[1, HALF_ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_full = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+    tile_half = pto.TileBufType(
+        shape=[1, HALF_ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_full": subtensor_full,
+        "subtensor_half": subtensor_half,
+        "tile_full": tile_full,
+        "tile_half": tile_half,
+    }
+
+
+@to_ir_module(meta_data=meta_data)
+def fast_hadamard_autosync(
+    x_ptr: "ptr_type",
+    batch_i32: "index_dtype",
+    n_i32: "index_dtype",
+    log2_n_i32: "index_dtype",
+) -> None:
+    c0 = const(0)
+    c1 = const(1)
+    c2 = const(2)
+
+    batch = s.index_cast(batch_i32)
+    n = s.index_cast(n_i32)
+    log2_n = s.index_cast(log2_n_i32)
+
+    cid = pto.get_block_idx()
+    sub_bid = pto.get_subblock_idx()
+    sub_bnum = pto.get_subblock_num()
+    num_blocks = pto.get_block_num()
+
+    vid = s.index_cast(cid * sub_bnum + sub_bid)  # vector core index
+    num_cores = s.index_cast(num_blocks * sub_bnum)  # number of vector cores
+
+    with pto.vector_section():
+        samples_per_core = s.ceil_div(batch, num_cores)
+        sample_offset = vid * samples_per_core
+
+        with pto.if_context(sample_offset < batch):
+            samples_end = sample_offset + samples_per_core
+            samples_to_process = s.select(
+                samples_end > batch,
+                batch - sample_offset,
+                samples_per_core,
+            )
+
+            with pto.if_context(samples_to_process > c0):
+                total_elements = batch * n
+                tv_x = pto.as_tensor(
+                    tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1]
+                )
+
+                # Single buffer set — no ping-pong overhead
+                tb_row = pto.alloc_tile(tile_full, valid_col=n)
+                tb_even = pto.alloc_tile(tile_half, valid_col=n // c2)
+                tb_odd = pto.alloc_tile(tile_half, valid_col=n // c2)
+
+                n_half = n // c2
+
+                # Alias row halves inside UB row tile
+                tb_first = tile.subset(tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE])
+                tb_second = tile.subset(tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE])
+
+                # Direct loop over samples — no chunk abstraction overhead
+                for si in pto.range(c0, samples_to_process, c1):
+                    row_offset = (sample_offset + si) * n
+                    sv_row = pto.slice_view(
+                        subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n]
+                    )
+
+                    pto.load(sv_row, tb_row)
+                    for _ in pto.range(c0, log2_n, c1):
+                        tile.gather(tb_row, tb_even, mask_pattern="P0101")
+                        tile.gather(tb_row, tb_odd, mask_pattern="P1010")
+                        tile.add(tb_even, tb_odd, tb_first)
+                        tile.sub(tb_even, tb_odd, tb_second)
+                    pto.store(tb_row, sv_row)
+
+
+@to_ir_module(meta_data=meta_data)
+def fast_hadamard_manualsync(
+    x_ptr: "ptr_type",
+    batch_i32: "index_dtype",
+    n_i32: "index_dtype",
+    log2_n_i32: "index_dtype",
+) -> None:
+    c0 = const(0)
+    c1 = const(1)
+    c2 = const(2)
+
+    batch = s.index_cast(batch_i32)
+    n = s.index_cast(n_i32)
+    log2_n = s.index_cast(log2_n_i32)
+
+    cid = pto.get_block_idx()
+    sub_bid = pto.get_subblock_idx()
+    sub_bnum = pto.get_subblock_num()
+    num_blocks = pto.get_block_num()
+
+    vid = s.index_cast(cid * sub_bnum + sub_bid)  # vector core index
+    num_cores = s.index_cast(num_blocks * sub_bnum)  # number of vector cores
+
+    with pto.vector_section():
+        samples_per_core = s.ceil_div(batch, num_cores)
+        sample_offset = vid * samples_per_core
+
+        with pto.if_context(sample_offset < batch):
+            samples_end = sample_offset + samples_per_core
+            samples_to_process = s.select(
+                samples_end > batch,
+                batch - sample_offset,
+                samples_per_core,
+            )
+
+            with pto.if_context(samples_to_process > c0):
+                total_elements = batch * n
+                tv_x = pto.as_tensor(
+                    tensor_type, ptr=x_ptr, shape=[total_elements], strides=[c1]
+                )
+
+                # Two independent tile sets (ping/pong) so event_id 0/1 map to
+                # disjoint UB buffers, matching the manual C++ reference.
+                tb_row_0 = pto.alloc_tile(tile_full, valid_col=n)
+                tb_even_0 = pto.alloc_tile(tile_half, valid_col=n // c2)
+                tb_odd_0 = pto.alloc_tile(tile_half, valid_col=n // c2)
+
+                tb_row_1 = pto.alloc_tile(tile_full, valid_col=n)
+                tb_even_1 = pto.alloc_tile(tile_half, valid_col=n // c2)
+                tb_odd_1 = pto.alloc_tile(tile_half, valid_col=n // c2)
+
+                n_half = n // c2
+
+                # Keep one sample per chunk. Multi-sample chunks interact
+                # poorly with static tile subset sizing in current PTO Python
+                # bindings and can corrupt rows for larger batches.
+                samples_per_load = c1
+                num_chunks = s.ceil_div(samples_to_process, samples_per_load)
+
+                def process_rows(
+                    tb_row, tb_even, tb_odd, event_id, gm_offset, cur_samples
+                ):
+                    for s in pto.range(c0, cur_samples, c1):
+                        row_offset = gm_offset + s * n
+                        sv_row = pto.slice_view(
+                            subtensor_full, source=tv_x, offsets=[row_offset], sizes=[n]
+                        )
+                        # Alias row halves inside UB row tile (no GM round-trip
+                        # per Hadamard iteration).
+                        tb_first = tile.subset(
+                            tb_row, [c0, c0], [1, HALF_ELEMENTS_PER_TILE]
+                        )
+                        tb_second = tile.subset(
+                            tb_row, [c0, n_half], [1, HALF_ELEMENTS_PER_TILE]
+                        )
+
+                        pto.wait_event("VEC", "LOAD", event_id=event_id)
+                        pto.wait_event("STORE_VEC", "VEC", event_id=event_id)
+                        pto.load(sv_row, tb_row)
+                        pto.record_wait_pair("LOAD", "VEC", event_id=event_id)
+
+                        for _ in pto.range(c0, log2_n, c1):
+                            tile.gather(tb_row, tb_even, mask_pattern="P0101")
+                            tile.gather(tb_row, tb_odd, mask_pattern="P1010")
+                            pto.barrier("VEC")
+                            tile.add(tb_even, tb_odd, tb_first)
+                            tile.sub(tb_even, tb_odd, tb_second)
+                            pto.barrier("VEC")
+
+                        pto.record_wait_pair(
+                            "VEC", "STORE_VEC", event_id=event_id
+                        )
+                        pto.store(tb_row, sv_row)
+                        pto.record_event("STORE_VEC", "VEC", event_id=event_id)
+                        pto.record_event("VEC", "LOAD", event_id=event_id)
+
+                for event_id in (0, 1):
+                    pto.record_event("VEC", "LOAD", event_id=event_id)
+                    pto.record_event("STORE_VEC", "VEC", event_id=event_id)
+
+                for chunk_i in pto.range(c0, num_chunks, c1):
+                    sample_done = chunk_i * samples_per_load
+                    chunk_left = samples_to_process - sample_done
+                    cur_samples = s.select(
+                        chunk_left < samples_per_load, chunk_left, samples_per_load
+                    )
+
+                    with pto.if_context(cur_samples > c0):
+                        gm_offset = (sample_offset + sample_done) * n
+                        use_ev0 = (chunk_i % c2) == c0
+
+                        with pto.if_context(use_ev0, has_else=True) as branch:
+                            process_rows(tb_row_0, tb_even_0, tb_odd_0, 0, gm_offset, cur_samples)
+                        with branch.else_context():
+                            process_rows(tb_row_1, tb_even_1, tb_odd_1, 1, gm_offset, cur_samples)
+
+                for event_id in (0, 1):
+                    pto.wait_event("VEC", "LOAD", event_id=event_id)
+                    pto.wait_event("STORE_VEC", "VEC", event_id=event_id)
+
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--manual-sync",
+        action="store_true",
+        help="Emit explicit record/wait events instead of relying on --enable-insert-sync.",
+    )
+    args = parser.parse_args()
+    if args.manual_sync:
+        module = fast_hadamard_manualsync
+    else:
+        module = fast_hadamard_autosync
+    print(module)
diff --git a/examples/agent/fast_hadamard/plot_perf.py b/examples/agent/fast_hadamard/plot_perf.py
new file mode 100644
index 00000000..6a894fd7
--- /dev/null
+++ b/examples/agent/fast_hadamard/plot_perf.py
@@ -0,0 +1,72 @@
+import os
+import csv
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    plt = None
+
+
+def plot_bandwidth(input_dir="./perf_data/", output_path="bw_vs_shape.png"):
+    """Generate bandwidth plot from benchmark CSVs."""
+    if plt is None:
+        print("Warning: matplotlib is not installed; skipping plot generation.")
+        return
+
+    BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024]
+    BENCH_BLOCK_DIMS = [20, 24]
+
+    fig, axes = plt.subplots(1, len(BENCH_BLOCK_DIMS), figsize=(14, 6), sharey=True)
+    if len(BENCH_BLOCK_DIMS) == 1:
+        axes = [axes]
+
+    for ax, block_dim in zip(axes, BENCH_BLOCK_DIMS):
+        csv_path = os.path.join(input_dir, f"fht_pto_bd{block_dim}.csv")
+        if not os.path.exists(csv_path):
+            ax.set_title(f"BLOCK_DIM={block_dim} (no data)")
+            continue
+
+        # Parse CSV: hidden_dim -> {batch: bw}
+        data = {}
+        with open(csv_path, encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                batch = int(row["batch"])
+                n = int(row["N"])
+                bw = float(row["bandwidth_gbs"])
+                data.setdefault(n, {})[batch] = bw
+
+        for idx, hidden_dim in enumerate(sorted(data.keys())):
+            batches = sorted(data[hidden_dim].keys())
+            bws = [data[hidden_dim][b] for b in batches]
+
+            if idx < 10:
+                marker = "o"
+            else:
+                last_markers = ["s", "^", "D"]
+                marker = last_markers[idx - 10]
+
+            ax.plot(
+                batches,
+                bws,
+                marker=marker,
+                markersize=4,
+                label=f"hidden_dim={hidden_dim}",
+            )
+
+        ax.set_xscale("log", base=2)
+        ax.set_xticks(BENCH_BATCHES)
+        ax.set_xticklabels([str(b) for b in BENCH_BATCHES], rotation=45, fontsize=7)
+        ax.set_xlabel("batch")
+        ax.set_title(f"BLOCK_DIM={block_dim}")
+        ax.grid(True, alpha=0.3)
+        ax.legend(fontsize=7, ncol=2)
+
+    axes[0].set_ylabel("Bandwidth (GB/s)")
+    fig.suptitle("Fast Hadamard PTO-DSL: Bandwidth vs Shape")
+    fig.tight_layout()
+    fig.savefig(input_dir + output_path, dpi=150)
+    print(f"\nPlot saved to {input_dir+output_path}")
+
+
+if __name__ == "__main__":
+    plot_bandwidth()
diff --git a/examples/agent/fast_hadamard/run_hadamard.py b/examples/agent/fast_hadamard/run_hadamard.py
new file mode 100644
index 00000000..60ee8aee
--- /dev/null
+++ b/examples/agent/fast_hadamard/run_hadamard.py
@@ -0,0 +1,208 @@
+import os
+import argparse
+import ctypes
+import csv
+import math
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+ELEMENTS_PER_TILE = 32 * 1024 // 2  # 32KB UB / sizeof(fp16)
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x (in-place)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n
+        ctypes.c_uint32,  # log2_n
+    ]
+    lib.call_kernel.restype = None
+
+    def hadamard_func(x, batch, n, log2_n, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        assert n <= ELEMENTS_PER_TILE, f"n must be <= {ELEMENTS_PER_TILE}, got {n}"
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            batch,
+            n,
+            log2_n,
+        )
+
+    return hadamard_func
+
+
+def hadamard_ref_inplace(x):
+    """Reference FHT matching TGATHER(P0101/P1010) + TADD/TSUB layout."""
+    x = x.clone()
+    n = x.shape[-1]
+    n_half = n // 2
+    log2_n = int(math.log2(n))
+    for _ in range(log2_n):
+        even = x[..., 0::2].clone()
+        odd = x[..., 1::2].clone()
+        x[..., :n_half] = even + odd
+        x[..., n_half:] = even - odd
+    return x
+
+
+def _is_power_of_two(v):
+    return v > 0 and (v & (v - 1)) == 0
+
+
+def test_hadamard(hadamard_func, block_dim=24):
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 7, 29, 65]
+    n_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n in n_list:
+            if not _is_power_of_two(n):
+                continue
+            log2_n = int(math.log2(n))
+            x = torch.randn(batch, n, device=device, dtype=dtype)
+            y_ref = hadamard_ref_inplace(x)
+
+            hadamard_func(x, batch, n, log2_n)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(x, y_ref)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n={n}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n, status, detail))
+
+    print(f"detailed summary for {lib_path}:")
+    for batch, n, status, detail in results:
+        msg = f"  batch={batch}, n={n}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+def benchmark(hadamard_func, warmup=2, repeats=20, output_dir="./perf_data/"):
+    """Benchmark across (batch, N, block_dim) configs.
+
+    Uses separate input tensors per run to avoid L2 cache reuse,
+    and a single timing-event pair averaged over all runs.
+    """
+    TEST_HIDDEN_DIMS = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+    BENCH_BATCHES = [1, 5, 8, 10, 16, 20, 32, 40, 64, 128, 256, 512, 1024]
+    BENCH_BLOCK_DIMS = [20, 24]
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    for block_dim in BENCH_BLOCK_DIMS:
+        print(f"\n{'=' * 60}")
+        print(f"BENCHMARK (BLOCK_DIM={block_dim})")
+        print(f"{'=' * 60}")
+        header = (
+            f"{'batch':>6s}  {'N':>6s}"
+            f"  {'duration_us':>12s}  {'bandwidth_gbs':>14s}"
+        )
+        print(header)
+        print("-" * len(header))
+
+        records = []
+
+        for batch in BENCH_BATCHES:
+            for n in TEST_HIDDEN_DIMS:
+                log2_n = int(math.log2(n))
+                allocated = warmup + repeats
+
+                # Separate GM tensors to avoid L2 cache reuse
+                x_list = [
+                    torch.randn(batch, n, device="npu", dtype=torch.float16)
+                    for _ in range(allocated)
+                ]
+
+                # Warmup
+                for i in range(warmup):
+                    hadamard_func(x_list[i], batch, n, log2_n, block_dim=block_dim)
+                torch.npu.synchronize()
+
+                # Timed runs — single event pair, average over repeats
+                start = torch.npu.Event(enable_timing=True)
+                end = torch.npu.Event(enable_timing=True)
+
+                start.record()
+                for i in range(repeats):
+                    hadamard_func(
+                        x_list[warmup + i],
+                        batch,
+                        n,
+                        log2_n,
+                        block_dim=block_dim,
+                    )
+                end.record()
+                torch.npu.synchronize()
+
+                duration_ms = start.elapsed_time(end) / repeats
+                dur_us = duration_ms * 1e3
+
+                # Bandwidth: read + write = 2 * batch * n * sizeof(half)
+                data_bytes = 2 * batch * n * 2
+                bw_gbs = (data_bytes / 1e9) / (dur_us / 1e6) if dur_us > 0 else 0.0
+
+                print(f"{batch:>6d}  {n:>6d}" f"  {dur_us:>12.2f}  {bw_gbs:>14.2f}")
+                records.append(f"{batch},{n},{dur_us:.4f},{bw_gbs:.4f}")
+
+        csv_path = os.path.join(output_dir, f"fht_pto_bd{block_dim}.csv")
+        with open(csv_path, "w", encoding="utf-8") as f:
+            f.write("batch,N,duration_us,bandwidth_gbs\n")
+            f.write("\n".join(records) + "\n")
+        print(f"\nSaved to {csv_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--manual-sync",
+        action="store_true",
+        help="Use manual-sync library instead of the default auto-sync library.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+
+    lib_path = (
+        "./hadamard_manual_sync_lib.so"
+        if args.manual_sync
+        else "./hadamard_auto_sync_lib.so"
+    )
+
+    device = get_test_device()
+    torch.npu.set_device(device)
+    hadamard_func = load_lib(lib_path=lib_path, block_dim=args.block_dim)
+
+    test_hadamard(hadamard_func)
+    benchmark(hadamard_func)
diff --git a/examples/aot/fast_hadamard/_bench_wrapper.py b/examples/aot/fast_hadamard/_bench_wrapper.py
new file mode 100644
index 00000000..2a8ad08d
--- /dev/null
+++ b/examples/aot/fast_hadamard/_bench_wrapper.py
@@ -0,0 +1,69 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads hadamard_auto_sync_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+import math
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape — change to target a different operating point
+BATCH     = 32
+N         = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+LOG2_N = int(math.log2(N))
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./hadamard_auto_sync_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x (in-place)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n
+    ctypes.c_uint32,  # log2_n
+]
+lib.call_kernel.restype = None
+
+
+def run(x):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        BATCH,
+        N,
+        LOG2_N,
+    )
+
+
+# Allocate separate tensors to avoid cache reuse
+xs = [torch.randn(BATCH, N, device=device, dtype=torch.float16) for _ in range(WARMUP + ITERS)]
+
+for i in range(WARMUP):
+    run(xs[i])
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i])
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")

From 349f30630fdf405098f97c1a01ab40629dba2c2c Mon Sep 17 00:00:00 2001
From: mirkodevita <mirko.de.vita@h-partners.com>
Date: Tue, 10 Mar 2026 15:00:13 +0000
Subject: [PATCH 6/6] added gemini support ad claude generated sigmoid

---
 examples/agent/sigmoid/_bench_wrapper.py  |  67 +++++++++
 examples/agent/sigmoid/caller.cpp         |  24 ++++
 examples/agent/sigmoid/compile.sh         |  22 +++
 examples/agent/sigmoid/run_sigmoid.py     | 106 ++++++++++++++
 examples/agent/sigmoid/sigmoid.cpp        |  89 ++++++++++++
 examples/agent/sigmoid/sigmoid.pto        |  57 ++++++++
 examples/agent/sigmoid/sigmoid_builder.py | 151 ++++++++++++++++++++
 openenv/agent_search.py                   | 111 ++++++++-------
 openenv/anthropic_provider.py             |  71 ++++++++++
 openenv/gemini_provider.py                | 162 ++++++++++++++++++++++
 openenv/rms_norm_config.toml              |  66 +++++++++
 openenv/sigmoid_config.toml               |  50 +++++++
 12 files changed, 929 insertions(+), 47 deletions(-)
 create mode 100644 examples/agent/sigmoid/_bench_wrapper.py
 create mode 100644 examples/agent/sigmoid/caller.cpp
 create mode 100644 examples/agent/sigmoid/compile.sh
 create mode 100644 examples/agent/sigmoid/run_sigmoid.py
 create mode 100644 examples/agent/sigmoid/sigmoid.cpp
 create mode 100644 examples/agent/sigmoid/sigmoid.pto
 create mode 100644 examples/agent/sigmoid/sigmoid_builder.py
 create mode 100644 openenv/anthropic_provider.py
 create mode 100644 openenv/gemini_provider.py
 create mode 100644 openenv/rms_norm_config.toml
 create mode 100644 openenv/sigmoid_config.toml

diff --git a/examples/agent/sigmoid/_bench_wrapper.py b/examples/agent/sigmoid/_bench_wrapper.py
new file mode 100644
index 00000000..b3adbe24
--- /dev/null
+++ b/examples/agent/sigmoid/_bench_wrapper.py
@@ -0,0 +1,67 @@
+"""
+Single-config benchmark wrapper for the agentic optimizer.
+Loads sigmoid_lib.so and prints:  latency_ms=<number>
+"""
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+# Representative shape
+BATCH     = 1024
+N_COLS    = 8192
+BLOCK_DIM = 24
+WARMUP    = 5
+ITERS     = 20
+
+
+def torch_to_ctypes(t):
+    return ctypes.c_void_p(t.data_ptr())
+
+
+device = get_test_device()
+torch.npu.set_device(device)
+
+lib = ctypes.CDLL("./sigmoid_lib.so")
+lib.call_kernel.argtypes = [
+    ctypes.c_uint32,  # blockDim
+    ctypes.c_void_p,  # stream
+    ctypes.c_void_p,  # x
+    ctypes.c_void_p,  # y (output)
+    ctypes.c_uint32,  # batch
+    ctypes.c_uint32,  # n_cols
+]
+lib.call_kernel.restype = None
+
+
+def run(x, y):
+    lib.call_kernel(
+        BLOCK_DIM,
+        torch.npu.current_stream()._as_parameter_,
+        torch_to_ctypes(x),
+        torch_to_ctypes(y),
+        BATCH,
+        N_COLS,
+    )
+
+
+dtype = torch.float16
+xs = [torch.randn(BATCH, N_COLS, device=device, dtype=dtype).clamp(-4, 4) for _ in range(WARMUP + ITERS)]
+y  = torch.empty(BATCH, N_COLS, device=device, dtype=dtype)
+
+for i in range(WARMUP):
+    run(xs[i], y)
+torch.npu.synchronize()
+
+starts = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+ends   = [torch.npu.Event(enable_timing=True) for _ in range(ITERS)]
+for i in range(ITERS):
+    starts[i].record()
+    run(xs[WARMUP + i], y)
+    ends[i].record()
+torch.npu.synchronize()
+
+ms = sum(s.elapsed_time(e) for s, e in zip(starts, ends)) / ITERS
+print(f"latency_ms={ms:.4f}")
diff --git a/examples/agent/sigmoid/caller.cpp b/examples/agent/sigmoid/caller.cpp
new file mode 100644
index 00000000..e00da7b7
--- /dev/null
+++ b/examples/agent/sigmoid/caller.cpp
@@ -0,0 +1,24 @@
+#ifndef KERNEL_CPP
+#define KERNEL_CPP "sigmoid.cpp"
+#endif
+#include KERNEL_CPP
+
+#ifndef NUM_CORES
+#define NUM_CORES 24
+#endif
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *x,
+    uint8_t *y,
+    uint32_t batch,
+    uint32_t n_cols)
+{
+    uint32_t launch_blocks = blockDim > 0 ? blockDim : NUM_CORES;
+    _kernel<<<launch_blocks, nullptr, stream>>>(
+        reinterpret_cast<half *>(x),
+        reinterpret_cast<half *>(y),
+        static_cast<int32_t>(batch),
+        static_cast<int32_t>(n_cols));
+}
diff --git a/examples/agent/sigmoid/compile.sh b/examples/agent/sigmoid/compile.sh
new file mode 100644
index 00000000..dd6dd2c8
--- /dev/null
+++ b/examples/agent/sigmoid/compile.sh
@@ -0,0 +1,22 @@
+set -e
+
+rm -f sigmoid.pto sigmoid.cpp sigmoid_lib.so
+
+python ./sigmoid_builder.py > ./sigmoid.pto
+ptoas --enable-insert-sync ./sigmoid.pto -o ./sigmoid.cpp
+
+bisheng \
+    -I${ASCEND_TOOLKIT_HOME}/include \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"sigmoid.cpp\"" \
+    ./caller.cpp \
+    -o ./sigmoid_lib.so
diff --git a/examples/agent/sigmoid/run_sigmoid.py b/examples/agent/sigmoid/run_sigmoid.py
new file mode 100644
index 00000000..bf3cfd40
--- /dev/null
+++ b/examples/agent/sigmoid/run_sigmoid.py
@@ -0,0 +1,106 @@
+import argparse
+import ctypes
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+
+def torch_to_ctypes(tensor):
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def load_lib(lib_path, block_dim=24):
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,  # blockDim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # x
+        ctypes.c_void_p,  # y (output)
+        ctypes.c_uint32,  # batch
+        ctypes.c_uint32,  # n_cols
+    ]
+    lib.call_kernel.restype = None
+
+    def sigmoid_func(x, y, batch, n_cols, block_dim=block_dim, stream_ptr=None):
+        if stream_ptr is None:
+            stream_ptr = torch.npu.current_stream()._as_parameter_
+        lib.call_kernel(
+            block_dim,
+            stream_ptr,
+            torch_to_ctypes(x),
+            torch_to_ctypes(y),
+            batch,
+            n_cols,
+        )
+
+    return sigmoid_func
+
+
+def sigmoid_ref(x):
+    """Reference sigmoid: 1 / (1 + exp(-x)), computed in fp32 then cast back."""
+    return torch.sigmoid(x.float()).to(x.dtype)
+
+
+def test_sigmoid(lib_path, block_dim=24):
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    sigmoid = load_lib(lib_path=lib_path, block_dim=block_dim)
+
+    torch.manual_seed(0)
+    dtype = torch.float16
+    batch_list = [1, 4, 22, 65]
+    n_cols_list = [128, 256, 512, 1024, 2048, 4096, 8192, 16384]
+
+    results = []
+    for batch in batch_list:
+        for n_cols in n_cols_list:
+            x = torch.randn(batch, n_cols, device=device, dtype=dtype).clamp(-4, 4)
+            y = torch.empty(batch, n_cols, device=device, dtype=dtype)
+
+            y_ref = sigmoid_ref(x)
+            sigmoid(x, y, batch, n_cols)
+            torch.npu.synchronize()
+
+            is_match = True
+            detail = ""
+            try:
+                torch.testing.assert_close(y, y_ref, rtol=1e-2, atol=1e-2)
+            except AssertionError as err:
+                is_match = False
+                detail = str(err).strip() if str(err) else "assert_close failed"
+
+            status = "match" if is_match else "mismatch"
+            print(f"[{status}] batch={batch}, n_cols={n_cols}, lib={lib_path}")
+            if detail:
+                print("  detail:")
+                print(detail)
+            results.append((batch, n_cols, status, detail))
+
+    print(f"\ndetailed summary for {lib_path}:")
+    for batch, n_cols, status, detail in results:
+        msg = f"  batch={batch}, n_cols={n_cols}, status={status}"
+        print(msg)
+        if detail:
+            print("    detail:")
+            print(detail)
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        default="./sigmoid_lib.so",
+        help="Path to the shared library generated by compile.sh.",
+    )
+    parser.add_argument(
+        "--block-dim",
+        type=int,
+        default=24,
+        help="Kernel blockDim (default: 24).",
+    )
+    args = parser.parse_args()
+    test_sigmoid(args.lib, block_dim=args.block_dim)
diff --git a/examples/agent/sigmoid/sigmoid.cpp b/examples/agent/sigmoid/sigmoid.cpp
new file mode 100644
index 00000000..e8248de9
--- /dev/null
+++ b/examples/agent/sigmoid/sigmoid.cpp
@@ -0,0 +1,89 @@
+#include "pto/pto-inst.hpp"
+using namespace pto;
+__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, int32_t v3, int32_t v4) {
+  unsigned v5 = 1;
+  unsigned v6 = 0;
+  int32_t v7 = 16384;
+  int32_t v8 = 1;
+  int32_t v9 = 0;
+  int64_t v10 = 0;
+  int64_t v11 = 32768;
+  int64_t v12 = 65536;
+  int64_t v13 = 98304;
+  using T = float;
+
+  #if defined(__DAV_VEC__)
+  set_mask_norm();
+  set_vector_mask(-1, -1);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  if (v4 > v9) {
+    if (v4 <= v7) {
+      int64_t v14 = get_block_idx();
+      int64_t v15 = get_subblockid();
+      int64_t v16 = get_subblockdim();
+      int64_t v17 = (int64_t) v16;
+      int64_t v18 = get_block_num();
+      int32_t v19 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v18) * (uint64_t) v17);
+      int32_t v20 = v3 / v19;
+      int32_t v21 = v3 % v19 != v9 && v3 < v9 == v19 < v9 ? v20 + v8 : v20;
+      int32_t v22 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v14) * (uint64_t) v17) + (uint64_t) ((int64_t) v15))) * (uint32_t) v21);
+      int32_t v23 = (int32_t) ((uint32_t) v22 + (uint32_t) v21);
+      int32_t v24 = (int32_t) ((uint32_t) ((uint32_t) v23 < (uint32_t) v3 ? v23 : v3) - (uint32_t) v22);
+      int32_t v25 = (int32_t) ((uint32_t) v3 * (uint32_t) v4);
+      if (v24 > v9) {
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v26 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v26, v10);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v27 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v27, v11);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v28 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v28, v12);
+        Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v29 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v4);
+        TASSIGN(v29, v13);
+        for (size_t v30 = (size_t) v9; v30 < ((size_t) v24); v30 += (size_t) v8) {
+          int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v22 + (uint32_t) ((int32_t) v30)) * (uint32_t) v4);
+          unsigned v32 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v33 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v34 = pto::Stride<-1, -1, -1, -1, 1>(v32, v32, v32, v32);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v35 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v31 * (unsigned) v8), v33, v34);
+          unsigned v36 = (unsigned) v4 * v5;
+          pto::Shape<1, 1, 1, 1, -1> v37 = pto::Shape<1, 1, 1, 1, -1>(v4);
+          pto::Stride<-1, -1, -1, -1, 1> v38 = pto::Stride<-1, -1, -1, -1, 1>(v36, v36, v36, v36);
+          GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v39 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v31 * (unsigned) v8), v37, v38);
+          wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          TLOAD(v26, v35);
+          set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+          TSUB(v29, v26, v26);
+          pipe_barrier(PIPE_V);
+          TEXP(v27, v29);
+          wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+          TSUB(v28, v29, v26);
+          set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+          pipe_barrier(PIPE_V);
+          TEXP(v28, v28);
+          pipe_barrier(PIPE_V);
+          TADD(v28, v28, v27);
+          pipe_barrier(PIPE_V);
+          TDIV(v28, v27, v28);
+          set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+          pipe_barrier(PIPE_MTE3);
+          TSTORE(v39, v28);
+          set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+        };
+      };
+    };
+  }
+  pipe_barrier(PIPE_ALL);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
+  wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+  wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+  #endif // __DAV_VEC__
+
+  return;
+}
+
diff --git a/examples/agent/sigmoid/sigmoid.pto b/examples/agent/sigmoid/sigmoid.pto
new file mode 100644
index 00000000..722397d9
--- /dev/null
+++ b/examples/agent/sigmoid/sigmoid.pto
@@ -0,0 +1,57 @@
+module {
+  func.func @_kernel(%arg0: !pto.ptr<f16>, %arg1: !pto.ptr<f16>, %arg2: i32, %arg3: i32) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16384 = arith.constant 16384 : index
+    %0 = arith.index_cast %arg2 : i32 to index
+    %1 = arith.index_cast %arg3 : i32 to index
+    pto.section.vector {
+      %2 = arith.cmpi sgt, %1, %c0 : index
+      scf.if %2 {
+        %3 = arith.cmpi sge, %c16384, %1 : index
+        scf.if %3 {
+          %4 = pto.get_block_idx
+          %5 = pto.get_subblock_idx
+          %6 = pto.get_subblock_num
+          %7 = pto.get_block_num
+          %8 = arith.muli %4, %6 : i64
+          %9 = arith.addi %8, %5 : i64
+          %10 = arith.index_cast %9 : i64 to index
+          %11 = arith.muli %7, %6 : i64
+          %12 = arith.index_cast %11 : i64 to index
+          %13 = arith.ceildivsi %0, %12 : index
+          %14 = arith.muli %10, %13 : index
+          %15 = arith.addi %14, %13 : index
+          %16 = arith.minui %15, %0 : index
+          %17 = arith.subi %16, %14 : index
+          %18 = arith.muli %0, %1 : index
+          %19 = pto.make_tensor_view %arg0, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %20 = pto.make_tensor_view %arg1, shape = [%18] strides = [%c1] : !pto.tensor_view<?xf16>
+          %21 = arith.cmpi sgt, %17, %c0 : index
+          scf.if %21 {
+            %22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            %25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+            scf.for %arg4 = %c0 to %17 step %c1 {
+              %26 = arith.addi %14, %arg4 : index
+              %27 = arith.muli %26, %1 : index
+              %28 = pto.partition_view %19, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              %29 = pto.partition_view %20, offsets = [%27], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
+              pto.tload ins(%28 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%22, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tsub ins(%25, %22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.texp ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tadd ins(%24, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tdiv ins(%23, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+              pto.tstore ins(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.partition_tensor_view<1x16384xf16>)
+            }
+          }
+        }
+      }
+    }
+    return
+  }
+}
+
diff --git a/examples/agent/sigmoid/sigmoid_builder.py b/examples/agent/sigmoid/sigmoid_builder.py
new file mode 100644
index 00000000..9b79d014
--- /dev/null
+++ b/examples/agent/sigmoid/sigmoid_builder.py
@@ -0,0 +1,151 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+# 32 KB of UB / sizeof(fp16) = 16384 elements per tile
+ELEMENTS_PER_TILE = 32 * 1024 // 2
+
+
+def meta_data():
+    dtype = pto.float16
+    ptr_type = pto.PtrType(dtype)
+    index_dtype = pto.int32
+
+    tensor_type = pto.TensorType(rank=1, dtype=dtype)
+    subtensor_type = pto.SubTensorType(shape=[1, ELEMENTS_PER_TILE], dtype=dtype)
+
+    tile_cfg = pto.TileBufConfig()
+    tile_type = pto.TileBufType(
+        shape=[1, ELEMENTS_PER_TILE],
+        valid_shape=[1, -1],
+        dtype=dtype,
+        memory_space="VEC",
+        config=tile_cfg,
+    )
+
+    return {
+        "ptr_type": ptr_type,
+        "index_dtype": index_dtype,
+        "tensor_type": tensor_type,
+        "subtensor_type": subtensor_type,
+        "tile_type": tile_type,
+    }
+
+
+def build_sigmoid(fn_name="sigmoid_fp16"):
+    """
+    Build a dynamic-batch Sigmoid kernel in PTO DSL.
+
+    Computes y = 1 / (1 + exp(-x)), where:
+        -x        = 0 - x           (tile.sub)
+        exp(-x)                      (tile.exp)
+        1 + exp(-x)                  (tile.add with ones tile)
+        1 / (1 + exp(-x))           (tile.reciprocal)
+
+    Constants (1.0) are derived from the input tile itself using
+    the identity exp(x - x) = exp(0) = 1.0, which avoids the need for
+    scalar-tile broadcast operations not available in PTO DSL.
+
+    UB tile budget (fp16, 4 tiles x 32 KB = 128 KB < 192 KB):
+        tb_x    : input row x
+        tb_ones : constant 1.0 (recomputed each row via exp(x-x))
+        tb_tmp1 : intermediate / final output
+        tb_tmp2 : intermediate (zeros / neg_x)
+
+    Kernel args:
+        x_ptr   : fp16[batch * n_cols]  -- input
+        y_ptr   : fp16[batch * n_cols]  -- output
+        batch   : int32                 -- number of rows
+        n_cols  : int32                 -- elements per row; must be <= 16384
+    """
+
+    @to_ir_module(meta_data=meta_data)
+    def _kernel(
+        x_ptr: "ptr_type",
+        y_ptr: "ptr_type",
+        batch_i32: "index_dtype",
+        n_cols_i32: "index_dtype",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c_tile = const(ELEMENTS_PER_TILE)
+
+        batch = s.index_cast(batch_i32)
+        n_cols = s.index_cast(n_cols_i32)
+
+        with pto.vector_section():
+            with pto.if_context(n_cols > c0):
+                with pto.if_context(c_tile >= n_cols):
+                    cid = pto.get_block_idx()
+                    sub_bid = pto.get_subblock_idx()
+                    sub_bnum = pto.get_subblock_num()
+                    num_blocks = pto.get_block_num()
+
+                    vid = s.index_cast(cid * sub_bnum + sub_bid)
+                    num_cores = s.index_cast(num_blocks * sub_bnum)
+
+                    rows_per_core = s.ceil_div(batch, num_cores)
+                    row_start = vid * rows_per_core
+                    row_end = s.min_u(row_start + rows_per_core, batch)
+                    num_rows = row_end - row_start
+
+                    total_elems = batch * n_cols
+                    tv_x = pto.as_tensor(
+                        tensor_type, ptr=x_ptr, shape=[total_elems], strides=[c1]
+                    )
+                    tv_y = pto.as_tensor(
+                        tensor_type, ptr=y_ptr, shape=[total_elems], strides=[c1]
+                    )
+
+                    with pto.if_context(num_rows > c0):
+                        tb_x = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_ones = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp1 = pto.alloc_tile(tile_type, valid_col=n_cols)
+                        tb_tmp2 = pto.alloc_tile(tile_type, valid_col=n_cols)
+
+                        for row_i in pto.range(c0, num_rows, c1):
+                            gm_offset = (row_start + row_i) * n_cols
+
+                            sv_x = pto.slice_view(
+                                subtensor_type,
+                                source=tv_x,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+                            sv_y = pto.slice_view(
+                                subtensor_type,
+                                source=tv_y,
+                                offsets=[gm_offset],
+                                sizes=[n_cols],
+                            )
+
+                            pto.load(sv_x, tb_x)
+
+                            # Derive ones: exp(x - x) = exp(0) = 1.0
+                            tile.sub(tb_x, tb_x, tb_tmp2)      # tmp2 = 0.0
+                            tile.exp(tb_tmp2, tb_ones)          # ones = 1.0
+
+                            # sigmoid(x) = 1 / (1 + exp(-x))
+                            tile.sub(tb_tmp2, tb_x, tb_tmp1)    # tmp1 = -x  (0 - x)
+                            tile.exp(tb_tmp1, tb_tmp1)           # tmp1 = exp(-x)
+                            tile.add(tb_tmp1, tb_ones, tb_tmp1)  # tmp1 = 1 + exp(-x)
+                            tile.div(tb_ones, tb_tmp1, tb_tmp1)  # tmp1 = 1 / (1 + exp(-x))
+
+                            pto.store(tb_tmp1, sv_y)
+
+    _ = fn_name
+    return _kernel
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fn-name",
+        default="sigmoid_fp16",
+        help="Generated kernel function name.",
+    )
+    args = parser.parse_args()
+    print(build_sigmoid(fn_name=args.fn_name))
diff --git a/openenv/agent_search.py b/openenv/agent_search.py
index e5e9591d..199e2eb6 100644
--- a/openenv/agent_search.py
+++ b/openenv/agent_search.py
@@ -18,19 +18,19 @@
 """
 
 import argparse
+import os
 import shutil
 import sys
-import time
 import tomllib
 import urllib.request
 from pathlib import Path
 
-import anthropic
-
 ROOT = Path(__file__).parent.parent
 sys.path.insert(0, str(ROOT))
 
 from openenv import KernelSearchEnv, KernelAction
+import openenv.anthropic_provider as _anthropic_provider
+import openenv.gemini_provider as _gemini_provider
 
 # ---------------------------------------------------------------------------
 # Config loading
@@ -103,6 +103,8 @@ def resolve_cmd(cmd: list[str]) -> list[str]:
 _parser.add_argument("--max-turns", type=int, default=30)
 _parser.add_argument("--max-tokens", type=int, default=None,
     help="Max tokens per API call (overrides config; default: 8192)")
+_parser.add_argument("--model", default=None,
+    help="Model to use, e.g. claude-opus-4-6 (default) or gemini-2.5-flash")
 _args = _parser.parse_args()
 
 CFG = load_config(_args.config)
@@ -404,7 +406,14 @@ def _build_system_prompt(cfg: dict, kernel_dir: Path) -> str:
         - One edit at a time; build and verify after each change.
         - If a build or test fails, revert the change with another edit_file call
           and try a different approach.
-        - Stop as soon as you achieve a confirmed speedup > 1.0x and explain what worked.
+        - Once you achieve a confirmed speedup > 1.0x, write a file called
+          `optimization_note.md` using write_file. It should explain:
+            * What changes were made compared to the baseline
+            * Why each change improves performance
+            * The final speedup achieved
+          Then stop.
+        - If you exhaust all ideas without a speedup, still write `optimization_note.md`
+          summarising what was tried and why it did not help.
     """)
 
 def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str:
@@ -488,12 +497,27 @@ def _build_create_system_prompt(cfg: dict, work_dir: Path, ref_dir: Path) -> str
     else _build_system_prompt(CFG, EXAMPLE_DIR)
 )
 
+# ---------------------------------------------------------------------------
+# Model dispatch
+# ---------------------------------------------------------------------------
+
+def _is_gemini(model: str) -> bool:
+    return model.startswith("gemini")
+
+
+def _call_model(messages: list[dict], system: str, model: str, max_tokens: int) -> tuple[list[dict], str]:
+    if _is_gemini(model):
+        return _gemini_provider.call(messages, system, model, max_tokens, TOOLS)
+    return _anthropic_provider.call(messages, system, model, max_tokens, TOOLS)
+
+
 # ---------------------------------------------------------------------------
 # Agentic loop
 # ---------------------------------------------------------------------------
 def run_agent(max_turns: int = 30) -> None:
-    mode = CFG.get("mode", "optimize")
-    client = anthropic.Anthropic()
+    mode  = CFG.get("mode", "optimize")
+    model = _args.model or CFG.get("model", "claude-opus-4-6")
+    print(f"Model: {model}")
 
     if mode == "create":
         name = CFG.get("kernel_name", "kernel")
@@ -525,59 +549,50 @@ def run_agent(max_turns: int = 30) -> None:
         print(f"{'─'*60}")
 
         max_tokens = _args.max_tokens or CFG.get("max_tokens", 8192)
-        for attempt in range(5):
-            try:
-                with client.messages.stream(
-                    model="claude-opus-4-6",
-                    max_tokens=max_tokens,
-                    thinking={"type": "adaptive"},
-                    system=SYSTEM,
-                    tools=TOOLS,
-                    messages=messages,
-                ) as stream:
-                    response = stream.get_final_message()
-                break
-            except anthropic.RateLimitError as e:
-                wait = 60 * (attempt + 1)
-                print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})")
-                time.sleep(wait)
-        else:
-            raise RuntimeError("Rate limit retries exhausted")
-
-        # Print any text Claude produced
-        for block in response.content:
-            if block.type == "text" and block.text.strip():
-                print(f"\n[Claude] {block.text.strip()}")
-
-        # Append assistant turn — strip trailing thinking blocks (API rejects them as final block)
-        content = list(response.content)
-        while content and getattr(content[-1], "type", None) == "thinking":
-            content.pop()
-        if content:
+        content_dicts, stop_reason = _call_model(messages, SYSTEM, model, max_tokens)
+
+        # Print any text the model produced
+        for block in content_dicts:
+            if block["type"] == "text" and block["text"].strip():
+                print(f"\n[Agent] {block['text'].strip()}")
+
+        # Append assistant turn — strip trailing thinking blocks (Anthropic API rejects
+        # them as the final block).  If the entire response is thinking (tokens exhausted
+        # before any output), inject synthetic turns to advance context.
+        content = list(content_dicts)
+        has_non_thinking = any(b["type"] != "thinking" for b in content)
+        if has_non_thinking:
+            while content and content[-1]["type"] == "thinking":
+                content.pop()
             messages.append({"role": "assistant", "content": content})
+        elif content:
+            # All-thinking: inject a synthetic round-trip to unblock the next call.
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": "I need to use the tools to proceed."}]})
+            messages.append({"role": "user", "content": "Please proceed and use the available tools."})
 
         # Done if no tool calls
-        if response.stop_reason == "end_turn":
-            print("\n[Agent] Claude finished.")
+        if stop_reason == "end_turn":
+            print("\n[Agent] Finished.")
             break
 
         # Execute tool calls
         tool_results = []
-        for block in response.content:
-            if block.type != "tool_use":
+        for block in content_dicts:
+            if block["type"] != "tool_use":
                 continue
 
-            print(f"\n  → {block.name}({_fmt_input(block.input)})")
-            result = execute_tool(block.name, block.input)
+            print(f"\n  → {block['name']}({_fmt_input(block['input'])})")
+            result = execute_tool(block["name"], block["input"])
             print(f"  ← {result[:300]}" + ("…" if len(result) > 300 else ""))
 
             tool_results.append({
-                "type": "tool_result",
-                "tool_use_id": block.id,
-                "content": result,
+                "type":      "tool_result",
+                "tool_use_id": block["id"],
+                "tool_name": block["name"],   # kept for Gemini message translation
+                "content":   result,
             })
             # Track test results for create-mode exit condition
-            if block.name == "run_tests" and result.startswith("PASS"):
+            if block["name"] == "run_tests" and result.startswith("PASS"):
                 tests_passed = True
 
         if tool_results:
@@ -588,8 +603,10 @@ def run_agent(max_turns: int = 30) -> None:
             break
 
         if mode == "optimize" and env.best_speedup is not None and env.best_speedup > 1.0:
-            print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x — stopping early.")
-            break
+            note_path = EXAMPLE_DIR / "optimization_note.md"
+            if note_path.exists():
+                print(f"\n[Agent] Speedup achieved: {env.best_speedup:.4f}x and optimization_note.md written — stopping.")
+                break
 
     # Final summary
     print(f"\n{'='*60}")
diff --git a/openenv/anthropic_provider.py b/openenv/anthropic_provider.py
new file mode 100644
index 00000000..35765fc9
--- /dev/null
+++ b/openenv/anthropic_provider.py
@@ -0,0 +1,71 @@
+"""Anthropic/Claude provider for agent_search."""
+
+import time
+import anthropic
+
+
+def _sanitize_messages(messages: list[dict]) -> list[dict]:
+    """Strip provider-specific fields that Anthropic rejects (e.g. tool_name in tool_result)."""
+    clean = []
+    for msg in messages:
+        content = msg["content"]
+        if isinstance(content, list):
+            new_content = []
+            for block in content:
+                if isinstance(block, dict) and block.get("type") == "tool_result":
+                    block = {k: v for k, v in block.items() if k != "tool_name"}
+                new_content.append(block)
+            clean.append({**msg, "content": new_content})
+        else:
+            clean.append(msg)
+    return clean
+
+
+def call(
+    messages: list[dict],
+    system: str,
+    model: str,
+    max_tokens: int,
+    tools: list[dict],
+) -> tuple[list[dict], str]:
+    """Call Anthropic and return (content_dicts, stop_reason)."""
+    client = anthropic.Anthropic()
+
+    for attempt in range(5):
+        try:
+            with client.messages.stream(
+                model=model,
+                max_tokens=max_tokens,
+                thinking={"type": "adaptive"},
+                system=system,
+                tools=tools,
+                messages=_sanitize_messages(messages),
+            ) as stream:
+                response = stream.get_final_message()
+            break
+        except anthropic.RateLimitError as e:
+            wait = 60 * (attempt + 1)
+            print(f"\n[Rate limit] Waiting {wait}s before retry ({attempt+1}/5)… ({e})")
+            time.sleep(wait)
+    else:
+        raise RuntimeError("Rate limit retries exhausted")
+
+    content_dicts: list[dict] = []
+    for block in response.content:
+        if block.type == "thinking":
+            entry: dict = {"type": "thinking", "thinking": getattr(block, "thinking", "")}
+            sig = getattr(block, "signature", None)
+            if sig:
+                entry["signature"] = sig
+            content_dicts.append(entry)
+        elif block.type == "text":
+            content_dicts.append({"type": "text", "text": block.text})
+        elif block.type == "tool_use":
+            content_dicts.append({
+                "type": "tool_use",
+                "id": block.id,
+                "name": block.name,
+                "input": block.input,
+            })
+
+    return content_dicts, response.stop_reason
diff --git a/openenv/gemini_provider.py b/openenv/gemini_provider.py
new file mode 100644
index 00000000..9714a424
--- /dev/null
+++ b/openenv/gemini_provider.py
@@ -0,0 +1,162 @@
+"""Google Gemini provider for agent_search."""
+
+import os
+import time
+import uuid
+
+
+def _tools_for_gemini(tools: list[dict]) -> list[dict]:
+    """Convert Anthropic-style tool defs to Gemini function declarations.
+
+    Gemini generates MALFORMED_FUNCTION_CALL when a tool declaration has an
+    empty-properties schema (e.g. build/run_tests/benchmark).  Omit the
+    parameters key entirely for no-argument tools so Gemini knows they take none.
+    """
+    decls = []
+    for t in tools:
+        decl = {"name": t["name"], "description": t["description"]}
+        schema = t.get("input_schema", {})
+        if schema.get("properties"):
+            decl["parameters"] = schema
+        decls.append(decl)
+    return [{"function_declarations": decls}]
+
+
+def _messages_to_contents(messages: list[dict]) -> list[dict]:
+    """Translate shared message history to Gemini contents format."""
+    contents = []
+    for msg in messages:
+        role = "model" if msg["role"] == "assistant" else "user"
+        raw = msg["content"]
+        if isinstance(raw, str):
+            contents.append({"role": role, "parts": [{"text": raw}]})
+            continue
+        parts: list[dict] = []
+        for block in raw:
+            btype = block.get("type") if isinstance(block, dict) else getattr(block, "type", None)
+            if btype == "text":
+                text = block["text"] if isinstance(block, dict) else block.text
+                if text:
+                    parts.append({"text": text})
+            elif btype == "tool_use":
+                name = block["name"] if isinstance(block, dict) else block.name
+                inp  = block["input"] if isinstance(block, dict) else block.input
+                fc_dict: dict = {"name": name, "args": inp or {}}
+                # Echo thought_signature back — required by Gemini thinking models.
+                sig = block.get("thought_signature") if isinstance(block, dict) else getattr(block, "thought_signature", None)
+                if sig is not None:
+                    fc_dict["thought_signature"] = sig
+                parts.append({"function_call": fc_dict})
+            elif btype == "tool_result":
+                fn_name = block.get("tool_name", block.get("tool_use_id", "tool"))
+                result  = block.get("content", "")
+                parts.append({"function_response": {"name": fn_name, "response": {"result": result}}})
+            elif btype == "gemini_thought":
+                text = block.get("text", "") if isinstance(block, dict) else getattr(block, "text", "")
+                parts.append({"thought": True, "text": text})
+            # Anthropic-style thinking blocks are not forwarded to Gemini
+        if parts:
+            contents.append({"role": role, "parts": parts})
+    return contents
+
+
+def call(
+    messages: list[dict],
+    system: str,
+    model: str,
+    max_tokens: int,
+    tools: list[dict],
+) -> tuple[list[dict], str]:
+    """Call Gemini and return (content_dicts, stop_reason)."""
+    try:
+        from google import genai as _genai
+        from google.genai import types as _gtypes
+    except ImportError:
+        raise RuntimeError("google-genai not installed — run: pip install google-genai")
+
+    api_key = os.environ.get("GOOGLE_API_KEY")
+    if not api_key:
+        raise RuntimeError("GOOGLE_API_KEY environment variable not set")
+
+    gclient = _genai.Client(api_key=api_key)
+
+    max_retries = 5
+    base_delay = 30.0
+    for attempt in range(max_retries):
+        try:
+            response = gclient.models.generate_content(
+                model=model,
+                contents=_messages_to_contents(messages),
+                config=_gtypes.GenerateContentConfig(
+                    system_instruction=system,
+                    tools=_tools_for_gemini(tools),
+                    max_output_tokens=max_tokens,
+                ),
+            )
+            break
+        except Exception as _e:
+            err_str = str(_e)
+            if "429" not in err_str and "RESOURCE_EXHAUSTED" not in err_str:
+                raise
+            if "PerDay" in err_str or "per_day" in err_str.lower():
+                raise RuntimeError(
+                    "Gemini daily quota exhausted. "
+                    "Upgrade your plan or wait until quota resets (midnight Pacific).\n"
+                    f"  Original error: {_e}"
+                ) from None
+            if attempt == max_retries - 1:
+                raise
+            import re as _re
+            delay_match = _re.search(r"retryDelay.*?(\d+)s", err_str)
+            delay = float(delay_match.group(1)) if delay_match else base_delay * (2 ** attempt)
+            print(f"\n[Gemini] Rate limited (429), retrying in {delay:.0f}s (attempt {attempt+1}/{max_retries})...")
+            time.sleep(delay)
+
+    content_dicts: list[dict] = []
+    stop_reason = "end_turn"
+    candidate = response.candidates[0] if response.candidates else None
+
+    if candidate is None:
+        fb = getattr(response, "prompt_feedback", None)
+        raise RuntimeError(
+            f"Gemini returned no candidates — prompt may have been blocked.\n"
+            f"  prompt_feedback: {fb}"
+        )
+
+    if candidate.content is None:
+        finish = str(getattr(candidate, "finish_reason", "unknown"))
+        if "MALFORMED_FUNCTION_CALL" in finish:
+            print(f"\n[Gemini] Warning: malformed function call ({finish}), nudging model to retry.")
+            return [{"type": "text", "text": "My previous tool call was malformed. I will retry with correct syntax."}], "tool_use"
+        raise RuntimeError(
+            f"Gemini candidate has no content (finish_reason={finish!r}).\n"
+            f"  The response was likely filtered. Try a different model or simplify the system prompt."
+        )
+
+    for part in (candidate.content.parts or []):
+        if getattr(part, "thought", False):
+            content_dicts.append({"type": "gemini_thought", "text": getattr(part, "text", "") or ""})
+            continue
+        fc = getattr(part, "function_call", None)
+        if fc and getattr(fc, "name", None):
+            entry: dict = {
+                "type":  "tool_use",
+                "id":    f"gemini-{uuid.uuid4().hex[:8]}",
+                "name":  fc.name,
+                "input": dict(fc.args) if fc.args else {},
+            }
+            sig = getattr(fc, "thought_signature", None)
+            if sig is not None:
+                entry["thought_signature"] = sig
+            content_dicts.append(entry)
+            stop_reason = "tool_use"
+        elif getattr(part, "text", None):
+            content_dicts.append({"type": "text", "text": part.text})
+
+    finish = str(getattr(candidate, "finish_reason", ""))
+    if "MAX_TOKENS" in finish:
+        stop_reason = "max_tokens"
+    elif "STOP" in finish and stop_reason != "tool_use":
+        stop_reason = "end_turn"
+
+    return content_dicts, stop_reason
diff --git a/openenv/rms_norm_config.toml b/openenv/rms_norm_config.toml
new file mode 100644
index 00000000..91be9e40
--- /dev/null
+++ b/openenv/rms_norm_config.toml
@@ -0,0 +1,66 @@
+mode = "create"
+
+kernel_dir  = "examples/aot/geglu_dynamic_multicore"  # reference for API/build patterns
+kernel_name = "rms_norm"
+create_file = "rms_norm_builder.py"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_rms_norm.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+create_prompt = """
+Implement RMS Layer Normalisation: y = x * rsqrt(mean(x^2) + eps) * weight
+PyTorch reference: torch.nn.functional.rms_norm(x, (n_cols,), weight, eps=1e-6)
+
+Inputs:
+  x      — fp32 tensor (batch, n_cols)
+  weight — fp32 tensor (n_cols,)  — per-element learnable scale (gamma)
+Output:
+  out    — fp32 tensor (batch, n_cols)  — normalised and scaled values
+
+The kernel signature in caller.cpp should be:
+    extern "C" void call_kernel(uint32_t blockDim, void *stream,
+                                uint8_t *x, uint8_t *weight, uint8_t *out,
+                                uint32_t batch, uint32_t n_cols)
+
+The .so produced by compile.sh should be named rms_norm_lib.so.
+
+The test script run_rms_norm.py should:
+  - Use batch=16, n_cols=1024 (or similar)
+  - Compare against torch.nn.functional.rms_norm(x, (n_cols,), weight, eps=1e-6)
+  - Check max absolute difference < 1e-3 (fp32 tolerance)
+
+The bench wrapper _bench_wrapper.py should print latency_ms=<value>.
+
+Implementation notes:
+  - Process one row per loop iteration.
+  - Load weight once before the loop (it is shared across all rows).
+  - For each row:
+      1. Load x[row, :] into a tile.
+      2. Square element-wise: tmp = x * x.
+      3. Compute row sum of tmp → scalar tile (row_sum).
+      4. Divide scalar by n_cols → mean of squares.
+      5. Add eps (store eps as a scalar constant and add it using tile.add or a scalar op).
+      6. rsqrt of the scalar → rms_inv.
+      7. Broadcast rms_inv across a row tile and multiply x by it.
+      8. Multiply element-wise by weight.
+      9. Store result to out[row, :].
+  - eps = 1e-6; represent as a tile filled with that constant for tile.add,
+    or add it to the scalar before rsqrt.
+"""
+
+# Reference files from geglu — agent should adapt these for rms_norm
+preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"]
+
+# Local PTO-DSL API files (authoritative)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings"
diff --git a/openenv/sigmoid_config.toml b/openenv/sigmoid_config.toml
new file mode 100644
index 00000000..9acc5ff7
--- /dev/null
+++ b/openenv/sigmoid_config.toml
@@ -0,0 +1,50 @@
+mode = "create"
+
+kernel_dir  = "examples/aot/geglu_dynamic_multicore"  # reference for API patterns only
+kernel_name = "sigmoid"
+create_file = "sigmoid_builder.py"
+
+build_cmd = ["bash", "compile.sh"]
+test_cmd  = ["python", "run_sigmoid.py"]
+bench_cmd = ["python", "_bench_wrapper.py"]
+
+create_prompt = """
+Implement Sigmoid activation: y = 1 / (1 + exp(-x))
+PyTorch reference: torch.sigmoid(x)
+
+Input:  fp16 tensor (batch, n_cols)
+Output: fp16 tensor (batch, n_cols) — separate output pointer
+
+The kernel signature in caller.cpp should be:
+    extern "C" void call_kernel(uint32_t blockDim, void *stream,
+                                uint8_t *x, uint8_t *y,
+                                uint32_t batch, uint32_t n_cols)
+
+The .so produced by compile.sh should be named sigmoid_lib.so.
+The test script run_sigmoid.py should compare against torch.sigmoid(x.float()).to(x.dtype).
+The bench wrapper _bench_wrapper.py should print latency_ms=<value>.
+
+Implementation hint:
+  sigmoid(x) = 1 / (1 + exp(-x))
+  -x can be computed as tile.sub(zeros, x, neg_x) where zeros = tile with 0.0.
+  Use exp(x - x) = 1.0 to derive a tile of ones (no scalar-tile broadcast available).
+  Then: tile.exp(neg_x) -> tile.add(exp_neg_x, ones) -> tile.reciprocal(...).
+
+Scalar-to-tile broadcast is NOT available — do not pass scalar values to tile ops.
+"""
+
+# These are loaded from the reference kernel_dir as examples — adapt them for sigmoid
+preload_files = ["compile.sh", "caller.cpp", "geglu_builder.py", "run_geglu.py", "_bench_wrapper.py"]
+
+# Local PTO-DSL API files (authoritative, read from repo)
+local_context_files = [
+    "ptodsl/api/pto.py",
+    "ptodsl/api/tile.py",
+    "ptodsl/api/scalar.py",
+    "ptodsl/api/control_flow.py",
+    "ptodsl/api/synchronization.py",
+]
+
+[[context_urls]]
+url   = "https://github.com/zhangstevenunity/PTOAS/blob/main/python/pto/dialects/pto.py"
+label = "PTO Python bindings"