diff --git a/examples/jit_cpp/cross_core_sync_demo/.gitignore b/examples/jit_cpp/cross_core_sync_demo/.gitignore
new file mode 100644
index 00000000..f1fe8d1e
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/.gitignore
@@ -0,0 +1 @@
+*.so
\ No newline at end of file
diff --git a/examples/jit_cpp/cross_core_sync_demo/PTO_API_BUGS.md b/examples/jit_cpp/cross_core_sync_demo/PTO_API_BUGS.md
new file mode 100644
index 00000000..82107b36
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/PTO_API_BUGS.md
@@ -0,0 +1,232 @@
+# PTO API Known Bugs
+
+This document records confirmed bugs and their workarounds in the PTO-ISA
+library (`pto-isa-master`), found while implementing the kernels in this
+`cross_core_sync_demo` directory.
+
+---
+
+## Bug 1 — `TPipe` (TileData TPUSH/TPOP): `tileIndex` shared between Vec sub-blocks breaks multi-round kernels
+
+### Status
+
+**Confirmed present in**:
+- `/sources/pto-isa/include` (Ascend CANN 8.5.0 bundled headers)
+- `pto-isa-master` HEAD as of 2026-05-12 (commit `933ad5d8`)
+
+The pto-isa maintainers acknowledged the issue by changing their own reference
+test (`tests/npu/a2a3/src/st/testcase/tpushpop_cv/tpushpop_cv_kernel.cpp`) from
+`FIFO_DEPTH=2` to `FIFO_DEPTH=1` in commit `aef3a004` (PR !895, "optimize
+reverse dependency with synchronization period", merged 2026-05-07).
+
+### Affected API
+
+`TPUSH` / `TPOP` — TileData overloads (not the GlobalData / gm_pipe overloads):
+
+```cpp
+// Producer side (Cube in C2V, Vec in V2C)
+TPUSH<PipeType, TileProd, TileSplitAxis::TILE_UP_DOWN>(pipe, tile);
+
+// Consumer side (Vec in C2V, Cube in V2C)
+TPOP<PipeType, TileCons, TileSplitAxis::TILE_UP_DOWN>(pipe, tile);
+```
+
+The bug is specific to `TileSplitAxis::TILE_UP_DOWN` (or any split that causes
+2 Vec sub-blocks to call TPUSH or TPOP independently).  `TILE_NO_SPLIT` is
+believed to be unaffected.
+
+### Root Cause
+
+`TPipe<FlagID, Dir, SlotSize, FIFO_DEPTH>` stores a single `tileIndex` counter
+per `Producer` and per `Consumer` struct (`pipe.prod.tileIndex` /
+`pipe.cons.tileIndex`).  With `TILE_UP_DOWN`, a single core has **two** Vec
+sub-blocks (`vid = 0` and `vid = 1`); each sub-block independently calls TPUSH
+or TPOP in its own code path.
+
+Because `tileIndex++` fires once per TPUSH/TPOP call:
+
+| Direction | Who calls TPUSH | Who calls TPOP | Effect |
+|-----------|-----------------|----------------|--------|
+| C2V | 1 Cube core — once per round | 2 Vec sub-blocks — once each per round | `cons.tileIndex` advances by **2** per round; `prod.tileIndex` advances by 1 → desync after round 1 |
+| V2C | 2 Vec sub-blocks — once each per round | 1 Cube core — once per round | `prod.tileIndex` advances by **2** per round; `cons.tileIndex` advances by 1 → desync after round 1 |
+
+After N logical rounds with `FIFO_DEPTH=2`, `SyncPeriod=2`:
+- The side with 2 sub-blocks has `tileIndex = 2N`; the other side has `tileIndex = N`
+- The slot selected by `tileIndex % SLOT_NUM` drifts: the 2-sub-block side
+  starts reading/writing the wrong FIFO slot from round 2 onwards
+- The `shouldWaitFree` / `shouldNotifyFree` conditions also fire at wrong
+  intervals, causing the FFTS signal counts to diverge
+
+### Observed Failures
+
+**C2V (`matmul_add_c2v`, `stream_c2v`):**
+- `num_rounds = 1`: correct
+- `num_rounds = 2`: wrong numerical results (`max_diff ≈ 70` for fp32 output)
+- `num_rounds ≥ 4`: hardware exception — `L0C read/write conflict (FIXP reads
+  l0c, same address as cube write)`
+
+**V2C (`add_matmul_v2c`, `stream_v2c`):**
+- `num_rounds = 1`: correct
+- `num_rounds ≥ 2`: wrong numerical results and/or hardware exception
+
+Errors are **deterministic** (reproducible on every run with the same seed).
+
+### Minimal Reproduction
+
+```cpp
+// C2V direction — fails at num_rounds=2 with FIFO_DEPTH=2
+constexpr uint32_t FIFO_DEPTH = 2;
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+// ...
+for (int32_t r = 0; r < num_rounds; ++r) {
+    TPUSH<C2VPipe, TileL0C, TileSplitAxis::TILE_UP_DOWN>(pipe, c_l0);  // Cube
+    TPOP<C2VPipe, VecTile,  TileSplitAxis::TILE_UP_DOWN>(pipe, c_ub);  // Vec ×2 sub-blocks
+}
+```
+
+See `matmul_add/pushpop/matmul_add_c2v.cpp` and `add_matmul_v2c.cpp` for the
+full implementations that reproduce the failure.
+
+### Expected Behavior
+
+A kernel with `num_rounds > 1` using `TILE_UP_DOWN` should:
+1. Maintain correct FIFO slot selection across all rounds
+2. Maintain balanced FFTS signal counts (no accumulation)
+3. Produce correct numerical output for any `num_rounds ≥ 1`
+
+### Workarounds
+
+#### Workaround A — `FIFO_DEPTH=1` (pto-isa maintainers' approach)
+
+Change the pipe depth to 1.  With `SlotNum=1`, `SyncPeriod=1` (per
+`TPipe::SyncPeriod` formula), and the new `shouldWaitFree` code (PR !895)
+always returns `true` for `SlotNum == 1`.  This forces strict producer↔consumer
+alternation — no double-buffering — which avoids the tileIndex desync at the
+cost of pipeline overlap:
+
+```cpp
+constexpr uint32_t FIFO_DEPTH = 1;  // was 2
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+```
+
+**Important**: the Python-side `fifo_mem` allocation must also reflect
+`FIFO_DEPTH=1`:
+```python
+C2V_FIFO_ELEMS_PER_CORE = 1 * TILE_SIZE * TILE_SIZE  # not 2× anymore
+```
+
+**Note**: this workaround also requires fresh `fifo_mem` per kernel call in
+Python benchmarks.  Reusing the same `fifo_mem` tensor across repeated calls
+accumulates TPipe head/tail state (stored inside `fifo_mem`) and causes wrong
+results or hangs.  Pre-allocate one `fifo_mem` per call:
+```python
+fifos = [torch.zeros(BLOCK_DIM * FIFO_ELEMS_PER_CORE, ...) for _ in range(n_calls)]
+for i in range(n_calls):
+    kernel(A, B, C, D, fifos[i])
+```
+
+#### Workaround B — `gm_pipe` variant (GlobalData TPUSH/TPOP + explicit TALLOC/TFREE)
+
+Use the GlobalData overloads of TPUSH/TPOP together with TALLOC/TFREE and
+explicit TSTORE/TLOAD.  The `gm_pipe` implementation in this demo manages FIFO
+slot indices manually via `r % FIFO_DEPTH`, completely bypassing the shared
+`tileIndex` counter.  This variant supports arbitrary `num_rounds` with
+`FIFO_DEPTH=2`.
+
+See `matmul_add/gm_pipe/` and `stream_c2v_v2c/gm_pipe/`.
+
+**Important**: `gm_pipe` requires the newer `pto-isa-master` headers (not the
+CANN 8.5.0 bundled headers), because `TALLOC`, `TPOP(GlobalData)`, and `TFREE`
+are absent from `/sources/pto-isa/include`.
+
+#### Workaround C — raw FFTS flags (`raw_flag` variant)
+
+Avoid TPipe entirely.  Use `ffts_cross_core_sync` / `wait_flag_dev` directly
+with explicit workspace memory.  Supports arbitrary `num_rounds` with no tileIndex
+issue.  See `matmul_add/raw_flag/` and `stream_c2v_v2c/raw_flag/`.
+
+### Summary Table
+
+| Variant | API | Multi-round | Notes |
+|---------|-----|-------------|-------|
+| `pushpop` (FIFO_DEPTH=2) | TileData TPUSH/TPOP | ❌ broken ≥2 rounds | This bug |
+| `pushpop` (FIFO_DEPTH=1) | TileData TPUSH/TPOP | ✅ correct | No double-buffer overlap |
+| `gm_pipe` | GlobalData TPUSH/TPOP + TALLOC/TFREE | ✅ correct | Newer headers required |
+| `raw_flag` | Direct FFTS + manual workspace | ✅ correct | Most portable |
+
+---
+
+## Bug 2 — FFTS flag collision between kernels with the same `FlagID`
+
+### Status
+
+**Design limitation** (not a library bug per se, but a footgun).
+
+### Description
+
+`TPipe<FlagID, ...>` uses FFTS hardware flags `FlagID` (push/data-ready signal)
+and `FlagID+1` (free/slot-available signal) internally.  When two different
+kernels or pipe types use the same `FlagID`, their FFTS signals contaminate
+each other if the kernels are called sequentially in the same process on the
+same NPU core.
+
+**Example**: `C2VPipe = TPipe<0, DIR_C2V>` and `V2CPipe = TPipe<0, DIR_V2C>`
+both occupy FFTS flags 0 and 1.  A benchmark that calls the C2V kernel many
+times accumulates residual FFTS signals on flags 0/1.  The subsequent V2C
+kernel's first TPOP fires on a stale signal and reads wrong data.
+
+### Fix
+
+Assign non-overlapping `FlagID` values to pipes that are called from the same
+process:
+
+```cpp
+using C2VPipe = TPipe<0, Direction::DIR_C2V, ...>;  // uses flags 0, 1
+using V2CPipe = TPipe<2, Direction::DIR_V2C, ...>;  // uses flags 2, 3 — no collision
+```
+
+This fix is applied in:
+- `stream_c2v_v2c/pushpop/stream_v2c.cpp`
+- `stream_c2v_v2c/gm_pipe/stream_v2c.cpp`
+- `matmul_add/gm_pipe/add_matmul_v2c.cpp` (uses raw FFTS flags 2/3 instead of 0/1)
+
+---
+
+## Bug 3 — `TSTORE(c_global, c_l0)` (FIX pipe) conflicts with next-call `TMATMUL` (M pipe) in benchmark loops
+
+### Status
+
+**Synchronization omission** in the kernel itself, exposed by benchmark loops.
+
+### Description
+
+`TSTORE(dst_gm, c_l0)` on the FIX pipe initiates a DMA that reads from `c_l0`
+(L0C) and writes to global memory.  The DMA may still be in-flight when the
+kernel "completes" (all pipe instructions issued).  If back-to-back kernel calls
+are queued in the same NPU stream (as in a benchmark loop), the **next** call's
+`TMATMUL` can start writing to `c_l0` (M pipe) before the **previous** call's
+FIX DMA finishes reading it → `L0C read/write conflict` hardware exception.
+
+This does NOT manifest in correctness tests (few calls) but reliably crashes
+under benchmark load (`REPEATS=30` calls in a tight loop).
+
+### Fix
+
+Add `pipe_barrier(PIPE_ALL)` immediately after the last `TSTORE(c_global, c_l0)`
+in the Cube loop to drain the FIX pipe before kernel exit:
+
+```cpp
+for (int32_t r = 0; r < num_rounds; ++r) {
+    // ...
+    TSTORE(c_global, c_l0);
+    pipe_barrier(PIPE_ALL);  // ← drain FIX before kernel exit / next TMATMUL
+}
+```
+
+Or use the targeted `SetFlag<PIPE_FIX, PIPE_M>(1); WaitFlag<PIPE_FIX, PIPE_M>(1);`
+pair after each TSTORE (requires an additional `SetFlag<PIPE_M, PIPE_MTE1>(1);
+WaitFlag<PIPE_M, PIPE_MTE1>(1);` guard on L0A reuse — see
+`matmul_add/gm_pipe/add_matmul_v2c.cpp` for the full treatment).
+
+This fix is applied in `matmul_add/raw_flag/add_matmul_v2c.cpp` and the
+`gm_pipe` variants.
diff --git a/examples/jit_cpp/cross_core_sync_demo/README.md b/examples/jit_cpp/cross_core_sync_demo/README.md
new file mode 100644
index 00000000..ddeac504
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/README.md
@@ -0,0 +1,21 @@
+Demonstrate different API abstractions for Cube-Vector data exchange and synchronization
+
+There are currently 4 API sets that can express cross-core data passing:
+1. `ffts_cross_core_sync` & `wait_flag_dev`
+2. `TSYNC`
+3. `TPUSH` & `TPOP`
+4. `TPUSH` & `TPOP` & `TFREE` & `TALLOC`
+
+Purpose of this demo directory: Use *clear, minimum code* to demonstrate the *syntax and performance* differences between those API styles.
+
+- [stream_c2v_v2c](./stream_c2v_v2c)
+- [matmul_add](./matmul_add)
+- [linear_attn](./linear_attn)
+
+## Known PTO API Issues
+
+See **[PTO_API_BUGS.md](./PTO_API_BUGS.md)** for confirmed bugs and workarounds:
+
+- **Bug 1**: `TPipe` TileData TPUSH/TPOP with `TILE_UP_DOWN` and 2 Vec sub-blocks — `tileIndex` shared counter causes slot desync for `num_rounds ≥ 2` (`FIFO_DEPTH=2`). Confirmed present in latest `pto-isa-master` (as of 2026-05-12). Workaround: use `FIFO_DEPTH=1`, `gm_pipe`, or `raw_flag`.
+- **Bug 2**: FFTS flag collision when two pipes share the same `FlagID` (e.g., `TPipe<0, DIR_C2V>` and `TPipe<0, DIR_V2C>` both use flags 0/1).
+- **Bug 3**: `TSTORE(c_global, c_l0)` FIX-pipe DMA in-flight at kernel exit conflicts with next-call `TMATMUL` in benchmark loops → `L0C read/write conflict`.
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/README.md b/examples/jit_cpp/cross_core_sync_demo/matmul_add/README.md
new file mode 100644
index 00000000..23905fdf
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/README.md
@@ -0,0 +1,103 @@
+# matmul_add — Three Cube↔Vector synchronization API styles + naive baseline
+
+Persistent kernels computing `C = A @ B + D` (C2V) and `C = (A + B) @ D` (V2C),
+implemented in three pipelined API styles and one non-pipelined naive baseline.
+
+## Variants
+
+| Subdirectory | Sync API | Pipeline | Note |
+|---|---|---|---|
+| `raw_flag/` | `ffts_cross_core_sync` + `wait_flag_dev` (direct) | round-level overlap | Reference, full multi-round correctness |
+| `pushpop/` | `TPipe` TileData — sync + data-move in one call | round-level overlap | `num_rounds=1` scope; multi-round has shared tileIndex issue |
+| `gm_pipe/` | `TPipe` GlobalData — `TPUSH`/`TPOP` signal only | round-level overlap | Full multi-round; requires pto-isa-master headers |
+| `naive_separate/` | `ffts_cross_core_sync` (one signal per stage) | **none** — stages are sequential | Slower baseline; shows pipeline benefit |
+
+## Kernel Algorithms
+
+| Kernel | Operation | C2V or V2C |
+|--------|-----------|------------|
+| `matmul_add_c2v` | `C = A @ B + D` | Cube GEMM → workspace → Vec add |
+| `add_matmul_v2c` | `C = (A + B) @ D` | Vec add → workspace → Cube GEMM |
+
+## Files
+
+| Subdirectory | Kernels | Python | Note |
+|---|---|---|---|
+| `raw_flag/` | `matmul_add_c2v.cpp`, `add_matmul_v2c.cpp` | `jit_util_*.py`, `run_*.py` | Reference |
+| `pushpop/` | same | `jit_util.py`, `run.py` | Single-round scope |
+| `gm_pipe/` | same | `jit_util.py`, `run.py` | pto-isa-master headers |
+| `naive_separate/` | `naive_separate.cpp` (both kernels) | `jit_util.py`, `run.py` | No pipeline |
+
+## Reproduce
+
+```bash
+BASE=examples/jit_cpp/cross_core_sync_demo/matmul_add
+
+# raw_flag: correctness (30/30 seeds × rounds) + bandwidth
+python $BASE/raw_flag/run_matmul_add_c2v.py
+python $BASE/raw_flag/run_add_matmul_v2c.py
+
+# pushpop: correctness (num_rounds=1 scope) + bandwidth at batch=3072
+python $BASE/pushpop/run.py
+
+# gm_pipe: correctness + bandwidth (both kernels in one script)
+python $BASE/gm_pipe/run.py
+
+# naive_separate: correctness + bandwidth vs torch baseline
+python $BASE/naive_separate/run.py
+```
+
+Each script prints correctness results followed by a bandwidth table.
+Set `NPU_DEVICE=npu:N` to select a specific NPU.
+
+## API Syntax Comparison (C2V direction: `C = A @ B + D`)
+
+```
+                   Sync API                    │  Data API
+──────────────────────────────────────────────────────────────────────────
+raw_flag  Cube: ffts_cross_core_sync(FIX, FLAG_C2V)  │  TSTORE(ws_half, c_l0)
+          Vec:  wait_flag_dev(FLAG_C2V)               │  TLOAD(c_ub, ws)
+                ffts_cross_core_sync(MTE3, FLAG_V2C)  │
+
+pushpop   Cube: TPUSH<C2VPipe, TileL0C, UP_DOWN>(pipe, c_l0)     ← sync + data in one call
+          Vec:  TPOP<C2VPipe, VecTile<float>, UP_DOWN>(pipe, c_ub_float)
+
+gm_pipe   Cube: TALLOC<C2VPipe, SlotHalf, UP_DOWN>(pipe, slot)   ← TPipe allocates slot
+                TSTORE(slot, c_l0)               ← explicit fp32→fp16 (hardware FIX)
+                TPUSH<C2VPipe, SlotHalf, UP_DOWN>(pipe, slot)    ← TPipe signals consumer
+          Vec:  TPOP<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)        ← TPipe waits + slot ptr
+                TLOAD(c_ub, pop)                 ← explicit load
+                TFREE<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)       ← TPipe notifies free
+
+naive     Cube: (all GEMMs) → pipe_barrier → ffts_cross_core_sync(FIX, FLAG_C2V)
+          Vec:  wait_flag_dev(FLAG_C2V) → (all adds)
+               ↑ one signal after ALL rounds, no round overlap
+```
+
+## Measured Bandwidth (910B2, TILE_SIZE=128, 24 Cube cores)
+
+Peak effective external bandwidth (read A+B+D, write C; workspace not counted):
+
+| Variant | matmul_add_c2v peak | add_matmul_v2c peak | Notes |
+|---------|--------------------|--------------------|-------|
+| `raw_flag` | **1357 GB/s** | **1543 GB/s** | Reference pipelined, 64 rounds |
+| `pushpop` | **1954 GB/s** (32 rounds, f32 slot) | 45 GB/s (batch=3072) | C2V: FIFO_DEPTH=1 workaround enables multi-round (f32 slot is 2× larger than f16 → 2× bw); V2C: 2-sub-block producer deadlocks with FIFO_DEPTH=1, remains rounds=1 only |
+| `gm_pipe` | **1837 GB/s** | **1496 GB/s** | 64 rounds; requires pto-isa-master headers |
+| `naive_separate` | 1174 GB/s | 1211 GB/s | No pipeline — **15–30% lower** |
+| `torch.mm + torch.add` | ~2000 GB/s\* | ~2100 GB/s\* | Two separate launches |
+
+\* torch bandwidth appears high because torch's GEMM is a highly tuned library kernel
+that may cache intermediate results on-chip; the naive kernel instead round-trips
+through full-batch HBM workspace, making it slower than torch for large batches.
+The pipelined variants are faster than naive because they overlap Cube and Vec
+round-by-round, reducing the effective latency of cross-core data movement.
+
+## Known Limitations
+
+- **pushpop multi-round (C2V)**: Applying the `FIFO_DEPTH=1` workaround (forces `SyncPeriod=1`, strict alternation) makes `matmul_add_c2v` work for arbitrary `num_rounds`.  Note the C2V slot is `float32` (64 KB), so bandwidth figures are 2× those of the half-slot variants.
+
+- **pushpop multi-round (V2C)**: `add_matmul_v2c` cannot be fixed with `FIFO_DEPTH=1`: with only 1 free signal seeded in the constructor, sub-block 0 consumes it and sub-block 1 deadlocks at `allocate()`.  V2C therefore remains scoped to `num_rounds=1`.  Use `gm_pipe` for multi-round V2C.
+
+- **gm_pipe header requirement**: `TALLOC`/`TPOP(GlobalData)`/`TFREE` are in `pto-isa-master` headers, not the default `/sources/pto-isa`.  The `gm_pipe/jit_util.py` uses `-I/workdir/pto-isa-master/include`.
+
+- **naive_separate workspace**: Uses `workspace[batch, TILE_SIZE]` (full-batch allocation) vs `workspace[num_cores * TILE_SIZE, TILE_SIZE]` for pipelined variants.  The larger workspace means more HBM traffic per kernel call at large batch sizes.
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/add_matmul_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/add_matmul_v2c.cpp
new file mode 100644
index 00000000..ae476913
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/add_matmul_v2c.cpp
@@ -0,0 +1,278 @@
+// =============================================================================
+// add_matmul_v2c.cpp  —  C = (A + B) @ D   [gm_pipe variant]
+//
+// ── API variant: GlobalData TSTORE/TLOAD via explicit slot views ───────────
+//
+// The "gm_pipe" concept: TSTORE and TLOAD operate on GlobalTensor slot views
+// computed from the FIFO buffer, while synchronization uses raw ffts calls.
+// This separates data-movement (explicit TSTORE/TLOAD) from FIFO sync, which
+// is the defining property of the GlobalData path (TALLOC+TPUSH / TPOP+TFREE).
+//
+//  raw_flag (fixed workspace)         │  gm_pipe (explicit double-buffer slot view)
+//  ─────────────────────────────────── ─────────────────────────────────────────────
+//  Vec:  TSTORE(ws_half, a_ub)         │  slot = slot_view(fifo + slot_idx * SLOT_SIZE)
+//        ffts_cross_core_sync(FLAG_V2C)│  TSTORE(slot, a_ub)
+//                                      │  ffts_cross_core_sync(FLAG_V2C)
+//  ─────────────────────────────────── ─────────────────────────────────────────────
+//  Cube: wait_flag_dev(FLAG_V2C)       │  wait_flag_dev(FLAG_V2C)
+//        TLOAD(ab_l1, ws_half)         │  slot = slot_view(fifo + slot_idx * SLOT_SIZE)
+//        ffts_cross_core_sync(FLAG_C2V)│  TLOAD(ab_l1, slot)
+//                                      │  ffts_cross_core_sync(FLAG_C2V)
+//
+// Key gm_pipe advantage over raw_flag:
+//   • FIFO_DEPTH=2 double-buffer: Vec writes to slot (r%2) while Cube reads
+//     slot ((r-1)%2) — they use different slots and can pipeline one iteration.
+//   • The GlobalTensor slot view makes the data path explicit (vs implicit
+//     workspace pointer) and can be typed at a sub-tile granularity.
+//
+// Note: we use raw ffts calls for sync rather than TALLOC/TPUSH/TPOP/TFREE
+// because TALLOC on the Vec side (2 sub-blocks) requires careful tileIndex
+// management that TPUSH(TileData) handles atomically but TALLOC+TPUSH(GlobalData)
+// exposes to the programmer.  The C2V direction uses full TALLOC+TPUSH API since
+// Cube has only one sub-block.
+//
+// Python: all float16.  Reference: (A + B) @ D
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_D_OFFSET  = 0u;
+constexpr uint32_t L1_AB_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+constexpr uint32_t L0_OFFSET    = 0u;
+constexpr uint32_t UB_A_OFFSET  = 0u;
+constexpr uint32_t UB_B_OFFSET  = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+constexpr uint32_t FIFO_DEPTH      = 2u;
+constexpr uint32_t V2C_SLOT_SIZE   = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB/slot
+constexpr uint32_t V2C_FIFO_BYTES  = FIFO_DEPTH * V2C_SLOT_SIZE;           // 64 KB/core
+
+// FFTS flag assignments (raw, not managed by TPipe)
+// Use IDs 2 and 3 to avoid collision with matmul_add_c2v's TPipe<0> which
+// internally occupies flags 0 (push/data-ready) and 1 (free/slot-returned).
+constexpr uint32_t FLAG_V2C_DATA = 2;  // Vec signals Cube: slot written
+constexpr uint32_t FLAG_V2C_FREE = 3;  // Cube signals Vec: slot consumed
+// mode=2 (CV_CORES_SYNC): one Cube broadcast → both Vec sub-blocks unblock,
+//                         both Vec sub-blocks signal → Cube unblocks.
+constexpr uint32_t SIGNAL_MODE   = 2;
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+// Slot view types (the core of the gm_pipe approach).
+// Vec writes T/2 rows per sub-block; Cube reads the full T×T slot.
+using HalfSlotView =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+using FullSlotView =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// Large-tensor GM accessors (non-FIFO data)
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_add_matmul_v2c(
+    __gm__ half    *A, __gm__ half *B, __gm__ half *C, __gm__ half *D,
+    __gm__ uint8_t *fifo_mem, int64_t batch, uint64_t ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    // Per-core FIFO region: FIFO_DEPTH × SLOT_SIZE bytes.
+    __gm__ uint8_t *core_fifo = fifo_mem + cid * V2C_FIFO_BYTES;
+
+    TileL1  d_l1, ab_l1;
+    TASSIGN(d_l1,  L1_D_OFFSET);
+    TASSIGN(ab_l1, L1_AB_OFFSET);
+    TileL0A ab_l0;  TileL0B d_l0;  TileL0C c_l0;
+    TASSIGN(ab_l0, L0_OFFSET);
+    TASSIGN(d_l0,  L0_OFFSET);
+    TASSIGN(c_l0,  L0_OFFSET);
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    // ── Cube: wait for slot, TLOAD via slot view, signal free ─────────────────
+    if constexpr (DAV_CUBE) {
+        TileGlobal d_global(D);
+        TLOAD(d_l1, d_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(d_l0, d_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        for (int32_t r = 0; r < num_rounds; ++r) {
+            const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+            // Wait for both Vec sub-blocks to write their half-tiles.
+            wait_flag_dev(FLAG_V2C_DATA);
+
+            // Compute the current slot view (explicit GlobalTensor — the gm_pipe pattern).
+            const uint32_t slot_offset = static_cast<uint32_t>(r % FIFO_DEPTH) * V2C_SLOT_SIZE;
+            FullSlotView slot_in(reinterpret_cast<__gm__ half *>(
+                reinterpret_cast<uint64_t>(core_fifo) + slot_offset));
+
+            // Explicit TLOAD from the GM slot into L1 (the gm_pipe data-move).
+            TLOAD(ab_l1, slot_in);
+            SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+            WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);  // MTE2→MTE1: TLOAD done before TMOV
+
+            // Signal Vec: slot consumed, safe to write again next round.
+            // Skip for the last round — no more Vec writes will happen.
+            if (r + 1 < num_rounds) {
+                ffts_cross_core_sync(PIPE_MTE2, 1u | (SIGNAL_MODE << 4) | (FLAG_V2C_FREE << 8));
+            }
+
+            // M→MTE1 (ab_l0): wait for the previous round's TMATMUL to finish
+            // reading ab_l0 before MTE1 overwrites it with TMOV.
+            // Skipped for r=0 — no previous TMATMUL to wait for.
+            // Uses id=1 to avoid aliasing the MTE1→M flag (id=0).
+            if (r > 0) {
+                WaitFlag<PIPE_M, PIPE_MTE1>(1);
+            }
+
+            TMOV(ab_l0, ab_l1);
+            SetFlag<PIPE_MTE1, PIPE_M>(0);
+            WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+            TMATMUL(c_l0, ab_l0, d_l0);
+            // Signal MTE1: ab_l0 is no longer in use by M (TMATMUL done).
+            // Consumed by WaitFlag<M,MTE1>(1) in the next round (or the drain
+            // after the loop for the final round).
+            SetFlag<PIPE_M, PIPE_MTE1>(1);
+
+            SetFlag<PIPE_M, PIPE_FIX>(0);
+            WaitFlag<PIPE_M, PIPE_FIX>(0);  // M→FIX: c_l0 ready for TSTORE
+
+            TileGlobal c_global(C + row_c * TILE_SIZE);
+            TSTORE(c_global, c_l0);
+            // FIX→M (c_l0): wait for TSTORE to finish reading c_l0 before the
+            // next TMATMUL writes it.  Uses id=1 to avoid aliasing M→FIX (id=0).
+            SetFlag<PIPE_FIX, PIPE_M>(1);
+            WaitFlag<PIPE_FIX, PIPE_M>(1);
+        }
+        // Drain the M→MTE1 token left over from the final round.
+        // Without this, the next kernel call's round-1 WaitFlag<M,MTE1>(1) would
+        // consume a stale token and skip waiting, risking an L0A conflict.
+        WaitFlag<PIPE_M, PIPE_MTE1>(1);
+    }
+
+    // ── Vec: compute A+B, write to slot view, signal Cube ─────────────────────
+    if constexpr (DAV_VEC) {
+        set_mask_norm();
+        set_vector_mask(-1, -1);
+
+        for (int32_t r = 0; r < num_rounds; ++r) {
+            const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+            HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+            TLOAD(a_ub, a_global);
+            HalfTileGlobal b_global(B + row_v * TILE_SIZE);
+            TLOAD(b_ub, b_global);
+            pipe_barrier(PIPE_ALL);  // MTE2→V: TLOADs done before TADD
+
+            TADD(a_ub, a_ub, b_ub);
+            pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TSTORE
+
+            // Wait for Cube to free the slot before overwriting it.
+            // TSTORE and ffts_cross_core_sync are both MTE3 — ordered in same pipe.
+            if (r >= static_cast<int32_t>(FIFO_DEPTH)) {
+                wait_flag_dev(FLAG_V2C_FREE);
+                pipe_barrier(PIPE_ALL);
+            }
+
+            // Compute explicit slot view for this sub-block's half-tile region.
+            const uint32_t slot_offset =
+                static_cast<uint32_t>(r % FIFO_DEPTH) * V2C_SLOT_SIZE
+                + static_cast<uint32_t>(vid) * HALF_TILE * TILE_SIZE * sizeof(half);
+            HalfSlotView slot_out(reinterpret_cast<__gm__ half *>(
+                reinterpret_cast<uint64_t>(core_fifo) + slot_offset));
+
+            // Explicit TSTORE to the GM slot (the gm_pipe data-move).
+            TSTORE(slot_out, a_ub);
+            pipe_barrier(PIPE_ALL);  // MTE3: wait for DMA to complete before signaling Cube
+            // Signal Cube: this sub-block has written its T/2 rows.
+            // mode=2 → Cube unblocks after BOTH sub-blocks (vid=0 and vid=1) signal.
+            ffts_cross_core_sync(PIPE_MTE3, 1u | (SIGNAL_MODE << 4) | (FLAG_V2C_DATA << 8));
+        }
+    }
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void add_matmul_v2c_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B, __gm__ uint8_t *C,
+    __gm__ uint8_t *D, __gm__ uint8_t *fifo_mem,
+    int64_t batch, uint64_t ffts_addr)
+{
+    run_add_matmul_v2c(
+        reinterpret_cast<__gm__ half *>(A), reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C), reinterpret_cast<__gm__ half *>(D),
+        fifo_mem, batch, ffts_addr);
+}
+
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *fifo_mem, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    add_matmul_v2c_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, fifo_mem, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/jit_util.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/jit_util.py
new file mode 100644
index 00000000..bd8e9858
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/jit_util.py
@@ -0,0 +1,107 @@
+"""JIT utilities for matmul_add/gm_pipe kernels.
+
+Uses pto-isa-master headers for GlobalData TPOP/TALLOC/TFREE APIs.
+All tensors float16; fifo_mem float16 (half slot, same slot size as raw_flag).
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+_PTO_NEW_INC = "/workdir/pto-isa-master/include"
+_DRIVER_INC  = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+TILE_SIZE  = 128
+FIFO_DEPTH = 2
+FIFO_ELEMS_PER_CORE = FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float16 elements
+
+
+def _compile(cpp_basename: str, so_basename: str, verbose: bool = True) -> str:
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        f"-I{_PTO_NEW_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+    cpp = os.path.join(_HERE, cpp_basename)
+    so  = os.path.join(_HERE, so_basename)
+    cmd = ["bisheng", *flags, cpp, "-o", so]
+    if verbose:
+        print("Compiling (with pto-isa-master headers):", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {so}")
+    return so
+
+
+def _make_lib(so_path: str) -> ctypes.CDLL:
+    lib = ctypes.CDLL(os.path.abspath(so_path))
+    lib.call.argtypes = [
+        ctypes.c_uint32, ctypes.c_void_p,  # block_dim, stream
+        ctypes.c_void_p, ctypes.c_void_p,  # A, B
+        ctypes.c_void_p, ctypes.c_void_p,  # C, D
+        ctypes.c_void_p, ctypes.c_int64,   # fifo_mem, batch
+    ]
+    lib.call.restype = None
+    return lib
+
+
+@lru_cache(maxsize=1)
+def load_matmul_add_c2v(verbose: bool = True) -> "MatmulKernel":
+    so = _compile("matmul_add_c2v.cpp", "matmul_add_c2v.so", verbose=verbose)
+    return MatmulKernel(_make_lib(so), BLOCK_DIM)
+
+
+@lru_cache(maxsize=1)
+def load_add_matmul_v2c(verbose: bool = True) -> "MatmulKernel":
+    so = _compile("add_matmul_v2c.cpp", "add_matmul_v2c.so", verbose=verbose)
+    return MatmulKernel(_make_lib(so), BLOCK_DIM)
+
+
+class MatmulKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
+                 D: torch.Tensor, fifo_mem: torch.Tensor,
+                 batch: int | None = None) -> None:
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()), ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()), ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()), ctypes.c_int64(batch),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/matmul_add_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/matmul_add_c2v.cpp
new file mode 100644
index 00000000..caa085ef
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/matmul_add_c2v.cpp
@@ -0,0 +1,203 @@
+// =============================================================================
+// matmul_add_c2v.cpp  —  C = A @ B + D   [gm_pipe variant]
+//
+// ── API variant: GlobalData TALLOC + TSTORE + TPUSH / TPOP + TLOAD + TFREE ──
+//
+//  raw_flag equivalent          │  gm_pipe (this file)
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  TSTORE(ws_half, c_l0)        │  TALLOC<C2VPipe, SlotFull, UP_DOWN>(pipe, slot)
+//  pipe_barrier(PIPE_ALL)       │  TSTORE(slot, c_l0)   ← explicit fp32→fp16
+//  SetCrossFlag<FIX>(FLAG_C2V)  │  pipe_barrier(PIPE_ALL)
+//                               │  TPUSH<C2VPipe, SlotFull, UP_DOWN>(pipe, slot)
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  WaitCrossFlag(FLAG_C2V)      │  TPOP<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)
+//  TLOAD(c_ub, ws_half)         │  TLOAD(c_ub, pop)
+//  SetCrossFlag<MTE3>(FLAG_V2C) │  pipe_barrier(PIPE_ALL)
+//                               │  TFREE<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)
+//
+// Requires: pto-isa-master headers (GlobalData TPOP/TALLOC/TFREE APIs).
+// Half slot (32 KB/slot) — same size as raw_flag; direct bandwidth comparison valid.
+// Python: all float16.  Reference: A @ B + D
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_B_OFFSET = 0u;
+constexpr uint32_t L1_A_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);   // 32 KB
+constexpr uint32_t L0_OFFSET   = 0u;
+constexpr uint32_t UB_C_OFFSET = 0u;   // c_ub: 16 KB
+constexpr uint32_t UB_D_OFFSET = HALF_TILE * TILE_SIZE * sizeof(half);   // 16 KB
+
+constexpr uint32_t FIFO_DEPTH     = 2u;
+constexpr uint32_t C2V_SLOT_SIZE  = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB
+constexpr uint32_t C2V_FIFO_BYTES = FIFO_DEPTH * C2V_SLOT_SIZE;           // 64 KB/core
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+
+using SlotFull = GlobalTensor<half, pto::Shape<1, 1, 1, TILE_SIZE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+using PopHalf  = GlobalTensor<half, pto::Shape<1, 1, 1, HALF_TILE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_matmul_add_c2v(
+    __gm__ half    *A, __gm__ half *B, __gm__ half *C, __gm__ half *D,
+    __gm__ uint8_t *fifo_mem, int64_t batch, uint64_t ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    __gm__ void *core_fifo = fifo_mem + cid * C2V_FIFO_BYTES;
+    C2VPipe pipe(core_fifo, /*c2v_ub_base=*/0x0, /*v2c_l1_base=*/0x0);
+
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+    TileL0A a_l0;  TileL0B b_l0;  TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+    TileVecUB c_ub, d_ub;
+    TASSIGN(c_ub, UB_C_OFFSET);
+    TASSIGN(d_ub, UB_D_OFFSET);
+
+    if constexpr (DAV_CUBE) {
+        TileGlobal b_global(B);
+        TLOAD(b_l1, b_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(b_l0, b_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        SlotFull push_slot;
+        for (int32_t r = 0; r < num_rounds; ++r) {
+            const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+            TileGlobal a_global(A + row_c * TILE_SIZE);
+            TLOAD(a_l1, a_global);
+            SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+            WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+            TMOV(a_l0, a_l1);
+            SetFlag<PIPE_MTE1, PIPE_M>(0);
+            WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+            TMATMUL(c_l0, a_l0, b_l0);
+            SetFlag<PIPE_M, PIPE_FIX>(0);
+            WaitFlag<PIPE_M, PIPE_FIX>(0);  // M→FIX: c_l0 ready for TSTORE
+
+            TALLOC<C2VPipe, SlotFull, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+            TSTORE(push_slot, c_l0);
+            //   AccTile<float> → GlobalTensor<half>: fp32→fp16 via hardware FIX
+            pipe_barrier(PIPE_ALL);  // FIX: wait for DMA to complete before TPUSH signals Vec
+            TPUSH<C2VPipe, SlotFull, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+        }
+    }
+
+    if constexpr (DAV_VEC) {
+        set_mask_norm();
+        set_vector_mask(-1, -1);
+
+        PopHalf pop_slot;
+        for (int32_t r = 0; r < num_rounds; ++r) {
+            const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+            TPOP<C2VPipe, PopHalf, TileSplitAxis::TILE_UP_DOWN>(pipe, pop_slot);
+            TLOAD(c_ub, pop_slot);
+
+            HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+            TLOAD(d_ub, d_global);
+
+            // TFREE fires from MTE2 after both TLOADs complete (same pipe, ordered).
+            TFREE<C2VPipe, PopHalf, TileSplitAxis::TILE_UP_DOWN>(pipe, pop_slot);
+
+            pipe_barrier(PIPE_ALL);  // MTE2→V: TLOADs done before TADD
+
+            TADD(c_ub, c_ub, d_ub);
+            pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TSTORE
+
+            HalfTileGlobal c_out(C + row_v * TILE_SIZE);
+            TSTORE(c_out, c_ub);
+            pipe_barrier(PIPE_ALL);  // MTE3: TSTORE complete before next TPOP overwrites c_ub
+        }
+    }
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void matmul_add_c2v_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B, __gm__ uint8_t *C,
+    __gm__ uint8_t *D, __gm__ uint8_t *fifo_mem,
+    int64_t batch, uint64_t ffts_addr)
+{
+    run_matmul_add_c2v(
+        reinterpret_cast<__gm__ half *>(A), reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C), reinterpret_cast<__gm__ half *>(D),
+        fifo_mem, batch, ffts_addr);
+}
+
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *fifo_mem, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    matmul_add_c2v_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, fifo_mem, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/run.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/run.py
new file mode 100644
index 00000000..22a73343
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/gm_pipe/run.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Correctness tests and bandwidth benchmark for matmul_add/gm_pipe kernels.
+
+matmul_add_c2v  (gm_pipe):  C = A @ B + D   — all float16, half FIFO slot
+add_matmul_v2c  (gm_pipe):  C = (A + B) @ D — all float16, half FIFO slot
+
+gm_pipe uses explicit TSTORE/TLOAD on GlobalTensor slot views, plus
+raw ffts_cross_core_sync/wait_flag_dev for signaling — same as raw_flag
+semantics but with FIFO_DEPTH=2 double-buffer slot cycling.
+
+NOTE on FFTS signal accumulation:
+  FIFO protocols leave FIFO_DEPTH "pipeline fill" signals in FFTS counters
+  after each run.  Use a fresh fifo buffer PER CALL to avoid accumulation.
+
+Usage:
+    python run.py            # correctness + bandwidth
+    NPU_DEVICE=npu:5 python run.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util import (  # noqa: E402
+    load_matmul_add_c2v, load_add_matmul_v2c,
+    BLOCK_DIM, TILE_SIZE, FIFO_ELEMS_PER_CORE,
+)
+
+RTOL = 1e-3
+ATOL = 1e-5
+
+
+def _test(kernel, name: str, ref_fn) -> None:
+    print("=" * 60)
+    print(f"{name}  gm_pipe")
+    print("=" * 60)
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    passed = failed = 0
+
+    for num_rounds in [1, 4, 8]:
+        batch = num_rounds * wave_rows
+        torch.manual_seed(0)
+        tensors = ref_fn(batch, TILE_SIZE, _DEVICE)
+        A, B, D = tensors['A'], tensors['B'], tensors['D']
+        C = torch.zeros(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+
+        # Fresh fifo per call: avoids FFTS counter accumulation across calls
+        fifo = torch.zeros(BLOCK_DIM * FIFO_ELEMS_PER_CORE,
+                           dtype=torch.float16, device=_DEVICE)
+        kernel(A, B, C, D, fifo)
+        torch.npu.synchronize()
+
+        ref = tensors['ref']
+        try:
+            torch.testing.assert_close(C, ref, rtol=RTOL, atol=ATOL)
+            passed += 1
+        except AssertionError as e:
+            failed += 1
+            if failed <= 3:
+                print(f"  FAIL rounds={num_rounds}: {e}")
+
+    status = "OK" if failed == 0 else f"FAILED ({failed}/{passed+failed})"
+    print(f"Correctness: {passed}/{passed+failed} passed — {status}\n")
+    if failed:
+        sys.exit(1)
+
+
+def _c2v_tensors(batch, tile, device):
+    kw = dict(dtype=torch.float16, device=device)
+    A = torch.randn(batch, tile, **kw)
+    B = torch.randn(tile, tile, **kw)
+    D = torch.randn(batch, tile, **kw)
+    return dict(A=A, B=B, D=D, ref=(A @ B + D).half())
+
+
+def _v2c_tensors(batch, tile, device):
+    kw = dict(dtype=torch.float16, device=device)
+    A = torch.randn(batch, tile, **kw)
+    B = torch.randn(batch, tile, **kw)
+    D = torch.randn(tile, tile, **kw)
+    return dict(A=A, B=B, D=D, ref=((A + B) @ D).half())
+
+
+def _benchmark(kernel, name: str, make_tensors, warmup: int = 10,
+               repeats: int = 30) -> None:
+    print("=" * 60)
+    print(f"BENCHMARK  {name}  gm_pipe")
+    print(f"  warmup={warmup}  repeats={repeats}")
+    print("=" * 60)
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    hdr = f"{'batch':>10}  {'rounds':>6}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(hdr)
+    print("-" * len(hdr))
+
+    records = []
+    for num_rounds in [1, 2, 4, 8, 16, 32, 64]:
+        batch = num_rounds * wave_rows
+        tensors = make_tensors(batch, TILE_SIZE, _DEVICE)
+        A, B, D = tensors['A'], tensors['B'], tensors['D']
+        C = torch.zeros(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+
+        # Pre-allocate a fresh fifo for every call so TPipe FIFO head/tail
+        # pointers stored inside fifo_mem never accumulate across calls.
+        # Allocation happens before the timing window — no overhead inside timer.
+        n_calls = warmup + repeats
+        fifos = [torch.zeros(BLOCK_DIM * FIFO_ELEMS_PER_CORE,
+                             dtype=torch.float16, device=_DEVICE)
+                 for _ in range(n_calls)]
+
+        for i in range(warmup):
+            kernel(A, B, C, D, fifos[i])
+        torch.npu.synchronize()
+
+        start = torch.npu.Event(enable_timing=True)
+        end   = torch.npu.Event(enable_timing=True)
+        start.record()
+        for i in range(repeats):
+            kernel(A, B, C, D, fifos[warmup + i])
+        end.record()
+        end.synchronize()
+
+        dur_us = start.elapsed_time(end) / repeats * 1e3
+        # Bytes: read A + read B/D + read D/B + write C  (all fp16 = 2 bytes)
+        bytes_total = (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2
+        bw_gbs = bytes_total / dur_us * 1e-3
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  {dur_us:>10.2f}  {bw_gbs:>10.2f}")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            dur_us=dur_us, bw_gbs=bw_gbs))
+
+    peak_bw = max(r["bw_gbs"] for r in records)
+    print(f"\nPeak bandwidth: {peak_bw:.1f} GB/s  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM={BLOCK_DIM}\n")
+
+    print("Compiling matmul_add_c2v ...")
+    c2v = load_matmul_add_c2v(verbose=True)
+    print()
+    print("Compiling add_matmul_v2c ...")
+    v2c = load_add_matmul_v2c(verbose=True)
+    print()
+
+    _test(c2v, "matmul_add_c2v  (C = A @ B + D)",    _c2v_tensors)
+    _test(v2c, "add_matmul_v2c  (C = (A + B) @ D)", _v2c_tensors)
+
+    _benchmark(c2v, "matmul_add_c2v  (C = A @ B + D)",    _c2v_tensors)
+    _benchmark(v2c, "add_matmul_v2c  (C = (A + B) @ D)", _v2c_tensors)
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/README.md b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/README.md
new file mode 100644
index 00000000..e62af6e7
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/README.md
@@ -0,0 +1,79 @@
+# naive_separate — Two-stage baseline (no Cube↔Vec pipeline)
+
+Computes `C = A @ B + D` (matmul_add_c2v) and `C = (A + B) @ D` (add_matmul_v2c)
+in two **sequential** stages within a **single kernel launch**.  No round-level
+overlap between the Cube (GEMM) stage and the Vec (element-wise add) stage.
+
+## Purpose
+
+Provides a slower baseline to demonstrate the benefit of fine-grained pipelining
+in the `raw_flag`, `pushpop`, and `gm_pipe` variants.
+
+## Algorithm
+
+**matmul_add_c2v** (`C = A @ B + D`):
+1. **Stage 1 — Cube (all rounds)**: compute `A @ B` for every round, write
+   results to `workspace[batch, TILE_SIZE]`.  After all rounds, signal Vec
+   with one `FLAG_C2V` broadcast.
+2. **Stage 2 — Vec (all rounds)**: wait for `FLAG_C2V`, then for each round
+   load the GEMM result from workspace, add `D`, write `C`.
+
+**add_matmul_v2c** (`C = (A + B) @ D`):
+1. **Stage 1 — Vec (all rounds)**: compute `A + B` for every round, write
+   results to `workspace`.  After all rounds each sub-block sends `FLAG_V2C`.
+2. **Stage 2 — Cube (all rounds)**: wait for both `FLAG_V2C` signals, then for
+   each round load workspace, compute GEMM, write `C`.
+
+## Workspace sizing
+
+`workspace[batch, TILE_SIZE]` fp16 — one full slot per `(core, round)` pair.
+This is much larger than the pipelined FIFO buffers (which hold only a handful
+of slots regardless of `num_rounds`).
+
+## Reproduce
+
+```bash
+BASE=/workdir/pto-kernels-fork/examples/jit_cpp/cross_core_sync_demo/matmul_add
+python $BASE/naive_separate/run.py
+```
+
+## Key sync difference vs pipelined variants
+
+| | pipelined (raw_flag / pushpop / gm_pipe) | naive_separate |
+|---|---|---|
+| Signal granularity | one signal **per round** | one signal **after all rounds** |
+| Cube↔Vec overlap | yes — GEMM round r overlaps with Vec round r-1 | no — GEMM finishes before Vec starts |
+| Workspace size | `num_cores × TILE_SIZE²` (small FIFO) | `batch × TILE_SIZE` (full array) |
+| Bandwidth | higher (pipeline hides latency) | lower (sequential stages) |
+
+## Benchmark results (910B2, TILE_SIZE=128, 24 cores)
+
+### matmul_add_c2v (`C = A @ B + D`)
+
+| batch | rounds | naive µs | naive GB/s | torch µs | torch GB/s |
+|-------|--------|----------|------------|----------|------------|
+| 3072 | 1 | 53.2 | 44.9 | 35.6 | 67.2 |
+| 6144 | 2 | 51.3 | 92.7 | 38.1 | 124.9 |
+| 12288 | 4 | 51.4 | 184.3 | 37.9 | 250.1 |
+| 24576 | 8 | 51.6 | 366.8 | 36.4 | 519.3 |
+| 49152 | 16 | 51.9 | 727.7 | 35.9 | 1052.9 |
+| 98304 | 32 | 64.4 | **1173.7** | 36.6 | 2061.9 |
+| 196608 | 64 | 141.5 | 1067.1 | 76.0 | 1986.2 |
+
+### add_matmul_v2c (`C = (A + B) @ D`)
+
+| batch | rounds | naive µs | naive GB/s | torch µs | torch GB/s |
+|-------|--------|----------|------------|----------|------------|
+| 3072 | 1 | 52.0 | 46.1 | 36.3 | 65.8 |
+| 6144 | 2 | 50.7 | 93.7 | 36.1 | 131.5 |
+| 12288 | 4 | 51.1 | 185.2 | 37.4 | 253.2 |
+| 24576 | 8 | 51.0 | 371.1 | 36.7 | 514.9 |
+| 49152 | 16 | 55.3 | 683.8 | 36.0 | 1051.0 |
+| 98304 | 32 | 64.3 | **1174.2** | 36.8 | 2054.4 |
+| 196608 | 64 | 124.7 | **1210.7** | 68.7 | 2197.4 |
+
+Peak naive bandwidth: ~**1174–1211 GB/s** vs pipelined variants: ~**1357–1543 GB/s**.
+
+The pipelined kernels are **15–30% faster** than naive_separate because they
+overlap Cube and Vec work round-by-round.  Both are constrained by HBM bandwidth
+for large batch sizes.
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/jit_util.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/jit_util.py
new file mode 100644
index 00000000..9d60bca9
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/jit_util.py
@@ -0,0 +1,153 @@
+"""JIT utilities for naive_separate kernels.
+
+Both kernels (matmul_add_c2v and add_matmul_v2c) live in a single
+naive_separate.cpp and expose two entry points:
+    call_matmul_add_c2v  and  call_add_matmul_v2c
+
+Workspace sizing:
+    Unlike the pipelined variants that use a small FIFO buffer, the naive
+    baseline needs one slot per round (no double-buffering).  Slot size is
+    TILE_SIZE × TILE_SIZE fp16, and there are num_rounds = batch/(BLOCK_DIM*T)
+    rounds per core.  So:
+        workspace[batch, TILE_SIZE]  fp16   (== same shape as A, C, D)
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_CPP  = os.path.join(_HERE, "naive_separate.cpp")
+_SO   = os.path.join(_HERE, "naive_separate.so")
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+_PTO_INC    = os.path.join(ASCEND_TOOLKIT_HOME, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+TILE_SIZE = 128
+
+
+# ── Compilation ───────────────────────────────────────────────────────────────
+
+def _compile(verbose: bool = True) -> str:
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+
+    cmd = ["bisheng", *flags, _CPP, "-o", _SO]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {_SO}")
+    return _SO
+
+
+def _bind(lib: ctypes.CDLL, fn_name: str) -> None:
+    """Bind the call signature for a kernel entry point."""
+    fn = getattr(lib, fn_name)
+    fn.argtypes = [
+        ctypes.c_uint32,   # block_dim
+        ctypes.c_void_p,   # stream
+        ctypes.c_void_p,   # A
+        ctypes.c_void_p,   # B
+        ctypes.c_void_p,   # C
+        ctypes.c_void_p,   # D
+        ctypes.c_void_p,   # workspace
+        ctypes.c_int64,    # batch
+    ]
+    fn.restype = None
+
+
+@lru_cache(maxsize=1)
+def _load_so(verbose: bool = True) -> ctypes.CDLL:
+    so = _compile(verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    _bind(lib, "call_matmul_add_c2v")
+    _bind(lib, "call_add_matmul_v2c")
+    return lib
+
+
+# ── Kernel wrapper ────────────────────────────────────────────────────────────
+
+class NaiveKernel:
+    """Callable wrapper for a naive_separate entry point.
+
+    Parameters
+    ----------
+    lib       : loaded shared library
+    fn_name   : symbol name ("call_matmul_add_c2v" or "call_add_matmul_v2c")
+    block_dim : number of Cube cores
+    """
+
+    def __init__(self, lib: ctypes.CDLL, fn_name: str, block_dim: int) -> None:
+        self._fn = getattr(lib, fn_name)
+        self._block_dim = block_dim
+
+    def __call__(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor,
+        workspace: torch.Tensor,
+        batch: int | None = None,
+    ) -> None:
+        """Launch the kernel in-place (result written to C).
+
+        All tensors must be on the same NPU device, contiguous, and fp16.
+        workspace must be at least [batch, TILE_SIZE] fp16.
+        """
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._fn(
+            self._block_dim,
+            stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(workspace.data_ptr()),
+            ctypes.c_int64(batch),
+        )
+
+
+def load_matmul_add_c2v(verbose: bool = True) -> NaiveKernel:
+    lib = _load_so(verbose=verbose)
+    return NaiveKernel(lib, "call_matmul_add_c2v", BLOCK_DIM)
+
+
+def load_add_matmul_v2c(verbose: bool = True) -> NaiveKernel:
+    lib = _load_so(verbose=verbose)
+    return NaiveKernel(lib, "call_add_matmul_v2c", BLOCK_DIM)
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/naive_separate.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/naive_separate.cpp
new file mode 100644
index 00000000..8ae5ecc3
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/naive_separate.cpp
@@ -0,0 +1,485 @@
+// =============================================================================
+// naive_separate.cpp — Naive two-stage baseline: no Cube↔Vec pipeline
+//
+// Computes the same operations as the pipelined matmul_add kernels, but the
+// two stages (GEMM and element-wise add) are executed sequentially within each
+// core pair.  There is no fine-grained round-by-round overlap between Cube and
+// Vec: the first stage completes ALL rounds before the second stage starts.
+//
+// Two kernels (sharing the four stage-helper functions below):
+//
+//   matmul_add_c2v  "C = A @ B + D"
+//     Stage 1 (Cube): GEMM A@B for every round → workspace
+//     Stage 2 (Vec):  C = workspace + D for every round
+//
+//   add_matmul_v2c  "C = (A + B) @ D"
+//     Stage 1 (Vec):  (A + B) for every round → workspace
+//     Stage 2 (Cube): GEMM workspace@D for every round → C
+//
+// Workspace layout — both kernels use the same shape:
+//   workspace[batch, TILE_SIZE]  fp16   (== same size as A, C, D)
+//   Slot for core cid at round r:
+//     workspace[r * num_cores * TILE_SIZE + cid * TILE_SIZE : ..., :]
+//   Each slot is TILE_SIZE × TILE_SIZE elements; Vec sub-block vid reads/writes
+//   the half-rows at vid * HALF_TILE inside that slot.
+//
+// Cross-core synchronization (one signal per direction, not per round):
+//   matmul_add_c2v:
+//     Cube signals FLAG_C2V *once* after all GEMM rounds are in workspace.
+//     Both Vec sub-blocks wait for that one signal then drain workspace.
+//   add_matmul_v2c:
+//     Each Vec sub-block signals FLAG_V2C *once* after writing all its rows.
+//     Cube waits for VEC_NUM=2 signals then drains workspace.
+//
+// Compared with pipelined versions: within each core pair the two stages are
+// serialized — no round-level overlap — so effective bandwidth is lower.
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+// ── Tile dimensions ────────────────────────────────────────────────────────────
+#define TILE_SIZE 128   // rows/cols per matrix tile
+#define HALF_TILE  64   // rows per Vec sub-block  (TILE_SIZE / VEC_NUM)
+#define VEC_NUM     2   // Vec sub-blocks per Cube core
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip buffer base addresses (bytes) ─────────────────────────────────────
+// L1: constant weight first, data tile second
+constexpr uint32_t L1_CONST_OFFSET = 0u;
+constexpr uint32_t L1_DATA_OFFSET  = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+
+// L0A / L0B / L0C are independent scratchpads; each starts at byte 0
+constexpr uint32_t L0_OFFSET = 0u;
+
+// UB: two HALF_TILE × TILE_SIZE half tiles
+constexpr uint32_t UB_SLOT0_OFFSET = 0u;
+constexpr uint32_t UB_SLOT1_OFFSET = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+// ── Cross-core FFTS flag IDs ───────────────────────────────────────────────────
+// For matmul_add_c2v: Cube → Vec  (workspace filled with GEMM results)
+// For add_matmul_v2c: Vec → Cube  (workspace filled with A+B results)
+constexpr int32_t FLAG_C2V = 0;
+constexpr int32_t FLAG_V2C = 1;
+
+// ── Tile type aliases ──────────────────────────────────────────────────────────
+// L1 tile — NZ (ColMajor/RowMajor) layout required by the Cube engine
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+// L0 tiles — one per independent Cube scratchpad
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;  // fp32 accumulator
+
+// UB Vec tile — row-major, HALF_TILE rows × TILE_SIZE cols, fp16
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+// GlobalTensor aliases — contiguous 2D row-major in GM
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// ── Cross-core sync helpers ────────────────────────────────────────────────────
+// One call from Cube signals FLAG and unblocks all VEC_NUM Vec sub-blocks.
+// One call from each Vec sub-block signals FLAG; Cube unblocks after VEC_NUM.
+template <pipe_t Pipe>
+AICORE inline void SetCrossFlag(int32_t flag) {
+    ffts_cross_core_sync(Pipe, 1 | (VEC_NUM << 4) | (flag << 8));
+}
+
+AICORE inline void WaitCrossFlag(int32_t flag) {
+    wait_flag_dev(flag);
+}
+
+// ── Intra-pipe sync helpers ────────────────────────────────────────────────────
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) {
+    set_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) {
+    wait_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+// =============================================================================
+// Stage helpers — shared by both kernels
+//
+// Each helper is guarded by the compilation pass it belongs to
+// (__DAV_C220_CUBE__ or __DAV_C220_VEC__).  bisheng compiles the source twice
+// (once per pass) so we must keep Cube-only and Vec-only instructions in their
+// respective guards.  Wrapping helpers this way lets run_* call them within the
+// matching #if blocks, providing code reuse across both kernels.
+// =============================================================================
+
+// ── Cube-side helpers ──────────────────────────────────────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+// Stage 1 of matmul_add_c2v (Cube side):
+//   Compute A @ B for every round and write results to workspace.
+//   After the final TSTORE, send FLAG_C2V to signal both Vec sub-blocks.
+//
+// Workspace offset for round r, core cid:
+//   workspace + (r * num_cores * TILE_SIZE + cid * TILE_SIZE) * TILE_SIZE
+AICORE void cube_gemm_all_rounds_to_ws(
+    __gm__ half *A,         // [batch, TILE_SIZE]              input
+    __gm__ half *B,         // [TILE_SIZE, TILE_SIZE]          constant weight
+    __gm__ half *workspace, // [batch, TILE_SIZE]              output of this stage
+    int32_t cid, int32_t num_cores, int32_t num_rounds)
+{
+    TileL1  b_l1, a_l1;
+    TileL0A a_l0;
+    TileL0B b_l0;
+    TileL0C c_l0;
+    TASSIGN(b_l1, L1_CONST_OFFSET);
+    TASSIGN(a_l1, L1_DATA_OFFSET);
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    // Load constant weight B once — reused for all rounds
+    TileGlobal b_global(B);
+    TLOAD(b_l1, b_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(b_l0, b_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_a  = r * wave_rows + cid * TILE_SIZE;
+        const int32_t ws_row = r * wave_rows + cid * TILE_SIZE;
+
+        // Load A tile: GM → L1 → L0A
+        TileGlobal a_global(A + row_a * TILE_SIZE);
+        TLOAD(a_l1, a_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(a_l0, a_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        // GEMM: c_l0 = A @ B  (fp32 accumulator)
+        TMATMUL(c_l0, a_l0, b_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);  // M → FIX: c_l0 ready for TSTORE
+
+        // Write result to workspace (fp32 → fp16 via FIX pipe)
+        TileGlobal ws_out(workspace + ws_row * TILE_SIZE);
+        TSTORE(ws_out, c_l0);
+
+        // Drain all pipes before the next round.  Without this barrier the
+        // next round's TMOV(a_l0) on MTE1 would race with the current round's
+        // TMATMUL still reading a_l0 on M (no cross-core wait provides the
+        // implicit ordering that the pipelined version relies on).
+        pipe_barrier(PIPE_ALL);
+    }
+
+    // All GEMM rounds written; pipe_barrier at end of last iteration already
+    // flushed DMAs — signal both Vec sub-blocks.
+    SetCrossFlag<PIPE_FIX>(FLAG_C2V);
+}
+
+// Stage 2 of add_matmul_v2c (Cube side):
+//   Wait for both Vec sub-blocks to finish stage 1 (FLAG_V2C × VEC_NUM),
+//   then for each round: load workspace, GEMM, store C.
+AICORE void cube_gemm_from_ws_all_rounds(
+    __gm__ half *workspace, // [batch, TILE_SIZE]  A+B results from stage 1
+    __gm__ half *D,         // [TILE_SIZE, TILE_SIZE]  constant weight
+    __gm__ half *C,         // [batch, TILE_SIZE]  output
+    int32_t cid, int32_t num_cores, int32_t num_rounds)
+{
+    TileL1  d_l1, ab_l1;
+    TileL0A ab_l0;
+    TileL0B d_l0;
+    TileL0C c_l0;
+    TASSIGN(d_l1,  L1_CONST_OFFSET);
+    TASSIGN(ab_l1, L1_DATA_OFFSET);
+    TASSIGN(ab_l0, L0_OFFSET);
+    TASSIGN(d_l0,  L0_OFFSET);
+    TASSIGN(c_l0,  L0_OFFSET);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    // Load constant weight D once — reused for all rounds
+    TileGlobal d_global(D);
+    TLOAD(d_l1, d_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(d_l0, d_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    // Wait for BOTH Vec sub-blocks to finish all A+B rounds (VEC_NUM signals)
+    WaitCrossFlag(FLAG_V2C);
+    pipe_barrier(PIPE_ALL);  // ensure workspace writes are visible to this Cube core
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_c  = r * wave_rows + cid * TILE_SIZE;
+        const int32_t ws_row = r * wave_rows + cid * TILE_SIZE;
+
+        // Load workspace (A+B): GM → L1 → L0A
+        TileGlobal ws_in(workspace + ws_row * TILE_SIZE);
+        TLOAD(ab_l1, ws_in);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(ab_l0, ab_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        // GEMM: c_l0 = (A+B) @ D
+        TMATMUL(c_l0, ab_l0, d_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);
+
+        // Store result (fp32 → fp16) to global memory C
+        TileGlobal c_global(C + row_c * TILE_SIZE);
+        TSTORE(c_global, c_l0);
+        // Drain all pipes before the next round (same reasoning as
+        // cube_gemm_all_rounds_to_ws: prevent L0A/L0C conflicts on loop back-edge).
+        pipe_barrier(PIPE_ALL);
+    }
+}
+
+#endif  // __DAV_C220_CUBE__
+
+// ── Vec-side helpers ───────────────────────────────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+// Stage 2 of matmul_add_c2v (Vec side):
+//   Wait for FLAG_C2V (Cube has filled workspace), then for each round:
+//   load the GEMM result and D, compute c = gemm_result + D, store C.
+AICORE void vec_add_from_ws_all_rounds(
+    __gm__ half *workspace, // [batch, TILE_SIZE]  GEMM results from stage 1
+    __gm__ half *D,         // [batch, TILE_SIZE]  bias
+    __gm__ half *C,         // [batch, TILE_SIZE]  output
+    int32_t cid, int32_t vid, int32_t num_cores, int32_t num_rounds)
+{
+    TileVecUB c_ub, d_ub;
+    TASSIGN(c_ub, UB_SLOT0_OFFSET);
+    TASSIGN(d_ub, UB_SLOT1_OFFSET);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    // Wait until all GEMM rounds are in workspace (one-shot signal from Cube)
+    WaitCrossFlag(FLAG_C2V);
+    pipe_barrier(PIPE_ALL);  // ensure workspace writes are visible to this Vec sub-block
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_v  = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+        const int32_t ws_row = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        // Load GEMM result slice and D slice from GM → UB (both issue to MTE2)
+        HalfTileGlobal ws_in(workspace + ws_row * TILE_SIZE);
+        TLOAD(c_ub, ws_in);
+
+        HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+        TLOAD(d_ub, d_global);
+
+        SetFlag<PIPE_MTE2, PIPE_V>(0);
+        WaitFlag<PIPE_MTE2, PIPE_V>(0);  // both TLOADs done before TADD
+
+        // C = gemm_result + D  (element-wise, Vec engine)
+        TADD(c_ub, c_ub, d_ub);
+        SetFlag<PIPE_V, PIPE_MTE3>(0);
+        WaitFlag<PIPE_V, PIPE_MTE3>(0);  // TADD done before TSTORE
+
+        HalfTileGlobal c_out(C + row_v * TILE_SIZE);
+        TSTORE(c_out, c_ub);
+
+        // Wait for TSTORE to complete before the next iteration reuses c_ub
+        SetFlag<PIPE_MTE3, PIPE_MTE2>(0);
+        WaitFlag<PIPE_MTE3, PIPE_MTE2>(0);
+    }
+}
+
+// Stage 1 of add_matmul_v2c (Vec side):
+//   For each round: load A and B slices, compute a_ub = A + B, store to workspace.
+//   After the final TSTORE, send FLAG_V2C to signal Cube.
+AICORE void vec_add_all_rounds_to_ws(
+    __gm__ half *A,         // [batch, TILE_SIZE]  input
+    __gm__ half *B,         // [batch, TILE_SIZE]  input
+    __gm__ half *workspace, // [batch, TILE_SIZE]  output of this stage
+    int32_t cid, int32_t vid, int32_t num_cores, int32_t num_rounds)
+{
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_SLOT0_OFFSET);
+    TASSIGN(b_ub, UB_SLOT1_OFFSET);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_v  = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+        const int32_t ws_row = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        // Load A and B slices from GM → UB
+        HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+        TLOAD(a_ub, a_global);
+
+        HalfTileGlobal b_global(B + row_v * TILE_SIZE);
+        TLOAD(b_ub, b_global);
+
+        SetFlag<PIPE_MTE2, PIPE_V>(0);
+        WaitFlag<PIPE_MTE2, PIPE_V>(0);  // both TLOADs done before TADD
+
+        // a_ub = A + B  (element-wise, Vec engine)
+        TADD(a_ub, a_ub, b_ub);
+        SetFlag<PIPE_V, PIPE_MTE3>(0);
+        WaitFlag<PIPE_V, PIPE_MTE3>(0);  // TADD done before TSTORE
+
+        HalfTileGlobal ws_out(workspace + ws_row * TILE_SIZE);
+        TSTORE(ws_out, a_ub);
+
+        // Wait for TSTORE before next iteration reuses a_ub
+        SetFlag<PIPE_MTE3, PIPE_MTE2>(0);
+        WaitFlag<PIPE_MTE3, PIPE_MTE2>(0);
+    }
+
+    // All A+B rounds written — flush DMAs then signal Cube
+    pipe_barrier(PIPE_ALL);
+    SetCrossFlag<PIPE_MTE3>(FLAG_V2C);
+}
+
+#endif  // __DAV_C220_VEC__
+
+// =============================================================================
+// Kernel entry points
+// =============================================================================
+
+// matmul_add_c2v: C = A @ B + D
+//   Stage 1 (Cube):  A @ B → workspace   (all rounds)
+//   Stage 2 (Vec):   workspace + D → C   (all rounds, after stage 1)
+AICORE void run_matmul_add_c2v(
+    __gm__ half *A,         // [batch, TILE_SIZE]
+    __gm__ half *B,         // [TILE_SIZE, TILE_SIZE]
+    __gm__ half *C,         // [batch, TILE_SIZE]
+    __gm__ half *D,         // [batch, TILE_SIZE]
+    __gm__ half *workspace, // [batch, TILE_SIZE]
+    int64_t batch, uint64_t ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+#if defined(__DAV_C220_CUBE__)
+    cube_gemm_all_rounds_to_ws(A, B, workspace, cid, num_cores, num_rounds);
+#endif
+
+#if defined(__DAV_C220_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    vec_add_from_ws_all_rounds(workspace, D, C, cid, vid, num_cores, num_rounds);
+#endif
+}
+
+// add_matmul_v2c: C = (A + B) @ D
+//   Stage 1 (Vec):  A + B → workspace   (all rounds)
+//   Stage 2 (Cube): workspace @ D → C   (all rounds, after stage 1)
+AICORE void run_add_matmul_v2c(
+    __gm__ half *A,         // [batch, TILE_SIZE]
+    __gm__ half *B,         // [batch, TILE_SIZE]
+    __gm__ half *C,         // [batch, TILE_SIZE]
+    __gm__ half *D,         // [TILE_SIZE, TILE_SIZE]
+    __gm__ half *workspace, // [batch, TILE_SIZE]
+    int64_t batch, uint64_t ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+#if defined(__DAV_C220_VEC__)
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+    vec_add_all_rounds_to_ws(A, B, workspace, cid, vid, num_cores, num_rounds);
+#endif
+
+#if defined(__DAV_C220_CUBE__)
+    cube_gemm_from_ws_all_rounds(workspace, D, C, cid, num_cores, num_rounds);
+#endif
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry points (extern "C" __global__) ───────────────────────────────
+extern "C" __global__ AICORE void matmul_add_c2v_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B,
+    __gm__ uint8_t *C, __gm__ uint8_t *D,
+    __gm__ uint8_t *workspace, int64_t batch, uint64_t ffts_addr)
+{
+    run_matmul_add_c2v(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C),
+        reinterpret_cast<__gm__ half *>(D),
+        reinterpret_cast<__gm__ half *>(workspace),
+        batch, ffts_addr);
+}
+
+extern "C" __global__ AICORE void add_matmul_v2c_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B,
+    __gm__ uint8_t *C, __gm__ uint8_t *D,
+    __gm__ uint8_t *workspace, int64_t batch, uint64_t ffts_addr)
+{
+    run_add_matmul_v2c(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C),
+        reinterpret_cast<__gm__ half *>(D),
+        reinterpret_cast<__gm__ half *>(workspace),
+        batch, ffts_addr);
+}
+
+// ── Host-side launchers (called from Python via ctypes) ───────────────────────
+static inline uint64_t _get_ffts_addr() {
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    return ffts_addr;
+}
+
+extern "C" void call_matmul_add_c2v(
+    uint32_t block_dim, void *stream,
+    uint8_t *A, uint8_t *B, uint8_t *C,
+    uint8_t *D, uint8_t *workspace, int64_t batch)
+{
+    matmul_add_c2v_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, workspace, batch, _get_ffts_addr());
+}
+
+extern "C" void call_add_matmul_v2c(
+    uint32_t block_dim, void *stream,
+    uint8_t *A, uint8_t *B, uint8_t *C,
+    uint8_t *D, uint8_t *workspace, int64_t batch)
+{
+    add_matmul_v2c_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, workspace, batch, _get_ffts_addr());
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/run.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/run.py
new file mode 100644
index 00000000..69e38d04
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/naive_separate/run.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Correctness tests and bandwidth benchmark for naive_separate kernels.
+
+Two kernels:
+  matmul_add_c2v : C = A @ B + D
+  add_matmul_v2c : C = (A + B) @ D
+
+Both are "two-stage, no pipeline" baselines:
+  • Stage 1 (GEMM or Vec-add) completes ALL rounds before stage 2 starts.
+  • One kernel launch covers both stages — faster than two separate launches
+    (no second kernel launch overhead), but slower than pipelined variants
+    (no round-level Cube↔Vec overlap).
+
+Workspace sizing:
+  workspace[batch, TILE_SIZE] fp16  — one slot per (core, round) pair.
+  This is larger than the pipelined FIFO buffers (which use ≤ FIFO_DEPTH slots
+  per core regardless of num_rounds).
+
+Benchmark sections:
+  1. Correctness test: compare against torch.matmul / torch.add reference.
+  2. Torch baseline: measure time for torch.matmul then torch.add (two launches).
+  3. Naive kernel benchmark: measure the naive single-launch kernel.
+  4. Comparison table with pipelined variants (from prior benchmarks, for context).
+
+Usage:
+    python run.py
+    NPU_DEVICE=npu:5 python run.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+import time
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util import (  # noqa: E402
+    load_matmul_add_c2v, load_add_matmul_v2c,
+    BLOCK_DIM, TILE_SIZE,
+)
+
+DTYPE    = torch.float16
+RTOL     = 1e-3
+ATOL     = 1e-5
+_KW      = dict(dtype=DTYPE, device=_DEVICE)
+_WARMUP  = 10
+_REPEATS = 30
+
+
+# ── Workspace allocation ───────────────────────────────────────────────────────
+def make_workspace(batch: int) -> torch.Tensor:
+    """workspace[batch, TILE_SIZE] fp16 — one slot per (core, round)."""
+    return torch.zeros(batch, TILE_SIZE, **_KW)
+
+
+# ── Correctness ────────────────────────────────────────────────────────────────
+
+def _run(kernel, A, B, D) -> torch.Tensor:
+    C  = torch.zeros_like(A)
+    ws = make_workspace(A.shape[0])
+    kernel(A, B, C, D, ws)
+    torch.npu.synchronize()
+    return C
+
+
+def test_correctness(c2v_kernel, v2c_kernel) -> None:
+    print("=" * 62)
+    print("CORRECTNESS TESTS")
+    print("=" * 62)
+    wave_rows = BLOCK_DIM * TILE_SIZE
+
+    for name, kernel, make_tensors, ref_fn in [
+        (
+            "matmul_add_c2v  (C = A @ B + D)",
+            c2v_kernel,
+            lambda batch: (
+                torch.randn(batch, TILE_SIZE, **_KW),
+                torch.randn(TILE_SIZE, TILE_SIZE, **_KW),
+                torch.randn(batch, TILE_SIZE, **_KW),
+            ),
+            lambda A, B, D: (A @ B + D).to(DTYPE),
+        ),
+        (
+            "add_matmul_v2c  (C = (A + B) @ D)",
+            v2c_kernel,
+            lambda batch: (
+                torch.randn(batch, TILE_SIZE, **_KW),
+                torch.randn(batch, TILE_SIZE, **_KW),
+                torch.randn(TILE_SIZE, TILE_SIZE, **_KW),
+            ),
+            lambda A, B, D: ((A + B) @ D).to(DTYPE),
+        ),
+    ]:
+        passed = failed = 0
+        for seed in range(3):
+            for num_rounds in range(1, 11):
+                batch = num_rounds * wave_rows
+                torch.manual_seed(seed)
+                A, B, D = make_tensors(batch)
+                C_kernel = _run(kernel, A, B, D)
+                C_ref    = ref_fn(A, B, D)
+                try:
+                    torch.testing.assert_close(C_kernel, C_ref, rtol=RTOL, atol=ATOL)
+                    passed += 1
+                except AssertionError as e:
+                    failed += 1
+                    if failed <= 5:
+                        print(f"  FAIL  seed={seed} rounds={num_rounds}: {e}")
+        total  = passed + failed
+        status = "OK" if failed == 0 else f"FAILED ({failed}/{total})"
+        print(f"  {name}: {passed}/{total} passed — {status}")
+
+    print()
+    if failed:
+        sys.exit(1)
+
+
+# ── Benchmark helpers ──────────────────────────────────────────────────────────
+
+def _time_kernel(fn, warmup: int = _WARMUP, repeats: int = _REPEATS) -> float:
+    """Return average kernel duration in microseconds."""
+    for _ in range(warmup):
+        fn()
+    torch.npu.synchronize()
+    start = torch.npu.Event(enable_timing=True)
+    end   = torch.npu.Event(enable_timing=True)
+    start.record()
+    for _ in range(repeats):
+        fn()
+    end.record()
+    end.synchronize()
+    return start.elapsed_time(end) / repeats * 1e3  # ms → µs
+
+
+def _bytes_c2v(batch: int) -> int:
+    """GM bytes for matmul_add_c2v: read A + read B + read D + write C  (fp16)."""
+    return (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2
+
+
+def _bytes_v2c(batch: int) -> int:
+    """GM bytes for add_matmul_v2c: read A + read B + read D + write C  (fp16)."""
+    return (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2
+
+
+# ── Benchmark: matmul_add_c2v ──────────────────────────────────────────────────
+
+def benchmark_c2v(c2v_kernel) -> list[dict]:
+    print("=" * 62)
+    print("BENCHMARK  matmul_add_c2v  (C = A @ B + D)")
+    print(f"  warmup={_WARMUP}  repeats={_REPEATS}")
+    print("=" * 62)
+    wave_rows = BLOCK_DIM * TILE_SIZE
+
+    hdr = (f"{'batch':>10}  {'rounds':>6}  "
+           f"{'naive_us':>10}  {'naive_GB/s':>12}  "
+           f"{'torch_us':>10}  {'torch_GB/s':>12}  "
+           f"{'speedup':>8}")
+    print(hdr)
+    print("-" * len(hdr))
+
+    records = []
+    for num_rounds in [1, 2, 4, 8, 16, 32, 64]:
+        batch = num_rounds * wave_rows
+        torch.manual_seed(0)
+        A  = torch.randn(batch, TILE_SIZE, **_KW)
+        B  = torch.randn(TILE_SIZE, TILE_SIZE, **_KW)
+        D  = torch.randn(batch, TILE_SIZE, **_KW)
+        C  = torch.zeros_like(A)
+        ws = make_workspace(batch)
+
+        naive_us = _time_kernel(lambda: c2v_kernel(A, B, C, D, ws))
+        torch_us = _time_kernel(lambda: (torch.mm(A, B) + D))  # two-launch baseline
+
+        nbytes   = _bytes_c2v(batch)
+        naive_bw = nbytes / naive_us * 1e-3
+        torch_bw = nbytes / torch_us * 1e-3
+        speedup  = torch_us / naive_us
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  "
+              f"{naive_us:>10.2f}  {naive_bw:>12.2f}  "
+              f"{torch_us:>10.2f}  {torch_bw:>12.2f}  "
+              f"{speedup:>8.2f}x")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            naive_us=naive_us, naive_bw=naive_bw,
+                            torch_us=torch_us, torch_bw=torch_bw))
+
+    peak = max(r["naive_bw"] for r in records)
+    print(f"\nPeak naive bandwidth: {peak:.1f} GB/s  (910B2 HBM roofline ≈ 1500 GB/s)\n")
+    return records
+
+
+# ── Benchmark: add_matmul_v2c ──────────────────────────────────────────────────
+
+def benchmark_v2c(v2c_kernel) -> list[dict]:
+    print("=" * 62)
+    print("BENCHMARK  add_matmul_v2c  (C = (A + B) @ D)")
+    print(f"  warmup={_WARMUP}  repeats={_REPEATS}")
+    print("=" * 62)
+    wave_rows = BLOCK_DIM * TILE_SIZE
+
+    hdr = (f"{'batch':>10}  {'rounds':>6}  "
+           f"{'naive_us':>10}  {'naive_GB/s':>12}  "
+           f"{'torch_us':>10}  {'torch_GB/s':>12}  "
+           f"{'speedup':>8}")
+    print(hdr)
+    print("-" * len(hdr))
+
+    records = []
+    for num_rounds in [1, 2, 4, 8, 16, 32, 64]:
+        batch = num_rounds * wave_rows
+        torch.manual_seed(0)
+        A  = torch.randn(batch, TILE_SIZE, **_KW)
+        B  = torch.randn(batch, TILE_SIZE, **_KW)
+        D  = torch.randn(TILE_SIZE, TILE_SIZE, **_KW)
+        C  = torch.zeros_like(A)
+        ws = make_workspace(batch)
+
+        naive_us = _time_kernel(lambda: v2c_kernel(A, B, C, D, ws))
+        torch_us = _time_kernel(lambda: ((A + B) @ D))  # two-op baseline
+
+        nbytes   = _bytes_v2c(batch)
+        naive_bw = nbytes / naive_us * 1e-3
+        torch_bw = nbytes / torch_us * 1e-3
+        speedup  = torch_us / naive_us
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  "
+              f"{naive_us:>10.2f}  {naive_bw:>12.2f}  "
+              f"{torch_us:>10.2f}  {torch_bw:>12.2f}  "
+              f"{speedup:>8.2f}x")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            naive_us=naive_us, naive_bw=naive_bw,
+                            torch_us=torch_us, torch_bw=torch_bw))
+
+    peak = max(r["naive_bw"] for r in records)
+    print(f"\nPeak naive bandwidth: {peak:.1f} GB/s  (910B2 HBM roofline ≈ 1500 GB/s)\n")
+    return records
+
+
+# ── Entry point ────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM (num Cube cores): {BLOCK_DIM}\n")
+
+    print("Compiling naive_separate kernels ...")
+    c2v = load_matmul_add_c2v(verbose=True)
+    v2c = load_add_matmul_v2c(verbose=True)
+    print()
+
+    test_correctness(c2v, v2c)
+    benchmark_c2v(c2v)
+    benchmark_v2c(v2c)
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/add_matmul_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/add_matmul_v2c.cpp
new file mode 100644
index 00000000..dad931cf
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/add_matmul_v2c.cpp
@@ -0,0 +1,201 @@
+// =============================================================================
+// add_matmul_v2c.cpp  —  C = (A + B) @ D   [pushpop variant]
+//
+// ── API variant: TileData TPUSH / TPOP ──────────────────────────────────────
+//
+//  raw_flag equivalent               │  pushpop (this file)
+//  ──────────────────────────────────┼─────────────────────────────────────────
+//  TSTORE(ws_half, a_ub)             │  TPUSH<V2CPipe, VecTile, UP_DOWN>(pipe, a_ub)
+//  pipe_barrier(PIPE_ALL)            │    └─ TSTORE(GlobalTensor<half>, a_ub) internally
+//  SetCrossFlag<MTE3>(FLAG_V2C)      │       then data-ready signal
+//  ──────────────────────────────────┼─────────────────────────────────────────
+//  WaitCrossFlag(FLAG_V2C)           │  TPOP<V2CPipe, TileL1, UP_DOWN>(pipe, ab_l1)
+//  TLOAD(ab_l1, ws_half)             │    └─ waits, TLOAD(GlobalTensor<half>, ab_l1)
+//  SetCrossFlag<MTE2>(FLAG_C2V)      │       free-space notify
+//
+// VecTile::DType = half → same slot size as raw_flag (T²×sizeof(half) = 32 KB).
+// All data types identical to raw_flag.
+//
+// NOTE: TileData TPUSH/TPOP with TILE_UP_DOWN and 2 Vec sub-blocks shares
+//       pipe.prod.tileIndex between sub-blocks, causing tileIndex to advance
+//       by 2 per logical round (not 1).  This de-syncs producer/consumer slot
+//       indices for num_rounds > 1.  The test is scoped to num_rounds=1 where
+//       the API behaves correctly (matching the pto-isa-master unit tests).
+//       For multi-round workloads, use the gm_pipe variant.
+//
+// Python: all float16.  Reference: (A + B) @ D
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_D_OFFSET  = 0u;
+constexpr uint32_t L1_AB_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+constexpr uint32_t L0_OFFSET    = 0u;
+constexpr uint32_t UB_A_OFFSET  = 0u;
+constexpr uint32_t UB_B_OFFSET  = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+// FIFO_DEPTH=2 for V2C: both Vec sub-blocks write without blocking at allocate().
+// With FIFO_DEPTH=1 only 1 free signal is seeded, sub-block 1 deadlocks.
+// V2CPipe uses FlagID=2 (flags 2 and 3) to avoid FFTS collision with
+// C2VPipe = TPipe<0, DIR_C2V> which occupies flags 0 and 1.
+constexpr uint32_t FIFO_DEPTH     = 2u;
+constexpr uint32_t V2C_SLOT_SIZE  = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB
+constexpr uint32_t V2C_FIFO_BYTES = FIFO_DEPTH * V2C_SLOT_SIZE;           // 64 KB/core
+constexpr uint32_t V2C_L1_BASE    = L1_AB_OFFSET;
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+using V2CPipe = TPipe<2, Direction::DIR_V2C, V2C_SLOT_SIZE, FIFO_DEPTH>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_add_matmul_v2c(
+    __gm__ half    *A, __gm__ half *B, __gm__ half *C, __gm__ half *D,
+    __gm__ uint8_t *fifo_mem, int64_t batch, uint64_t ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    __gm__ void *core_fifo = fifo_mem + cid * V2C_FIFO_BYTES;
+    V2CPipe pipe(core_fifo, /*c2v_ub_base=*/0x0, /*v2c_l1_base=*/V2C_L1_BASE);
+
+    TileL1  d_l1, ab_l1;
+    TASSIGN(d_l1, L1_D_OFFSET);
+    TileL0A ab_l0;  TileL0B d_l0;  TileL0C c_l0;
+    TASSIGN(ab_l0, L0_OFFSET);
+    TASSIGN(d_l0,  L0_OFFSET);
+    TASSIGN(c_l0,  L0_OFFSET);
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    // ── Cube: load constant D, TPOP A+B from FIFO, GEMM, store C ──────────────
+#if defined(__DAV_C220_CUBE__)
+
+    TileGlobal d_global(D);
+    TLOAD(d_l1, d_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(d_l0, d_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+        TPOP<V2CPipe, TileL1, TileSplitAxis::TILE_UP_DOWN>(pipe, ab_l1);
+        // └─ internally: wait for both Vec sub-blocks' data-ready signals,
+        //    TLOAD(GlobalTensor<half>, ab_l1) to V2C_L1_BASE rotation,
+        //    then sends free-space notification back to Vec.
+
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);  // MTE2→MTE1: TPOP TLOAD done before TMOV
+
+        TMOV(ab_l0, ab_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        TMATMUL(c_l0, ab_l0, d_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);  // M→FIX: c_l0 ready for TSTORE
+
+        TileGlobal c_global(C + row_c * TILE_SIZE);
+        TSTORE(c_global, c_l0);
+        // Drain FIX pipe: the in-flight TSTORE DMA (reading c_l0) must complete
+        // before the next round's (or next call's) TMATMUL writes c_l0.
+        pipe_barrier(PIPE_ALL);
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: load A+B, TPUSH to FIFO ─────────────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+            TLOAD(a_ub, a_global);
+            HalfTileGlobal b_global(B + row_v * TILE_SIZE);
+            TLOAD(b_ub, b_global);
+            pipe_barrier(PIPE_ALL);  // MTE2→V: both TLOADs done before TADD
+
+            TADD(a_ub, a_ub, b_ub);
+            pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TPUSH writes to GM
+
+        TPUSH<V2CPipe, TileVecUB, TileSplitAxis::TILE_UP_DOWN>(pipe, a_ub);
+        // └─ waits for free space (pipe.prod.allocate = wait_flag_dev),
+        //    TSTORE(GlobalTensor<half>, a_ub) to current FIFO slot,
+        //    data-ready signal (both sub-blocks together unblock Cube).
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void add_matmul_v2c_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B, __gm__ uint8_t *C,
+    __gm__ uint8_t *D, __gm__ uint8_t *fifo_mem,
+    int64_t batch, uint64_t ffts_addr)
+{
+    run_add_matmul_v2c(
+        reinterpret_cast<__gm__ half *>(A), reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C), reinterpret_cast<__gm__ half *>(D),
+        fifo_mem, batch, ffts_addr);
+}
+
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *fifo_mem, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    add_matmul_v2c_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, fifo_mem, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/jit_util.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/jit_util.py
new file mode 100644
index 00000000..9641803f
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/jit_util.py
@@ -0,0 +1,163 @@
+"""JIT utilities for matmul_add/pushpop kernels.
+
+matmul_add_c2v:  C = A @ B + D   (pushpop, float slot)
+  - D input: float32  (must match VecTileFloat dtype from TPOP)
+  - C output: float16
+  - fifo_mem: float32
+
+add_matmul_v2c:  C = (A + B) @ D   (pushpop, half slot)
+  - all float16; identical types to raw_flag
+  - fifo_mem: float16
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+_PTO_INC  = os.path.join(PTO_LIB_PATH, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+TILE_SIZE  = 128
+# C2V: FIFO_DEPTH=1 workaround — see PTO_API_BUGS.md Bug 1
+# With FIFO_DEPTH=2 and TILE_UP_DOWN, the tileIndex desync breaks multi-round.
+# FIFO_DEPTH=1 forces SyncPeriod=1 (strict alternation), fixing multi-round for C2V.
+C2V_FIFO_DEPTH = 1
+C2V_FIFO_ELEMS_PER_CORE = C2V_FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float32 elements
+
+# V2C: FIFO_DEPTH=2 needed — with FIFO_DEPTH=1, only 1 free signal is seeded.
+# Both Vec sub-blocks call allocate() independently; sub-block 1 deadlocks if
+# the single free signal is already consumed by sub-block 0.
+# V2C is therefore scoped to num_rounds=1 (single-round correctness).
+V2C_FIFO_DEPTH = 2
+V2C_FIFO_ELEMS_PER_CORE = V2C_FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float16 elements
+
+
+def _compile(cpp_basename: str, so_basename: str, verbose: bool = True) -> str:
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+    cpp = os.path.join(_HERE, cpp_basename)
+    so  = os.path.join(_HERE, so_basename)
+    cmd = ["bisheng", *flags, cpp, "-o", so]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {so}")
+    return so
+
+
+@lru_cache(maxsize=1)
+def load_matmul_add_c2v(verbose: bool = True) -> "MatmulAddC2VKernel":
+    so  = _compile("matmul_add_c2v.cpp", "matmul_add_c2v.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A  (fp16)
+        ctypes.c_void_p,  # B  (fp16)
+        ctypes.c_void_p,  # C  (fp32 — float FIFO slot, no implicit fp32→fp16)
+        ctypes.c_void_p,  # D  (fp32)
+        ctypes.c_void_p,  # fifo_mem (fp32)
+        ctypes.c_int64,   # batch
+    ]
+    lib.call.restype = None
+    return MatmulAddC2VKernel(lib, BLOCK_DIM)
+
+
+@lru_cache(maxsize=1)
+def load_add_matmul_v2c(verbose: bool = True) -> "AddMatmulV2CKernel":
+    so  = _compile("add_matmul_v2c.cpp", "add_matmul_v2c.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A  (fp16)
+        ctypes.c_void_p,  # B  (fp16)
+        ctypes.c_void_p,  # C  (fp16 output)
+        ctypes.c_void_p,  # D  (fp16)
+        ctypes.c_void_p,  # fifo_mem (fp16)
+        ctypes.c_int64,   # batch
+    ]
+    lib.call.restype = None
+    return AddMatmulV2CKernel(lib, BLOCK_DIM)
+
+
+class MatmulAddC2VKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
+                 D: torch.Tensor, fifo_mem: torch.Tensor,
+                 batch: int | None = None) -> None:
+        """D must be float32; C is float32; A,B fp16; fifo_mem float32."""
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int64(batch),
+        )
+
+
+class AddMatmulV2CKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
+                 D: torch.Tensor, fifo_mem: torch.Tensor,
+                 batch: int | None = None) -> None:
+        """All tensors fp16; fifo_mem fp16."""
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int64(batch),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/matmul_add_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/matmul_add_c2v.cpp
new file mode 100644
index 00000000..e5cd3ef2
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/matmul_add_c2v.cpp
@@ -0,0 +1,235 @@
+// =============================================================================
+// matmul_add_c2v.cpp  —  C = A @ B + D   [pushpop variant]
+//
+// ── API variant: TileData TPUSH / TPOP ──────────────────────────────────────
+//
+//  raw_flag equivalent          │  pushpop (this file)
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  TSTORE(ws_half, c_l0)        │  TPUSH<C2VPipe, TileL0C, UP_DOWN>(pipe, c_l0)
+//  pipe_barrier(PIPE_ALL)       │    └─ stores AccTile::DType=float into slot
+//  SetCrossFlag<FIX>(FLAG_C2V)  │
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  WaitCrossFlag(FLAG_C2V)      │  TPOP<C2VPipe, VecTileFloat, UP_DOWN>(pipe, c_ub_float)
+//  TLOAD(c_ub_half, ws_half)    │    └─ receives float tile  ← dtype differs from raw_flag
+//  SetCrossFlag<MTE3>(FLAG_V2C) │
+//
+// Dtype note: TPUSH stores AccTile::DType=float32 into the slot (no fp32→fp16
+// in the TileData path).  Vec receives float32.  D must be float32 to match.
+// Output C is also float32.  Use gm_pipe variant for half-precision output.
+//
+// Python: A,B: float16;  D,C: float32.
+// Reference: (A @ B) + D  computed in float32.
+//
+// Slot type: float32, 64 KB/slot  (vs 32 KB half-slot in raw_flag / gm_pipe).
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip buffer offsets (bytes) ────────────────────────────────────────────
+constexpr uint32_t L1_B_OFFSET = 0u;
+constexpr uint32_t L1_A_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);   // 32 KB
+constexpr uint32_t L0_OFFSET   = 0u;
+constexpr uint32_t UB_D_OFFSET = 0u;  // d_ub_float: 32 KB (float, HALF_TILE rows)
+
+// ── FIFO configuration ────────────────────────────────────────────────────────
+// FIFO_DEPTH=1: workaround for the TPipe tileIndex sharing bug (see PTO_API_BUGS.md).
+// With FIFO_DEPTH=2 and TILE_UP_DOWN (2 Vec sub-blocks), both sub-blocks increment
+// the same tileIndex counter → FIFO slot selection drifts after round 1.
+// FIFO_DEPTH=1 forces SyncPeriod=1 and strict alternation (no double-buffer overlap),
+// which avoids the desync at the cost of no pipeline prefetch.
+constexpr uint32_t FIFO_DEPTH      = 1u;
+constexpr uint32_t C2V_SLOT_SIZE   = TILE_SIZE * TILE_SIZE * sizeof(float); // 64 KB
+constexpr uint32_t C2V_FIFO_BYTES  = FIFO_DEPTH * C2V_SLOT_SIZE;            // 64 KB/core
+constexpr uint32_t C2V_UB_BASE     = 0x20000;  // 128 KB offset: after d_ub (32 KB)
+
+// ── Tile types ────────────────────────────────────────────────────────────────
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+// Vec receives float (matching the float slot written by TPUSH).
+using VecTileFloat = Tile<TileType::Vec, float, HALF_TILE, TILE_SIZE,
+                          BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                          SLayout::NoneBox, 512, PadValue::Null>;
+// D is also float so TADD types match.
+using VecTileFloatD = Tile<TileType::Vec, float, HALF_TILE, TILE_SIZE,
+                           BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                           SLayout::NoneBox, 512, PadValue::Null>;
+
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+// D input and C output are declared as half in GM but loaded/stored as float
+// via float GlobalTensors with the same byte offset.
+using FloatHalfTileGlobal =
+    GlobalTensor<float,
+                 TileShape2D<float, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<float, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+// ─────────────────────────────────────────────────────────────────────────────
+AICORE void run_matmul_add_c2v(
+    __gm__ half    *A,
+    __gm__ half    *B,
+    __gm__ float   *C,        // float32 — matches VecTileFloat output dtype
+    __gm__ float   *D,        // float32 — must match VecTileFloat dtype
+    __gm__ uint8_t *fifo_mem,
+    int64_t         batch,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    __gm__ void *core_fifo = fifo_mem + cid * C2V_FIFO_BYTES;
+    C2VPipe pipe(core_fifo, /*c2v_ub_base=*/C2V_UB_BASE, /*v2c_l1_base=*/0x0);
+
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+
+    TileL0A a_l0;  TileL0B b_l0;  TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    VecTileFloatD d_ub;
+    TASSIGN(d_ub, UB_D_OFFSET);
+    VecTileFloat c_ub_float;  // TPOP assigns this internally via C2V_UB_BASE
+
+    // ── Cube: GEMM per round, push result to FIFO ─────────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+    TileGlobal b_global(B);
+    TLOAD(b_l1, b_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(b_l0, b_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+        TileGlobal a_global(A + row_c * TILE_SIZE);
+        TLOAD(a_l1, a_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(a_l0, a_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        TMATMUL(c_l0, a_l0, b_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);   // M→FIX: c_l0 ready before TPUSH stores it
+
+        TPUSH<C2VPipe, TileL0C, TileSplitAxis::TILE_UP_DOWN>(pipe, c_l0);
+        // └─ internally: TSTORE(GlobalTensor<float>, c_l0) + data-ready signal
+        // FIX→MTE2: drain the FIX TSTORE DMA before the next iteration's TLOAD
+        // starts.  Without this, the in-flight FIX DMA (reading c_l0) races with
+        // the next TMATMUL (M writing c_l0) → L0C read/write conflict.
+        // Mirrors the set_flag(PIPE_FIX, PIPE_MTE2) carry-over in the pto-isa
+        // reference test (tpushpop_cv_kernel.cpp, commit aef3a004).
+        pipe_barrier(PIPE_ALL);
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: pop GEMM result (float), add D (float), store C (half) ───────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        TPOP<C2VPipe, VecTileFloat, TileSplitAxis::TILE_UP_DOWN>(pipe, c_ub_float);
+        // └─ wait + TLOAD(GlobalTensor<float>, c_ub_float) + free notify
+
+        // Load D as float32 to match c_ub_float dtype for TADD.
+        FloatHalfTileGlobal d_global(D + row_v * TILE_SIZE);
+        TLOAD(d_ub, d_global);
+
+        pipe_barrier(PIPE_ALL);  // MTE2: both TPOP-TLOAD and D-TLOAD complete
+
+        TADD(c_ub_float, c_ub_float, d_ub);
+        pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TSTORE
+
+        // TSTORE float → float: Vec UB tile → GM C (float32 output).
+        // Note: Vec→GM TSTORE requires matching dtypes (no implicit conversion
+        // for the MTE3 pipe).  Float32 C output is the natural result of float
+        // FIFO slots in this pushpop variant.  See gm_pipe for half16 output.
+        using FloatTileGlobal =
+            GlobalTensor<float,
+                         TileShape2D<float, HALF_TILE, TILE_SIZE, Layout::ND>,
+                         BaseShape2D<float, HALF_TILE, TILE_SIZE, Layout::ND>,
+                         Layout::ND>;
+        FloatTileGlobal c_out(C + row_v * TILE_SIZE);
+        TSTORE(c_out, c_ub_float);
+        pipe_barrier(PIPE_ALL);  // MTE3: TSTORE complete before next round
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void matmul_add_c2v_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B, __gm__ uint8_t *C,
+    __gm__ uint8_t *D, __gm__ uint8_t *fifo_mem,
+    int64_t batch, uint64_t ffts_addr)
+{
+    run_matmul_add_c2v(
+        reinterpret_cast<__gm__ half    *>(A),
+        reinterpret_cast<__gm__ half    *>(B),
+        reinterpret_cast<__gm__ float   *>(C),
+        reinterpret_cast<__gm__ float   *>(D),
+        fifo_mem, batch, ffts_addr);
+}
+
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *fifo_mem, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    matmul_add_c2v_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, fifo_mem, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/run.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/run.py
new file mode 100644
index 00000000..cc4ff6b3
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/pushpop/run.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+Correctness tests and bandwidth benchmark for matmul_add/pushpop kernels.
+
+matmul_add_c2v  (pushpop):  C = A @ B + D
+  TPUSH<C2VPipe, TileL0C, TILE_UP_DOWN>(pipe, c_l0)  on Cube
+  TPOP<C2VPipe, VecTile<float>, TILE_UP_DOWN>(pipe, c_ub_float)  on Vec
+  Float32 slot (AccTile::DType=float); D:f32, C:f32.
+
+add_matmul_v2c  (pushpop):  C = (A + B) @ D
+  TPUSH<V2CPipe, TileVecUB, TILE_UP_DOWN>(pipe, a_ub)  on Vec
+  TPOP<V2CPipe, TileL1, TILE_UP_DOWN>(pipe, ab_l1)  on Cube
+  Half slot; all float16.
+
+FIFO_DEPTH=1 workaround (see PTO_API_BUGS.md Bug 1):
+  TPipe TileData TPUSH/TPOP with FIFO_DEPTH=2 and TILE_UP_DOWN is broken for
+  num_rounds > 1: both Vec sub-blocks share a single tileIndex counter, so the
+  FIFO slot selection drifts by 2× per logical round.
+  Setting FIFO_DEPTH=1 forces SyncPeriod=1 (strict alternation, no double-buffer
+  prefetch) which avoids the desync.  Multi-round benchmarking is now possible.
+
+NOTE on fresh fifo per call:
+  Reusing the same fifo_mem tensor across repeated calls accumulates TPipe
+  head/tail state stored inside fifo_mem.  The benchmark pre-allocates one fresh
+  fifo per call to prevent this.
+
+Usage:
+    python run.py
+    NPU_DEVICE=npu:5 python run.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util import (  # noqa: E402
+    load_matmul_add_c2v, load_add_matmul_v2c,
+    BLOCK_DIM, TILE_SIZE,
+    C2V_FIFO_ELEMS_PER_CORE, V2C_FIFO_ELEMS_PER_CORE,
+)
+
+RTOL = 1e-3
+ATOL = 1e-3
+
+
+# ── Correctness tests ──────────────────────────────────────────────────────────
+
+def test_matmul_add_c2v(kernel) -> None:
+    print("=" * 60)
+    print("matmul_add_c2v  pushpop  (C = A @ B + D)")
+    print("  TPUSH Cube(AccTile<float>) → TPOP Vec(VecTile<float>)")
+    print("  Float32 slot; D:f32, C:f32, A/B:f16  |  FIFO_DEPTH=1")
+    print("=" * 60)
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    passed = failed = 0
+
+    for seed in range(3):
+        for num_rounds in range(1, 9):
+            batch = num_rounds * wave_rows
+            torch.manual_seed(seed)
+            A = torch.randn(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+            B = torch.randn(TILE_SIZE, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+            D = torch.randn(batch, TILE_SIZE, dtype=torch.float32, device=_DEVICE)
+            C = torch.zeros(batch, TILE_SIZE, dtype=torch.float32, device=_DEVICE)
+            fifo = torch.zeros(BLOCK_DIM * C2V_FIFO_ELEMS_PER_CORE,
+                               dtype=torch.float32, device=_DEVICE)
+
+            kernel(A, B, C, D, fifo)
+            torch.npu.synchronize()
+
+            ref = (A.float() @ B.float()) + D
+            try:
+                torch.testing.assert_close(C, ref, rtol=RTOL, atol=ATOL)
+                passed += 1
+            except AssertionError as e:
+                failed += 1
+                if failed <= 3:
+                    print(f"  FAIL seed={seed} rounds={num_rounds}: {e}")
+
+    total  = passed + failed
+    status = "OK" if failed == 0 else f"FAILED ({failed}/{total})"
+    print(f"Correctness: {passed}/{total} passed — {status}\n")
+    if failed:
+        sys.exit(1)
+
+
+def test_add_matmul_v2c(kernel) -> None:
+    print("=" * 60)
+    print("add_matmul_v2c  pushpop  (C = (A + B) @ D)")
+    print("  TPUSH Vec(VecTile<half>) → TPOP Cube(TileL1<half>)")
+    print("  Half slot; all float16  |  FIFO_DEPTH=2  (num_rounds=1 scope)")
+    print("  V2C: FIFO_DEPTH=2 needed; tileIndex desync breaks rounds>1.")
+    print("=" * 60)
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    passed = failed = 0
+
+    # V2C remains scoped to num_rounds=1: FIFO_DEPTH=2 is required so both Vec
+    # sub-blocks can write without blocking at allocate(), but tileIndex desync
+    # (both sub-blocks share a single counter) breaks rounds > 1.
+    for seed in range(5):
+        batch = wave_rows  # num_rounds = 1
+        torch.manual_seed(seed)
+        A = torch.randn(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+        B = torch.randn(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+        D = torch.randn(TILE_SIZE, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+        C = torch.zeros(batch, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+        fifo = torch.zeros(BLOCK_DIM * V2C_FIFO_ELEMS_PER_CORE,
+                           dtype=torch.float16, device=_DEVICE)
+
+        kernel(A, B, C, D, fifo)
+        torch.npu.synchronize()
+
+        ref = ((A + B) @ D).half()
+        try:
+            torch.testing.assert_close(C, ref, rtol=1e-3, atol=1e-5)
+            passed += 1
+        except AssertionError as e:
+            failed += 1
+            if failed <= 3:
+                print(f"  FAIL seed={seed}: {e}")
+
+    total  = passed + failed
+    status = "OK" if failed == 0 else f"FAILED ({failed}/{total})"
+    print(f"Correctness (num_rounds=1): {passed}/{total} passed — {status}\n")
+    if failed:
+        sys.exit(1)
+
+
+# ── Bandwidth benchmarks ───────────────────────────────────────────────────────
+
+def _benchmark(kernel, name: str, fifo_dtype, fifo_elems_per_core: int,
+               make_tensors, warmup: int = 10, repeats: int = 30,
+               num_rounds_list: list | None = None) -> None:
+    if num_rounds_list is None:
+        num_rounds_list = [1, 2, 4, 8, 16, 32, 64]
+    print("=" * 60)
+    print(f"BENCHMARK  {name}  pushpop  (FIFO_DEPTH=1)")
+    print(f"  warmup={warmup}  repeats={repeats}")
+    print("=" * 60)
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    hdr = f"{'batch':>10}  {'rounds':>6}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(hdr)
+    print("-" * len(hdr))
+
+    records = []
+    for num_rounds in num_rounds_list:
+        batch = num_rounds * wave_rows
+        tensors = make_tensors(batch)
+        A, B, D = tensors["A"], tensors["B"], tensors["D"]
+        C = torch.zeros_like(tensors["C_ref"])
+
+        # Pre-allocate fresh fifo per call — avoids TPipe head/tail accumulation
+        n_calls = warmup + repeats
+        fifos = [torch.zeros(BLOCK_DIM * fifo_elems_per_core,
+                             dtype=fifo_dtype, device=_DEVICE)
+                 for _ in range(n_calls)]
+
+        for i in range(warmup):
+            kernel(A, B, C, D, fifos[i])
+        torch.npu.synchronize()
+
+        start = torch.npu.Event(enable_timing=True)
+        end   = torch.npu.Event(enable_timing=True)
+        start.record()
+        for i in range(repeats):
+            kernel(A, B, C, D, fifos[warmup + i])
+        end.record()
+        end.synchronize()
+
+        dur_us = start.elapsed_time(end) / repeats * 1e3
+        bytes_total = tensors["bytes"]
+        bw_gbs = bytes_total / dur_us * 1e-3
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  {dur_us:>10.2f}  {bw_gbs:>10.2f}")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            dur_us=dur_us, bw_gbs=bw_gbs))
+
+    peak_bw = max(r["bw_gbs"] for r in records)
+    print(f"\nPeak bandwidth: {peak_bw:.1f} GB/s  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+def benchmark_c2v(kernel) -> None:
+    def make(batch):
+        kw16 = dict(dtype=torch.float16, device=_DEVICE)
+        kw32 = dict(dtype=torch.float32, device=_DEVICE)
+        A = torch.randn(batch, TILE_SIZE, **kw16)
+        B = torch.randn(TILE_SIZE, TILE_SIZE, **kw16)
+        D = torch.randn(batch, TILE_SIZE, **kw32)
+        C_ref = torch.zeros(batch, TILE_SIZE, **kw32)
+        # bytes: A(f16) + B(f16) + D(f32) + C(f32)
+        nb = (batch * TILE_SIZE * 2 + TILE_SIZE * TILE_SIZE * 2
+              + batch * TILE_SIZE * 4 + batch * TILE_SIZE * 4)
+        return dict(A=A, B=B, D=D, C_ref=C_ref, bytes=nb)
+
+    _benchmark(kernel, "matmul_add_c2v  (C = A @ B + D)",
+               torch.float32, C2V_FIFO_ELEMS_PER_CORE, make)
+
+
+def benchmark_v2c(kernel) -> None:
+    def make(batch):
+        kw = dict(dtype=torch.float16, device=_DEVICE)
+        A = torch.randn(batch, TILE_SIZE, **kw)
+        B = torch.randn(batch, TILE_SIZE, **kw)
+        D = torch.randn(TILE_SIZE, TILE_SIZE, **kw)
+        C_ref = torch.zeros(batch, TILE_SIZE, **kw)
+        # bytes: A + B + D + C  (all f16)
+        nb = (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2
+        return dict(A=A, B=B, D=D, C_ref=C_ref, bytes=nb)
+
+    _benchmark(kernel, "add_matmul_v2c  (C = (A + B) @ D)",
+               torch.float16, V2C_FIFO_ELEMS_PER_CORE, make,
+               num_rounds_list=[1])  # rounds=1 only; tileIndex desync breaks rounds>1
+
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM={BLOCK_DIM}\n")
+
+    print("Compiling matmul_add_c2v ...")
+    c2v = load_matmul_add_c2v(verbose=True)
+    print()
+    print("Compiling add_matmul_v2c ...")
+    v2c = load_add_matmul_v2c(verbose=True)
+    print()
+
+    test_matmul_add_c2v(c2v)
+    test_add_matmul_v2c(v2c)
+
+    benchmark_c2v(c2v)
+    benchmark_v2c(v2c)
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/README.md b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/README.md
new file mode 100644
index 00000000..8ca5112b
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/README.md
@@ -0,0 +1,53 @@
+# matmul_add — C2V and V2C persistent kernels (ffts_cross_core_sync)
+
+Two complementary kernels demonstrating Cube↔Vec handshake via
+`ffts_cross_core_sync` / `wait_flag_dev`.  Both use `block_dim = num_cube_cores`
+(persistent style).
+
+## Kernels
+
+| Kernel | Operation | Stream direction |
+|--------|-----------|-----------------|
+| `matmul_add_c2v` | `C = A @ B + D` | Cube GEMM → workspace → Vec add |
+| `add_matmul_v2c` | `C = (A + B) @ D` | Vec add → workspace → Cube GEMM |
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `matmul_add_c2v.cpp` | C2V kernel source |
+| `jit_util_matmul_add_c2v.py` | JIT compile + ctypes loader (C2V) |
+| `run_matmul_add_c2v.py` | Correctness tests + benchmark (C2V) |
+| `add_matmul_v2c.cpp` | V2C kernel source |
+| `jit_util_add_matmul_v2c.py` | JIT compile + ctypes loader (V2C) |
+| `run_add_matmul_v2c.py` | Correctness tests + benchmark (V2C) |
+
+## Reproduce
+
+```bash
+cd examples/jit_cpp/cross_core_sync_demo/matmul_add
+
+# C2V: C = A @ B + D
+python run_matmul_add_c2v.py
+
+# V2C: C = (A + B) @ D
+python run_add_matmul_v2c.py
+
+# Choose a different NPU (default: npu:7)
+NPU_DEVICE=npu:5 python run_matmul_add_c2v.py
+NPU_DEVICE=npu:5 python run_add_matmul_v2c.py
+```
+
+## Expected output (910B2, 24 Cube cores)
+
+**C2V** (`matmul_add_c2v`):
+```
+Correctness: 30/30 passed — OK
+Peak bandwidth: 1401.3 GB/s
+```
+
+**V2C** (`add_matmul_v2c`):
+```
+Correctness: 30/30 passed — OK
+Peak bandwidth: 1593.8 GB/s
+```
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/add_matmul_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/add_matmul_v2c.cpp
new file mode 100644
index 00000000..6dce3c94
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/add_matmul_v2c.cpp
@@ -0,0 +1,290 @@
+// =============================================================================
+// add_matmul_v2c.cpp — Persistent kernel: C = (A + B) @ D (Vector-to-Cube stream)
+//
+// Computes  C[batch, T] = (A[batch, T] + B[batch, T]) @ D[T, T]  (fp16)
+// where T = TILE_SIZE = 128.
+//
+// Algorithm (persistent kernel, block_dim == num_cube_cores):
+//
+//   Vec sub-block (vid ∈ {0,1}), each owns HALF_TILE = T/2 rows — PRODUCER:
+//     For each round r:
+//       Load A slice → a_ub, Load B slice → b_ub  (prefetch, no dependency on ws)
+//       TADD  a_ub = a_ub + b_ub
+//       if r > 0: WaitCrossFlag(FLAG_C2V)  ← Cube freed workspace (loaded into L1)
+//       TSTORE a_ub → workspace[cid*T + vid*HT :]  (MTE3 pipe)
+//       SetCrossFlag FLAG_V2C              ← signal Cube: workspace tile written
+//
+//   Cube core (cid) — CONSUMER:
+//     Load D → d_l1 → d_l0  (once, constant weight)
+//     For each round r:
+//       WaitCrossFlag(FLAG_V2C)            ← Vec wrote workspace
+//       TLOAD workspace[cid*T:] → ab_l1   (MTE2 pipe)
+//       SetCrossFlag<PIPE_MTE2> FLAG_C2V   ← signal Vec: workspace freed (right
+//                                            after TLOAD, while GEMM is in flight)
+//       TMOV ab_l1 → ab_l0  (L1 → L0A)
+//       TMATMUL c_l0 = ab_l0 @ d_l0       (GEMM: (A+B) @ D)
+//       TSTORE c_l0 → C[row_c:]            (fp32 → fp16, FIX pipe)
+//
+// Cross-core flags (FFTS):
+//   FLAG_V2C = 1  Vec → Cube: workspace tile written, safe to read
+//   FLAG_C2V = 0  Cube → Vec: workspace tile consumed into L1, safe to overwrite
+//
+// Cube signals Vec via PIPE_MTE2 immediately after the workspace TLOAD — this
+// lets Vec begin loading A and B and computing A+B for the next round while Cube
+// is still executing the GEMM on the already-captured a_l1 data.
+//
+// Memory budget (per core):
+//   L1 (512 KB): d_l1 (32 KB at 0) + ab_l1 (32 KB at 32 KB) = 64 KB used
+//   L0A ( 64 KB): ab_l0 (32 KB at 0)
+//   L0B ( 64 KB): d_l0  (32 KB at 0)
+//   L0C (128 KB): c_l0  (64 KB at 0)
+//   UB  (192 KB): a_ub  (16 KB at 0) + b_ub (16 KB at 16 KB) = 32 KB used
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+// ── Tile dimensions ────────────────────────────────────────────────────────────
+#define TILE_SIZE 128   // rows/cols per matrix tile
+#define HALF_TILE  64   // rows per Vec sub-block  (TILE_SIZE / VEC_NUM)
+#define VEC_NUM     2   // Vec sub-blocks per Cube core
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip buffer base addresses (bytes) ─────────────────────────────────────
+// L1: d_l1 (constant weight D) followed by ab_l1 (workspace: A+B result)
+constexpr uint32_t L1_D_OFFSET  = 0u;
+constexpr uint32_t L1_AB_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+
+// L0A / L0B / L0C are independent scratchpads; each starts at byte 0
+constexpr uint32_t L0_OFFSET = 0u;
+
+// UB: a_ub and b_ub for the Vec add
+constexpr uint32_t UB_A_OFFSET = 0u;
+constexpr uint32_t UB_B_OFFSET = HALF_TILE * TILE_SIZE * sizeof(half);   // 16 KB
+
+// ── Cross-core FFTS flags ──────────────────────────────────────────────────────
+constexpr int32_t FLAG_C2V = 0;  // Cube → Vec: workspace slot consumed into L1
+constexpr int32_t FLAG_V2C = 1;  // Vec → Cube: workspace tile written to GM
+
+// ── Tile type aliases ──────────────────────────────────────────────────────────
+// L1 tile — NZ (ColMajor/RowMajor) layout required by the Cube engine
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+// L0 tiles — one per independent Cube scratchpad
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;  // fp32 accumulator
+
+// UB Vec tile — row-major, HALF_TILE rows × TILE_SIZE cols, fp16
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+// GlobalTensor aliases — contiguous 2D row-major in GM
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// ── Cross-core sync helpers ────────────────────────────────────────────────────
+template <pipe_t Pipe>
+AICORE inline void SetCrossFlag(int32_t flag) {
+    ffts_cross_core_sync(Pipe, 1 | (VEC_NUM << 4) | (flag << 8));
+}
+
+AICORE inline void WaitCrossFlag(int32_t flag) {
+    wait_flag_dev(flag);
+}
+
+// ── Intra-pipe sync helpers ────────────────────────────────────────────────────
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) {
+    set_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) {
+    wait_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+// ── Kernel implementation ──────────────────────────────────────────────────────
+AICORE void run_add_matmul_v2c(
+    __gm__ half    *A,          // [batch, TILE_SIZE]              input
+    __gm__ half    *B,          // [batch, TILE_SIZE]              input
+    __gm__ half    *C,          // [batch, TILE_SIZE]              output
+    __gm__ half    *D,          // [TILE_SIZE, TILE_SIZE]          weight (constant)
+    __gm__ half    *workspace,  // [num_cores*TILE_SIZE, TILE_SIZE] V2C buffer
+    int64_t         batch,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());   // Cube core id
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());  // Vec sub-block: 0 or 1
+    const int32_t num_cores = static_cast<int32_t>(block_num);         // launched Cube cores
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    // ── Allocate on-chip buffers ───────────────────────────────────────────────
+    TileL1  d_l1, ab_l1;
+    TASSIGN(d_l1,  L1_D_OFFSET);
+    TASSIGN(ab_l1, L1_AB_OFFSET);
+
+    TileL0A ab_l0;
+    TileL0B d_l0;
+    TileL0C c_l0;
+    TASSIGN(ab_l0, L0_OFFSET);
+    TASSIGN(d_l0,  L0_OFFSET);
+    TASSIGN(c_l0,  L0_OFFSET);
+
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    // ── Cube core: GEMM ───────────────────────────────────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+    // Load the constant weight D once — reused for every round.
+    TileGlobal d_global(D);
+    TLOAD(d_l1, d_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(d_l0, d_l1);                   // L1 → L0B (MTE1 pipe)
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+        // Wait for both Vec sub-blocks to write their halves of the workspace tile.
+        WaitCrossFlag(FLAG_V2C);
+
+        // Load workspace (A+B sum) from GM → ab_l1  (MTE2 pipe).
+        TileGlobal ws_in(workspace + cid * TILE_SIZE * TILE_SIZE);
+        TLOAD(ab_l1, ws_in);
+
+        // Signal Vec immediately after workspace TLOAD (via MTE2): the workspace slot
+        // is now captured in L1, Vec can overwrite it for the next round.
+        // This fires in the MTE2 pipe right after the preceding TLOAD — the GEMM and
+        // C store (MTE1, M, FIX pipes) run concurrently with Vec's next A+B load.
+        SetCrossFlag<PIPE_MTE2>(FLAG_C2V);
+
+        // Sync MTE2 → MTE1: wait for TLOAD to finish before TMOV reads L1.
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+
+        TMOV(ab_l0, ab_l1);             // L1 → L0A  (MTE1 pipe)
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0); // M pipe waits for ab_l0 to be ready
+
+        // GEMM: c_l0 = (A+B) @ D  (initialises c_l0)
+        TMATMUL(c_l0, ab_l0, d_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);  // M→FIX: c_l0 ready for TSTORE
+
+        // Store result (fp32 → fp16) to global memory C.
+        TileGlobal c_global(C + row_c * TILE_SIZE);
+        TSTORE(c_global, c_l0);
+        // Drain FIX pipe before the loop back-edge (or kernel exit on the last
+        // round): the next TMATMUL writes c_l0, so FIX must finish reading it.
+        // Back-to-back benchmark invocations would otherwise trigger an L0C
+        // read/write conflict (same pattern that raw_flag matmul_add_c2v avoids
+        // with its pipe_barrier before SetCrossFlag).
+        pipe_barrier(PIPE_ALL);
+    }
+    // pipe_barrier(PIPE_ALL) inside the loop already drained the last round.
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec sub-block: element-wise add + store to workspace ──────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    // This sub-block's fixed workspace row offset (constant across rounds).
+    const int32_t ws_row = cid * TILE_SIZE + vid * HALF_TILE;
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        // Load A and B slices — independent of the workspace handshake; prefetch
+        // them while Cube may still be draining the previous round's workspace.
+        HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+        TLOAD(a_ub, a_global);
+
+        HalfTileGlobal b_global(B + row_v * TILE_SIZE);
+        TLOAD(b_ub, b_global);
+
+        pipe_barrier(PIPE_ALL);  // MTE2→V: both TLOADs done before TADD
+
+        // Compute element-wise sum: a_ub = A + B
+        TADD(a_ub, a_ub, b_ub);
+        pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TSTORE
+
+        // Wait for Cube to signal that it has loaded the previous workspace tile
+        // into L1 (slot is free to overwrite). Round 0: no previous tile, skip.
+        if (r > 0) {
+            WaitCrossFlag(FLAG_C2V);
+            pipe_barrier(PIPE_ALL);
+        }
+
+        // Write (A+B) sum to the workspace slot for this sub-block.
+        HalfTileGlobal ws_out(workspace + ws_row * TILE_SIZE);
+        TSTORE(ws_out, a_ub);
+        pipe_barrier(PIPE_ALL);  // MTE3: TSTORE complete before SetCrossFlag
+
+        // Signal Cube: workspace tile is fully written, safe to read.
+        SetCrossFlag<PIPE_MTE3>(FLAG_V2C);
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry point ─────────────────────────────────────────────────────────
+extern "C" __global__ AICORE void add_matmul_v2c_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *B,
+    __gm__ uint8_t *C,
+    __gm__ uint8_t *D,
+    __gm__ uint8_t *workspace,
+    int64_t         batch,
+    uint64_t        ffts_addr)
+{
+    run_add_matmul_v2c(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C),
+        reinterpret_cast<__gm__ half *>(D),
+        reinterpret_cast<__gm__ half *>(workspace),
+        batch, ffts_addr);
+}
+
+// ── Host-side launcher (called from Python via ctypes) ─────────────────────────
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *workspace, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    add_matmul_v2c_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, workspace, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_add_matmul_v2c.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_add_matmul_v2c.py
new file mode 100644
index 00000000..ad16423c
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_add_matmul_v2c.py
@@ -0,0 +1,140 @@
+"""JIT compile + load utility for add_matmul_v2c kernel.
+
+Compiles add_matmul_v2c.cpp with bisheng and returns a Python callable.
+
+Usage:
+    from jit_util_add_matmul_v2c import compile_and_load, BLOCK_DIM
+    kernel = compile_and_load()
+    kernel(A, B, C, D, workspace, batch)
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_CPP_FILE = os.path.join(_HERE, "add_matmul_v2c.cpp")
+_LIB_FILE = os.path.join(_HERE, "add_matmul_v2c.so")
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+_PTO_INC = os.path.join(PTO_LIB_PATH, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+
+def _compile(verbose: bool = True) -> str:
+    flags = [
+        "-fPIC",
+        "-shared",
+        "-xcce",
+        "-DMEMORY_BASE",
+        "-O2",
+        "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined",
+        "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+
+    cmd = ["bisheng", *flags, _CPP_FILE, "-o", _LIB_FILE]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {_LIB_FILE}")
+    return _LIB_FILE
+
+
+def _load_lib(lib_path: str) -> ctypes.CDLL:
+    lib = ctypes.CDLL(os.path.abspath(lib_path))
+    # void call(uint32_t block_dim, void *stream,
+    #            uint8_t *A, uint8_t *B, uint8_t *C,
+    #            uint8_t *D, uint8_t *workspace, int64_t batch)
+    lib.call.argtypes = [
+        ctypes.c_uint32,   # block_dim
+        ctypes.c_void_p,   # stream
+        ctypes.c_void_p,   # A
+        ctypes.c_void_p,   # B
+        ctypes.c_void_p,   # C
+        ctypes.c_void_p,   # D
+        ctypes.c_void_p,   # workspace
+        ctypes.c_int64,    # batch
+    ]
+    lib.call.restype = None
+    return lib
+
+
+@lru_cache(maxsize=1)
+def compile_and_load(verbose: bool = True) -> "AddMatmulV2CKernel":
+    lib_path = _compile(verbose=verbose)
+    lib = _load_lib(lib_path)
+    return AddMatmulV2CKernel(lib, BLOCK_DIM)
+
+
+class AddMatmulV2CKernel:
+    """Callable wrapper around the add_matmul_v2c ctypes kernel.
+
+    Parameters
+    ----------
+    lib       : ctypes.CDLL loaded from the compiled .so
+    block_dim : number of Cube cores to launch (== num physical cores)
+    """
+
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor,
+        workspace: torch.Tensor,
+        batch: int | None = None,
+    ) -> None:
+        """Launch the kernel in-place (result written to C).
+
+        All tensors must be on the same NPU device, contiguous, and fp16.
+        workspace must be at least [block_dim * TILE_SIZE, TILE_SIZE] fp16.
+        """
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call(
+            self._block_dim,
+            stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(workspace.data_ptr()),
+            ctypes.c_int64(batch),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_matmul_add_c2v.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_matmul_add_c2v.py
new file mode 100644
index 00000000..6cac1f72
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/jit_util_matmul_add_c2v.py
@@ -0,0 +1,145 @@
+"""JIT compile + load utility for matmul_add_c2v kernel.
+
+Compiles matmul_add_c2v.cpp with bisheng and returns a Python callable that
+invokes the kernel via ctypes.
+
+Usage:
+    from jit_util_matmul_add_c2v import compile_and_load, BLOCK_DIM
+    kernel = compile_and_load()
+    kernel(A, B, C, D, workspace, batch)
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_CPP_FILE = os.path.join(_HERE, "matmul_add_c2v.cpp")
+_LIB_FILE = os.path.join(_HERE, "matmul_add_c2v.so")
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+_PTO_INC = os.path.join(PTO_LIB_PATH, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+# Determine the number of Cube cores on the current NPU.
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24  # 910B2 default
+
+
+def _compile(verbose: bool = True) -> str:
+    """Compile the kernel and return the .so path."""
+    flags = [
+        "-fPIC",
+        "-shared",
+        "-xcce",
+        "-DMEMORY_BASE",
+        "-O2",
+        "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined",
+        "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+
+    cmd = ["bisheng", *flags, _CPP_FILE, "-o", _LIB_FILE]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {_LIB_FILE}")
+    return _LIB_FILE
+
+
+def _load_lib(lib_path: str) -> ctypes.CDLL:
+    """Load the .so and bind the `call` symbol."""
+    lib = ctypes.CDLL(os.path.abspath(lib_path))
+    # void call(uint32_t block_dim, void *stream,
+    #            uint8_t *A, uint8_t *B, uint8_t *C,
+    #            uint8_t *D, uint8_t *workspace, int64_t batch)
+    lib.call.argtypes = [
+        ctypes.c_uint32,   # block_dim
+        ctypes.c_void_p,   # stream
+        ctypes.c_void_p,   # A
+        ctypes.c_void_p,   # B
+        ctypes.c_void_p,   # C
+        ctypes.c_void_p,   # D
+        ctypes.c_void_p,   # workspace
+        ctypes.c_int64,    # batch
+    ]
+    lib.call.restype = None
+    return lib
+
+
+@lru_cache(maxsize=1)
+def compile_and_load(verbose: bool = True) -> "MatmulAddC2VKernel":
+    """Compile (if needed) and return a callable kernel wrapper."""
+    lib_path = _compile(verbose=verbose)
+    lib = _load_lib(lib_path)
+    return MatmulAddC2VKernel(lib, BLOCK_DIM)
+
+
+class MatmulAddC2VKernel:
+    """Callable wrapper around the matmul_add_c2v ctypes kernel.
+
+    Parameters
+    ----------
+    lib       : ctypes.CDLL loaded from the compiled .so
+    block_dim : number of Cube cores to launch (== num physical cores)
+    """
+
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor,
+        workspace: torch.Tensor,
+        batch: int | None = None,
+    ) -> None:
+        """Launch the kernel in-place (result written to C).
+
+        All tensors must be on the same NPU device, contiguous, and fp16.
+        workspace must be at least [block_dim * TILE_SIZE, TILE_SIZE] fp16.
+        """
+        if batch is None:
+            batch = A.shape[0]
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call(
+            self._block_dim,
+            stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(C.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(workspace.data_ptr()),
+            ctypes.c_int64(batch),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/matmul_add_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/matmul_add_c2v.cpp
new file mode 100644
index 00000000..24c3e9e7
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/matmul_add_c2v.cpp
@@ -0,0 +1,283 @@
+// =============================================================================
+// matmul_add_c2v.cpp — Persistent kernel: C = A @ B + D (Cube-to-Vec stream)
+//
+// Computes  C[batch, T] = A[batch, T] @ B[T, T] + D[batch, T]
+// where T = TILE_SIZE = 128.
+//
+// Algorithm (persistent kernel, block_dim == num_cube_cores):
+//   Each "round" all Cube cores process one wave of TILE_SIZE rows in parallel.
+//   After num_rounds waves the full batch is consumed.
+//
+//   Cube core (cid):
+//     1. Load B[0:T, 0:T] → L1 → L0B  (once, before the loop)
+//     2. For each round r:
+//          a. Load A[row_c:row_c+T, :] → L1 → L0A
+//          b. GEMM: c_l0 = A @ B  (L0C ← L0A × L0B)
+//          c. If r > 0: wait for Vec signal (FLAG_V2C) confirming workspace freed
+//          d. TSTORE c_l0 → workspace[cid*T : (cid+1)*T, :]  (FIX pipe, f32→f16)
+//          e. SetCrossFlag FLAG_C2V  (signals both Vec sub-blocks: tile ready)
+//
+//   Vec sub-block (cid, vid ∈ {0,1}):
+//     Each sub-block owns half the tile rows: vid*T/2 .. (vid+1)*T/2
+//     For each round r:
+//          a. WaitCrossFlag FLAG_C2V  (Cube has written workspace)
+//          b. TLOAD workspace slice → c_ub
+//          c. TLOAD D slice         → d_ub
+//          d. pipe_barrier(ALL)  — both loads complete
+//          e. SetCrossFlag FLAG_V2C  (signals Cube: workspace slot freed)
+//          f. TADD c_ub = c_ub + d_ub
+//          g. TSTORE c_ub → C output
+//
+// Cross-core sync uses FFTS (Fast Fine-grained Task Synchronization).
+// Flag mode = VEC_NUM = 2:
+//   • Cube sends FLAG_C2V once; hardware delivers it to both Vec sub-blocks.
+//   • Each Vec sub-block sends FLAG_V2C once; Cube unblocks after VEC_NUM signals.
+//
+// Memory budget:
+//   L1  (512 KB): b_l1 (32 KB at 0) + a_l1 (32 KB at 32 KB) = 64 KB used
+//   L0A ( 64 KB): a_l0 (32 KB at 0)
+//   L0B ( 64 KB): b_l0 (32 KB at 0)
+//   L0C (128 KB): c_l0 (64 KB at 0)
+//   UB  (192 KB): c_ub (16 KB at 0) + d_ub (16 KB at 16 KB) = 32 KB used
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+// ── Tile dimensions ────────────────────────────────────────────────────────────
+#define TILE_SIZE 128   // rows/cols per matrix tile
+#define HALF_TILE  64   // rows per Vec sub-block  (TILE_SIZE / VEC_NUM)
+#define VEC_NUM     2   // Vec sub-blocks per Cube core
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip buffer base addresses (bytes) ─────────────────────────────────────
+// L1: two back-to-back TILE_SIZE×TILE_SIZE half tiles
+constexpr uint32_t L1_B_OFFSET = 0u;
+constexpr uint32_t L1_A_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+
+// L0A / L0B / L0C are independent scratchpads; each starts at byte 0
+constexpr uint32_t L0_OFFSET = 0u;
+
+// UB: two HALF_TILE×TILE_SIZE half tiles
+constexpr uint32_t UB_C_OFFSET = 0u;
+constexpr uint32_t UB_D_OFFSET = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+// ── Cross-core FFTS flags ──────────────────────────────────────────────────────
+constexpr int32_t FLAG_C2V = 0;  // Cube → Vec: GEMM result written to workspace
+constexpr int32_t FLAG_V2C = 1;  // Vec → Cube: workspace slot has been read
+
+// ── Tile type aliases ──────────────────────────────────────────────────────────
+// L1 tile — NZ (ColMajor/RowMajor) layout required by the Cube engine
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+// L0 tiles — one per independent Cube scratchpad
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;  // fp32 accumulator
+
+// UB Vec tile — row-major, HALF_TILE rows × TILE_SIZE cols, fp16
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+// GlobalTensor aliases — contiguous 2D row-major in GM
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// ── Cross-core sync helpers ────────────────────────────────────────────────────
+// SetCrossFlag: insert a signal into `Pipe`'s instruction stream.
+//   mode = VEC_NUM means:
+//     • When Cube signals FLAG_C2V: one call unblocks all VEC_NUM Vec sub-blocks.
+//     • When Vec signals FLAG_V2C: Cube unblocks after receiving VEC_NUM signals
+//       (one per sub-block).
+template <pipe_t Pipe>
+AICORE inline void SetCrossFlag(int32_t flag) {
+    ffts_cross_core_sync(Pipe, 1 | (VEC_NUM << 4) | (flag << 8));
+}
+
+AICORE inline void WaitCrossFlag(int32_t flag) {
+    wait_flag_dev(flag);
+}
+
+// ── Intra-pipe sync helpers ────────────────────────────────────────────────────
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) {
+    set_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) {
+    wait_flag(Src, Dst, static_cast<event_t>(id));
+}
+
+// ── Kernel implementation ──────────────────────────────────────────────────────
+AICORE void run_matmul_add_c2v(
+    __gm__ half    *A,          // [batch, TILE_SIZE]              input matrix
+    __gm__ half    *B,          // [TILE_SIZE, TILE_SIZE]          weight (constant)
+    __gm__ half    *C,          // [batch, TILE_SIZE]              output
+    __gm__ half    *D,          // [batch, TILE_SIZE]              bias
+    __gm__ half    *workspace,  // [num_cores*TILE_SIZE, TILE_SIZE] C2V buffer
+    int64_t         batch,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());   // Cube core id
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());  // Vec sub-block: 0 or 1
+    const int32_t num_cores = static_cast<int32_t>(block_num);         // launched Cube cores
+
+    set_ffts_base_addr(ffts_addr);
+
+    // One wave processes (num_cores × TILE_SIZE) rows across all cores.
+    const int32_t wave_rows  = num_cores * TILE_SIZE;
+    const int32_t num_rounds = static_cast<int32_t>(batch) / wave_rows;
+
+    // ── Allocate on-chip buffers ───────────────────────────────────────────────
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+
+    TileL0A a_l0;
+    TileL0B b_l0;
+    TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    TileVecUB c_ub, d_ub;
+    TASSIGN(c_ub, UB_C_OFFSET);
+    TASSIGN(d_ub, UB_D_OFFSET);
+
+    // ── Cube core: GEMM ───────────────────────────────────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+    // Load the constant weight matrix B once — reused for every round.
+    TileGlobal b_global(B);
+    TLOAD(b_l1, b_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(b_l0, b_l1);                   // L1 → L0B (MTE1 pipe)
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);     // M pipe waits for b_l0 to be ready
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        // Row offset in A for this core + round
+        const int32_t row_c = r * wave_rows + cid * TILE_SIZE;
+
+        // Load A tile: GM → L1 → L0A
+        TileGlobal a_global(A + row_c * TILE_SIZE);
+        TLOAD(a_l1, a_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(a_l0, a_l1);               // L1 → L0A (MTE1 pipe)
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0); // M pipe waits for a_l0 to be ready
+
+        // GEMM: c_l0 = A @ B  (initialises c_l0 — no prior accumulation)
+        TMATMUL(c_l0, a_l0, b_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);  // M→FIX: c_l0 ready for TSTORE
+
+        // Wait for both Vec sub-blocks to finish reading the workspace slot
+        // from the *previous* round before overwriting it. Skip on round 0.
+        if (r > 0) {
+            WaitCrossFlag(FLAG_V2C);
+            pipe_barrier(PIPE_ALL);  // flush all pipes before issuing TSTORE
+        }
+
+        // Write GEMM result to workspace (fp32 → fp16 conversion via FIX pipe).
+        TileGlobal ws_out(workspace + cid * TILE_SIZE * TILE_SIZE);
+        TSTORE(ws_out, c_l0);
+        pipe_barrier(PIPE_ALL);  // FIX: TSTORE complete before SetCrossFlag fires
+        SetCrossFlag<PIPE_FIX>(FLAG_C2V);
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec sub-block: add bias + store result ─────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    // This sub-block's workspace row offset (fixed across all rounds).
+    const int32_t ws_row = cid * TILE_SIZE + vid * HALF_TILE;
+
+    for (int32_t r = 0; r < num_rounds; ++r) {
+        // Global output row this sub-block writes
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        // Wait for Cube: workspace slot contains a fresh GEMM result.
+        WaitCrossFlag(FLAG_C2V);
+
+        // Load GEMM result and D from GM → UB (both loads can issue in parallel).
+        HalfTileGlobal ws_in(workspace + ws_row * TILE_SIZE);
+        TLOAD(c_ub, ws_in);
+
+        HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+        TLOAD(d_ub, d_global);
+
+        pipe_barrier(PIPE_ALL);  // MTE2→V+MTE3: both TLOADs done before signal and TADD
+
+        // Signal Cube: workspace slot is consumed — safe to overwrite next round.
+        SetCrossFlag<PIPE_MTE3>(FLAG_V2C);
+
+        // C = GEMM_result + D  (Vec engine, element-wise)
+        TADD(c_ub, c_ub, d_ub);
+        pipe_barrier(PIPE_ALL);  // V→MTE3: TADD done before TSTORE
+
+        // Store result to global memory C.
+        HalfTileGlobal c_out(C + row_v * TILE_SIZE);
+        TSTORE(c_out, c_ub);
+        pipe_barrier(PIPE_ALL);  // MTE3: TSTORE complete before next round
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry point ─────────────────────────────────────────────────────────
+extern "C" __global__ AICORE void matmul_add_c2v_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *B,
+    __gm__ uint8_t *C,
+    __gm__ uint8_t *D,
+    __gm__ uint8_t *workspace,
+    int64_t         batch,
+    uint64_t        ffts_addr)
+{
+    run_matmul_add_c2v(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(C),
+        reinterpret_cast<__gm__ half *>(D),
+        reinterpret_cast<__gm__ half *>(workspace),
+        batch, ffts_addr);
+}
+
+// ── Host-side launcher (called from Python via ctypes) ─────────────────────────
+extern "C" void call(uint32_t block_dim, void *stream,
+                     uint8_t *A, uint8_t *B, uint8_t *C,
+                     uint8_t *D, uint8_t *workspace, int64_t batch)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    matmul_add_c2v_kernel<<<block_dim, nullptr, stream>>>(
+        A, B, C, D, workspace, batch, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_add_matmul_v2c.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_add_matmul_v2c.py
new file mode 100644
index 00000000..74f65f10
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_add_matmul_v2c.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Correctness test and bandwidth benchmark for add_matmul_v2c kernel.
+
+Algorithm: C = (A + B) @ D
+  A : [batch, T]     fp16
+  B : [batch, T]     fp16
+  C : [batch, T]     fp16   (output)
+  D : [T, T]         fp16   (constant weight)
+  workspace: [num_cores * T, T]  fp16  (V2C communication buffer)
+  where T = TILE_SIZE = 128
+
+batch must be a multiple of (num_cores * T).
+
+Usage:
+    python run_add_matmul_v2c.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util_add_matmul_v2c import compile_and_load, BLOCK_DIM  # noqa: E402
+
+TILE_SIZE = 128
+DTYPE = torch.float16
+COMMON_KWARGS = dict(dtype=DTYPE, device=_DEVICE)
+
+RTOL = 1e-3
+ATOL = 1e-5
+
+
+def make_workspace() -> torch.Tensor:
+    return torch.empty(BLOCK_DIM * TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+
+
+def run_kernel(kernel, A, B, D):
+    C = torch.zeros_like(A)
+    ws = make_workspace()
+    kernel(A, B, C, D, ws)
+    torch.npu.synchronize()
+    return C
+
+
+def ref(A, B, D):
+    return ((A + B) @ D).to(DTYPE)
+
+
+# ── Correctness tests ─────────────────────────────────────────────────────────
+
+def test_correctness(kernel) -> None:
+    print("=" * 60)
+    print("CORRECTNESS TESTS")
+    print("=" * 60)
+
+    passed = failed = 0
+    for seed in range(3):
+        for num_rounds in range(1, 11):
+            batch = num_rounds * BLOCK_DIM * TILE_SIZE
+            torch.manual_seed(seed)
+            A = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+            B = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+            D = torch.randn(TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+
+            C_kernel = run_kernel(kernel, A, B, D)
+            C_ref    = ref(A, B, D)
+
+            try:
+                torch.testing.assert_close(C_kernel, C_ref, rtol=RTOL, atol=ATOL)
+                passed += 1
+            except AssertionError as e:
+                failed += 1
+                if failed <= 5:
+                    print(f"  FAIL  seed={seed} num_rounds={num_rounds} batch={batch}: {e}")
+
+    total = passed + failed
+    status = "OK" if failed == 0 else f"FAILED ({failed}/{total})"
+    print(f"\nCorrectness: {passed}/{total} passed — {status}\n")
+    if failed:
+        sys.exit(1)
+
+
+# ── Benchmark ─────────────────────────────────────────────────────────────────
+
+def benchmark(kernel, warmup: int = 10, repeats: int = 30) -> None:
+    print("=" * 60)
+    print(f"BENCHMARK  (warmup={warmup}, repeats={repeats})")
+    print("=" * 60)
+    header = f"{'batch':>10}  {'rounds':>6}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    records = []
+    for num_rounds in [1, 2, 4, 8, 16, 32, 64]:
+        batch = num_rounds * BLOCK_DIM * TILE_SIZE
+        torch.manual_seed(0)
+        A = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+        B = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+        D = torch.randn(TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+        C = torch.zeros_like(A)
+        ws = make_workspace()
+
+        for _ in range(warmup):
+            kernel(A, B, C, D, ws)
+        torch.npu.synchronize()
+
+        start = torch.npu.Event(enable_timing=True)
+        end   = torch.npu.Event(enable_timing=True)
+        start.record()
+        for _ in range(repeats):
+            kernel(A, B, C, D, ws)
+        end.record()
+        end.synchronize()
+
+        dur_us = start.elapsed_time(end) / repeats * 1e3
+
+        # Bytes accessed: read A + read B + read D + write C  (fp16 = 2 bytes)
+        bytes_total = (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2
+        bw_gbs = bytes_total / dur_us * 1e-3
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  {dur_us:>10.2f}  {bw_gbs:>10.2f}")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            dur_us=dur_us, bw_gbs=bw_gbs))
+
+    peak_bw = max(r["bw_gbs"] for r in records)
+    print(f"\nPeak bandwidth: {peak_bw:.1f} GB/s  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)")
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM (num Cube cores): {BLOCK_DIM}\n")
+
+    kernel = compile_and_load(verbose=True)
+    print()
+
+    test_correctness(kernel)
+    benchmark(kernel)
diff --git a/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_matmul_add_c2v.py b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_matmul_add_c2v.py
new file mode 100644
index 00000000..c024e99f
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/matmul_add/raw_flag/run_matmul_add_c2v.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Correctness test and bandwidth benchmark for matmul_add_c2v kernel.
+
+Algorithm: C = A @ B + D
+  A : [batch, T]     fp16
+  B : [T, T]         fp16   (constant weight)
+  C : [batch, T]     fp16   (output)
+  D : [batch, T]     fp16
+  workspace: [num_cores * T, T]  fp16  (C2V communication buffer)
+  where T = TILE_SIZE = 128
+
+batch must be a multiple of (num_cores * T).
+
+Usage:
+    python run_matmul_add_c2v.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+# ── Device setup ──────────────────────────────────────────────────────────────
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+
+# Inject the device for jit_util's BLOCK_DIM query before importing it
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util_matmul_add_c2v import compile_and_load, BLOCK_DIM  # noqa: E402
+
+TILE_SIZE = 128
+DTYPE = torch.float16
+COMMON_KWARGS = dict(dtype=DTYPE, device=_DEVICE)
+
+RTOL = 1e-3
+ATOL = 1e-5
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def make_workspace() -> torch.Tensor:
+    return torch.empty(BLOCK_DIM * TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+
+
+def run_kernel(kernel, A, B, D):
+    """Allocate output C and workspace, launch kernel, return C."""
+    C = torch.zeros_like(A)
+    ws = make_workspace()
+    kernel(A, B, C, D, ws)
+    torch.npu.synchronize()
+    return C
+
+
+def ref(A, B, D):
+    return (A @ B + D).to(DTYPE)
+
+
+# ── Correctness tests ─────────────────────────────────────────────────────────
+
+def test_correctness(kernel) -> None:
+    print("=" * 60)
+    print("CORRECTNESS TESTS")
+    print("=" * 60)
+
+    passed = failed = 0
+    for seed in range(3):          # run each shape 3 times (indeterminate sync bugs show up)
+        for num_rounds in range(1, 11):
+            batch = num_rounds * BLOCK_DIM * TILE_SIZE
+            torch.manual_seed(seed)
+            A = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+            B = torch.randn(TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+            D = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+
+            C_kernel = run_kernel(kernel, A, B, D)
+            C_ref    = ref(A, B, D)
+
+            try:
+                torch.testing.assert_close(C_kernel, C_ref, rtol=RTOL, atol=ATOL)
+                passed += 1
+            except AssertionError as e:
+                failed += 1
+                if failed <= 5:
+                    print(f"  FAIL  seed={seed} num_rounds={num_rounds} batch={batch}: {e}")
+
+    total = passed + failed
+    status = "OK" if failed == 0 else f"FAILED ({failed}/{total})"
+    print(f"\nCorrectness: {passed}/{total} passed — {status}\n")
+    if failed:
+        sys.exit(1)
+
+
+# ── Benchmark ─────────────────────────────────────────────────────────────────
+
+def benchmark(kernel, warmup: int = 10, repeats: int = 30) -> None:
+    print("=" * 60)
+    print(f"BENCHMARK  (warmup={warmup}, repeats={repeats})")
+    print("=" * 60)
+    header = f"{'batch':>10}  {'rounds':>6}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    records = []
+    for num_rounds in [1, 2, 4, 8, 16, 32, 64]:
+        batch = num_rounds * BLOCK_DIM * TILE_SIZE
+        torch.manual_seed(0)
+        A = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+        B = torch.randn(TILE_SIZE, TILE_SIZE, **COMMON_KWARGS)
+        D = torch.randn(batch, TILE_SIZE, **COMMON_KWARGS)
+        C = torch.zeros_like(A)
+        ws = make_workspace()
+
+        # Warm-up
+        for _ in range(warmup):
+            kernel(A, B, C, D, ws)
+        torch.npu.synchronize()
+
+        # Timed loop
+        start = torch.npu.Event(enable_timing=True)
+        end   = torch.npu.Event(enable_timing=True)
+        start.record()
+        for _ in range(repeats):
+            kernel(A, B, C, D, ws)
+        end.record()
+        end.synchronize()
+
+        dur_us = start.elapsed_time(end) / repeats * 1e3  # ms → µs
+
+        # Bytes accessed: read A + read B + read D + write C
+        # (workspace is on-chip between Cube/Vec, so not counted as GM traffic)
+        bytes_total = (batch * TILE_SIZE * 3 + TILE_SIZE * TILE_SIZE) * 2  # fp16 = 2 B
+        bw_gbs = bytes_total / dur_us * 1e-3  # bytes/µs = GB/s
+
+        print(f"{batch:>10d}  {num_rounds:>6d}  {dur_us:>10.2f}  {bw_gbs:>10.2f}")
+        records.append(dict(batch=batch, num_rounds=num_rounds,
+                            dur_us=dur_us, bw_gbs=bw_gbs))
+
+    peak_bw = max(r["bw_gbs"] for r in records)
+    print(f"\nPeak bandwidth: {peak_bw:.1f} GB/s  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)")
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM (num Cube cores): {BLOCK_DIM}\n")
+
+    kernel = compile_and_load(verbose=True)
+    print()
+
+    test_correctness(kernel)
+    benchmark(kernel)
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/README.md b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/README.md
new file mode 100644
index 00000000..a9a7c736
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/README.md
@@ -0,0 +1,72 @@
+# stream_c2v_v2c — Three Cube↔Vector synchronization API styles
+
+Bandwidth microbenchmarks for the Cube↔Vector workspace handshake, implemented
+in three API styles for direct comparison.
+
+## Three API Variants
+
+| Variant | Sync API | Data API | C2V slot | V2C slot |
+|---------|----------|----------|----------|----------|
+| `raw_flag` | `ffts_cross_core_sync` + `wait_flag_dev` (direct) | `TSTORE`/`TLOAD` on fixed workspace | half 32 KB | half 32 KB |
+| `pushpop` | `TPipe` TileData — sync + data-move in one call | built into `TPUSH`/`TPOP` | **float 64 KB** | half 32 KB |
+| `gm_pipe` | `TPipe` GlobalData — `TPUSH`/`TPOP` handle sync only | explicit `TALLOC`+`TSTORE`+`TPUSH` / `TPOP`+`TLOAD`+`TFREE` | half 32 KB | half 32 KB |
+
+Key differences:
+- **raw_flag**: programmer writes `ffts_cross_core_sync`/`wait_flag_dev` directly + manages workspace manually.
+- **pushpop**: one `TPUSH`/`TPOP` call handles both sync and data-move (`TPipe` calls `ffts_cross_core_sync`/`wait_flag_dev` internally). C2V slot stores `AccTile::DType=float` (no implicit fp32→fp16).
+- **gm_pipe**: `TPUSH`/`TPOP` with GlobalData overloads handle sync only (`TPipe` manages flags internally); data-move (`TSTORE`/`TLOAD`) is explicit between `TALLOC`+`TPUSH` and `TPOP`+`TFREE`. This allows `TSTORE(slot_half, c_l0)` to perform fp32→fp16 via hardware, matching raw_flag slot size.
+
+## Files
+
+| Subdirectory | Files |
+|---|---|
+| `raw_flag/` | `stream_c2v.cpp`, `stream_v2c.cpp`, `jit_util_stream.py`, `run_stream_c2v_v2c.py` |
+| `pushpop/` | same names |
+| `gm_pipe/` | same names (compiled with pto-isa-master headers) |
+
+## Reproduce
+
+```bash
+BASE=/workdir/pto-kernels-fork/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c
+
+python $BASE/raw_flag/run_stream_c2v_v2c.py
+python $BASE/pushpop/run_stream_c2v_v2c.py
+python $BASE/gm_pipe/run_stream_c2v_v2c.py
+
+NPU_DEVICE=npu:5 python $BASE/raw_flag/run_stream_c2v_v2c.py  # choose NPU
+```
+
+Each script runs a smoke check followed by a full bandwidth sweep over
+`num_iters ∈ {1, 2, 4, … 1024}` and prints the peak GB/s.
+
+## Results (910B2, 24 Cube cores)
+
+**stream_c2v** — `Cube L0C → workspace → Vec UB`:
+
+| Variant | Slot | Peak (GB/s) | at num_iters |
+|---------|------|-------------|--------------|
+| raw_flag | half 32 KB | 1152 | 1024 |
+| pushpop | **float 64 KB** | **2133–2194** | 1024 (2× slot → 2× bw) |
+| gm_pipe | half 32 KB | 1666 | 1024 |
+
+**stream_v2c** — `Vec UB → workspace → Cube L1`:
+
+| Variant | Slot | Peak (GB/s) | at num_iters |
+|---------|------|-------------|--------------|
+| raw_flag | half 32 KB | 1098 | 128 |
+| pushpop | half 32 KB | 1106–1128 | 512–1024 |
+| gm_pipe | half 32 KB | 1233 | 512 |
+
+Note: `pushpop` C2V uses a float32 slot (64 KB) so its bandwidth is naturally 2× the half-slot variants. For a like-for-like comparison, divide by 2 (~1067–1097 GB/s), which is comparable to raw_flag (1152 GB/s) and gm_pipe (1666 GB/s).
+
+Previously `pushpop/run_stream_c2v_v2c.py` crashed mid-benchmark because it
+reused the same `fifo_mem` across all calls, causing TPipe internal head/tail
+state to accumulate.  The fix: pre-allocate one fresh fifo per call (warmup +
+repeats) and use a different buffer each time.  `V2CPipe` was also changed from
+`TPipe<0>` to `TPipe<2>` to avoid FFTS flag collision with `C2VPipe = TPipe<0>`.
+
+**Sync optimization applied** (vs initial implementation with `pipe_barrier(PIPE_ALL)` everywhere):
+- TMATMUL → TSTORE (Cube): replaced `pipe_barrier(PIPE_ALL)` with `SetFlag<M,FIX>; WaitFlag<M,FIX>`
+- TLOAD+TLOAD → TADD (Vec): replaced with `SetFlag<MTE2,V>; WaitFlag<MTE2,V>`
+- TADD → TSTORE (Vec): replaced with `SetFlag<V,MTE3>; WaitFlag<V,MTE3>`
+- DMA → cross-core signal: `pipe_barrier(PIPE_ALL)` **kept** (required for memory visibility)
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/jit_util_stream.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/jit_util_stream.py
new file mode 100644
index 00000000..df5960a3
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/jit_util_stream.py
@@ -0,0 +1,144 @@
+"""JIT utilities for stream_c2v_v2c/gm_pipe kernels.
+
+The gm_pipe variant uses the GlobalData TPOP/TALLOC/TFREE APIs which are
+available in pto-isa-master but not in the default installed library.
+This utility compiles against /workdir/pto-isa-master/include.
+
+C2V:  TALLOC + TSTORE(slot_half, c_l0) + TPUSH  →  half slot (32 KB/core/slot)
+      explicit fp32→fp16 via hardware FIX unit, same slot size as raw_flag
+V2C:  TALLOC + TSTORE(slot_half, a_ub) + TPUSH  →  half slot (32 KB/core/slot)
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+# gm_pipe uses the newer pto-isa-master headers for GlobalData TPOP/TALLOC/TFREE.
+_PTO_NEW_INC = "/workdir/pto-isa-master/include"
+_DRIVER_INC  = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+TILE_SIZE  = 128
+FIFO_DEPTH = 2
+
+# Both C2V and V2C use half slots in gm_pipe.
+C2V_FIFO_ELEMS_PER_CORE = FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float16 elements
+V2C_FIFO_ELEMS_PER_CORE = FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float16 elements
+
+
+def _compile(cpp_basename: str, so_basename: str, verbose: bool = True) -> str:
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        # pto-isa-master FIRST (provides GlobalData TPOP/TALLOC/TFREE APIs)
+        f"-I{_PTO_NEW_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+    cpp = os.path.join(_HERE, cpp_basename)
+    so  = os.path.join(_HERE, so_basename)
+    cmd = ["bisheng", *flags, cpp, "-o", so]
+    if verbose:
+        print("Compiling (with pto-isa-master headers):", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {so}")
+    return so
+
+
+@lru_cache(maxsize=1)
+def load_stream_c2v(verbose: bool = True) -> "StreamC2VKernel":
+    so = _compile("stream_c2v.cpp", "stream_c2v.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call_stream_c2v.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # B
+        ctypes.c_void_p,  # fifo_mem  (float16, half slot)
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_c2v.restype = None
+    return StreamC2VKernel(lib, BLOCK_DIM)
+
+
+@lru_cache(maxsize=1)
+def load_stream_v2c(verbose: bool = True) -> "StreamV2CKernel":
+    so = _compile("stream_v2c.cpp", "stream_v2c.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call_stream_v2c.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # D
+        ctypes.c_void_p,  # fifo_mem  (float16, half slot)
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_v2c.restype = None
+    return StreamV2CKernel(lib, BLOCK_DIM)
+
+
+class StreamC2VKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor,
+                 fifo_mem: torch.Tensor, num_iters: int) -> None:
+        """A: [BLOCK_DIM*T, T] fp16; B: [T, T] fp16;
+        fifo_mem: [BLOCK_DIM * C2V_FIFO_ELEMS_PER_CORE] float16 (half slot)."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_c2v(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
+
+
+class StreamV2CKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, D: torch.Tensor,
+                 fifo_mem: torch.Tensor, num_iters: int) -> None:
+        """A, D: [num_iters*BLOCK_DIM*T, T] fp16;
+        fifo_mem: [BLOCK_DIM * V2C_FIFO_ELEMS_PER_CORE] fp16."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_v2c(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/run_stream_c2v_v2c.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/run_stream_c2v_v2c.py
new file mode 100644
index 00000000..f2a1b1aa
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/run_stream_c2v_v2c.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""
+Bandwidth benchmark for stream_c2v and stream_v2c kernels.
+
+Both kernels measure the sustained throughput of the Cube↔Vector workspace
+handshake path using `ffts_cross_core_sync` / `wait_flag_dev`.
+
+Effective bandwidth definition (same for both directions):
+  bw_eff = 2 × num_cores × T² × sizeof(fp16) × num_iters / time
+           ↑ workspace write + workspace read (round-trip)
+
+Usage:
+    python run_stream_c2v_v2c.py
+    NPU_DEVICE=npu:5 python run_stream_c2v_v2c.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util_stream import load_stream_c2v, load_stream_v2c, BLOCK_DIM  # noqa: E402
+
+TILE_SIZE  = 128
+DTYPE      = torch.float16
+KW         = dict(dtype=DTYPE, device=_DEVICE)
+
+WARMUP  = 5
+REPEATS = 20
+
+
+def workspace_roundtrip_bytes(num_iters: int) -> int:
+    """GM bytes transferred through workspace per kernel launch."""
+    return 2 * BLOCK_DIM * TILE_SIZE * TILE_SIZE * 2 * num_iters  # ×2: write + read
+
+
+def _time_kernel(fn, *args, num_iters: int) -> float:
+    """Return median duration in µs for one call of fn(*args)."""
+    start = torch.npu.Event(enable_timing=True)
+    end   = torch.npu.Event(enable_timing=True)
+    start.record()
+    for _ in range(REPEATS):
+        fn(*args, num_iters)
+    end.record()
+    end.synchronize()
+    return start.elapsed_time(end) / REPEATS * 1e3  # ms → µs
+
+
+# ── stream_c2v ────────────────────────────────────────────────────────────────
+
+def run_c2v(kernel) -> None:
+    print("=" * 60)
+    print("stream_c2v  (Cube L0C → workspace → Vec UB)")
+    print("=" * 60)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    A  = torch.randn(wave_rows, TILE_SIZE, **KW)
+    B  = torch.randn(TILE_SIZE, TILE_SIZE, **KW)
+    ws = torch.zeros(wave_rows, TILE_SIZE, **KW)
+
+    # Smoke check: run once with a few iterations, no crash = pass
+    kernel(A, B, ws, 4)
+    torch.npu.synchronize()
+    print(f"  smoke (num_iters=4): OK")
+    print()
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        for _ in range(WARMUP):
+            kernel(A, B, ws, num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, A, B, ws, num_iters=num_iters)
+        bw_gbs = workspace_roundtrip_bytes(num_iters) / dur_us * 1e-3
+
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+# ── stream_v2c ────────────────────────────────────────────────────────────────
+
+def run_v2c(kernel) -> None:
+    print("=" * 60)
+    print("stream_v2c  (Vec UB → workspace → Cube L1)")
+    print("=" * 60)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    ws = torch.zeros(wave_rows, TILE_SIZE, **KW)
+
+    # Smoke check
+    A_smoke = torch.randn(4 * wave_rows, TILE_SIZE, **KW)
+    D_smoke = torch.randn(4 * wave_rows, TILE_SIZE, **KW)
+    kernel(A_smoke, D_smoke, ws, 4)
+    torch.npu.synchronize()
+    print(f"  smoke (num_iters=4): OK")
+    print()
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        total_rows = num_iters * wave_rows
+        A = torch.randn(total_rows, TILE_SIZE, **KW)
+        D = torch.randn(total_rows, TILE_SIZE, **KW)
+
+        for _ in range(WARMUP):
+            kernel(A, D, ws, num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, A, D, ws, num_iters=num_iters)
+        bw_gbs = workspace_roundtrip_bytes(num_iters) / dur_us * 1e-3
+
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM (num Cube cores): {BLOCK_DIM}\n")
+
+    print("Compiling stream_c2v ...")
+    c2v = load_stream_c2v(verbose=True)
+    print()
+
+    print("Compiling stream_v2c ...")
+    v2c = load_stream_v2c(verbose=True)
+    print()
+
+    run_c2v(c2v)
+    run_v2c(v2c)
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_c2v.cpp
new file mode 100644
index 00000000..ef1c30ec
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_c2v.cpp
@@ -0,0 +1,218 @@
+// =============================================================================
+// stream_c2v.cpp — Cube→Vector bandwidth microbenchmark  [gm_pipe variant]
+//
+// Same algorithm as raw_flag/stream_c2v.cpp.
+//
+// ── API variant: GlobalData TALLOC + TSTORE + TPUSH / TPOP + TLOAD + TFREE ──
+//
+//  raw_flag equivalent          │  gm_pipe (this file)
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  TSTORE(ws_half, c_l0)        │  TALLOC<C2VPipe, SlotFull, UP_DOWN>(pipe, slot)
+//  pipe_barrier(PIPE_ALL)       │  TSTORE(slot, c_l0)   ← explicit fp32→fp16
+//  SetCrossFlag<FIX>(FLAG_C2V)  │  TPUSH<C2VPipe, SlotFull, UP_DOWN>(pipe, slot)
+//                               │    (TSTORE and TPUSH are both FIX-pipe; ordered,
+//                               │     no barrier needed between them)
+//  ─────────────────────────────┼─────────────────────────────────────────────
+//  WaitCrossFlag(FLAG_C2V)      │  TPOP<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)
+//  TLOAD(c_ub, ws_half)         │    └─ sync-only: assigns slot address to pop
+//  SetCrossFlag<MTE3>(FLAG_V2C) │  TLOAD(c_ub, pop)     ← explicit half load
+//                               │  TFREE<C2VPipe, PopHalf, UP_DOWN>(pipe, pop)
+//                               │    (TLOAD and TFREE are both MTE2-pipe; ordered)
+//
+// Key property:
+//   TSTORE(slot_half, c_l0) — GlobalTensor<half> ← TileAcc<float>:
+//   hardware FIX unit performs fp32→fp16.  SlotSize = T²×sizeof(half) = 32 KB.
+//   Same slot size and dtype as raw_flag; direct bandwidth comparison is valid.
+//
+// Slot type summary:
+//   raw_flag : half slot, 32 KB/slot  →  workspace half tensor
+//   pushpop  : float slot, 64 KB/slot
+//   gm_pipe  : half slot, 32 KB/slot  →  FIFO memory half tensor  ← this file
+//
+// NOTE: Uses `if constexpr` (not #if/#endif) for Cube/Vec branching, matching
+//   the style of the pto-isa unit tests.  This avoids overload-resolution
+//   issues in the GlobalData TPOP/TFREE template dispatch.
+//
+// Memory budget (per core):
+//   L1  (512 KB): b_l1 32 KB + a_l1 32 KB = 64 KB
+//   L0A ( 64 KB): a_l0 32 KB
+//   L0B ( 64 KB): b_l0 32 KB
+//   L0C (128 KB): c_l0 64 KB
+//   UB  (192 KB): c_ub 16 KB
+//   FIFO GM    : FIFO_DEPTH × SLOT_SIZE = 2 × 32 KB = 64 KB per core
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_B_OFFSET  = 0u;
+constexpr uint32_t L1_A_OFFSET  = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+constexpr uint32_t L0_OFFSET    = 0u;
+constexpr uint32_t UB_C_OFFSET  = 0u;
+
+constexpr uint32_t FIFO_DEPTH     = 2u;
+constexpr uint32_t C2V_SLOT_SIZE  = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB
+constexpr uint32_t C2V_FIFO_BYTES = FIFO_DEPTH * C2V_SLOT_SIZE;           // 64 KB/core
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+
+// Cube writes the full T×T slot; Vec reads its T/2-row subslot.
+using SlotFull = GlobalTensor<half, pto::Shape<1, 1, 1, TILE_SIZE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+using PopHalf  = GlobalTensor<half, pto::Shape<1, 1, 1, HALF_TILE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_stream_c2v(
+    __gm__ half    *A,
+    __gm__ half    *B,
+    __gm__ uint8_t *fifo_mem,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid = static_cast<int32_t>(get_block_idx());
+    set_ffts_base_addr(ffts_addr);
+
+    __gm__ void *core_fifo = fifo_mem + cid * C2V_FIFO_BYTES;
+    C2VPipe pipe(core_fifo, /*c2v_ub_base=*/0x0, /*v2c_l1_base=*/0x0);
+
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+
+    TileL0A a_l0;  TileL0B b_l0;  TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    TileVecUB c_ub;
+    TASSIGN(c_ub, UB_C_OFFSET);
+
+    // ── Cube: one-time GEMM, then push loop ───────────────────────────────────
+    if constexpr (DAV_CUBE) {
+        TileGlobal b_global(B);
+        TLOAD(b_l1, b_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(b_l0, b_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        TileGlobal a_global(A + cid * TILE_SIZE * TILE_SIZE);
+        TLOAD(a_l1, a_global);
+        SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+        TMOV(a_l0, a_l1);
+        SetFlag<PIPE_MTE1, PIPE_M>(0);
+        WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+        TMATMUL(c_l0, a_l0, b_l0);
+        SetFlag<PIPE_M, PIPE_FIX>(0);
+        WaitFlag<PIPE_M, PIPE_FIX>(0);   // M→FIX: c_l0 ready for TSTORE
+
+        SlotFull push_slot;
+        for (int32_t r = 0; r < num_iters; ++r) {
+            // ── gm_pipe replaces raw_flag: ───────────────────────────────────
+            // raw_flag: TSTORE(ws_half, c_l0)   (fp32→fp16 via FIX)
+            //           pipe_barrier(PIPE_ALL)
+            //           ffts_cross_core_sync(PIPE_FIX, FLAG_C2V)
+            // gm_pipe:
+            TALLOC<C2VPipe, SlotFull, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+            TSTORE(push_slot, c_l0);
+            //   GlobalTensor<half> ← TileAcc<float>: fp32→fp16 (hardware FIX unit)
+            pipe_barrier(PIPE_ALL);  // FIX: wait for DMA to complete before TPUSH signals Vec
+            TPUSH<C2VPipe, SlotFull, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+            //   Sync-only: emits data-ready signal (no internal TSTORE)
+        }
+    }
+
+    // ── Vec: pop half tiles from FIFO, discard (bandwidth test) ──────────────
+    if constexpr (DAV_VEC) {
+        set_mask_norm();
+        set_vector_mask(-1, -1);
+
+        PopHalf pop_slot;
+        for (int32_t r = 0; r < num_iters; ++r) {
+            // ── gm_pipe replaces raw_flag: ───────────────────────────────────
+            // raw_flag: wait_flag_dev(FLAG_C2V)
+            //           pipe_barrier(PIPE_ALL)
+            //           TLOAD(c_ub, ws_half)
+            //           pipe_barrier(PIPE_ALL)
+            //           ffts_cross_core_sync(PIPE_MTE3, FLAG_V2C)
+            // gm_pipe:
+            TPOP<C2VPipe, PopHalf, TileSplitAxis::TILE_UP_DOWN>(pipe, pop_slot);
+            //   Waits for data-ready; assigns this sub-block's T/2-row slice address.
+            TLOAD(c_ub, pop_slot);
+            //   Explicit TLOAD: TileVecUB<half> ← GlobalTensor<half>
+            pipe_barrier(PIPE_ALL);  // MTE2: wait for DMA to complete before freeing slot
+            //   TFREE fires from MTE2 pipe after TLOAD DMA completes.
+            TFREE<C2VPipe, PopHalf, TileSplitAxis::TILE_UP_DOWN>(pipe, pop_slot);
+            //   Emits free-space notification to Cube (conditional on SyncPeriod)
+        }
+    }
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void stream_c2v_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *B,
+    __gm__ uint8_t *fifo_mem, int32_t num_iters, uint64_t ffts_addr)
+{
+    run_stream_c2v(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        fifo_mem, num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_c2v(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *B,
+                                 uint8_t *fifo_mem, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_c2v_kernel<<<block_dim, nullptr, stream>>>(A, B, fifo_mem, num_iters, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_v2c.cpp
new file mode 100644
index 00000000..6817b457
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/gm_pipe/stream_v2c.cpp
@@ -0,0 +1,187 @@
+// =============================================================================
+// stream_v2c.cpp — Vector→Cube bandwidth microbenchmark  [gm_pipe variant]
+//
+// Same algorithm as raw_flag/stream_v2c.cpp.
+//
+// ── API variant: GlobalData TALLOC + TSTORE + TPUSH / TPOP + TLOAD + TFREE ──
+//
+//  raw_flag equivalent               │  gm_pipe (this file)
+//  ──────────────────────────────────┼─────────────────────────────────────────
+//  TSTORE(ws_half, a_ub)             │  TALLOC<V2CPipe, HalfSlot, UP_DOWN>(pipe, slot)
+//  SetCrossFlag<MTE3>(FLAG_V2C)      │  TSTORE(slot, a_ub)  ← explicit half store
+//                                    │  TPUSH<V2CPipe, HalfSlot, UP_DOWN>(pipe, slot)
+//                                    │    (TSTORE and TPUSH are both MTE3-pipe; ordered)
+//  ──────────────────────────────────┼─────────────────────────────────────────
+//  WaitCrossFlag(FLAG_V2C)           │  TPOP<V2CPipe, FullSlot, NO_SPLIT>(pipe, slot)
+//  TLOAD(ws_l1, ws_half)             │  TLOAD(ws_l1, slot)  ← explicit L1 load
+//  SetCrossFlag<MTE2>(FLAG_C2V)      │  TFREE<V2CPipe, FullSlot, NO_SPLIT>(pipe, slot)
+//                                    │    (TLOAD and TFREE are both MTE2-pipe; ordered)
+//
+// Vec TALLOC TILE_UP_DOWN: vid=0→slot_base+0, vid=1→slot_base+T/2×T×sizeof(half).
+// Cube TPOP TILE_NO_SPLIT: Cube always gets the full T×T slot address.
+// Slot size = T²×sizeof(half) = 32 KB — identical to raw_flag.
+//
+// Uses `if constexpr` for Cube/Vec branching (required for GlobalData dispatch).
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_WS_OFFSET = 0u;
+constexpr uint32_t UB_A_OFFSET  = 0u;
+constexpr uint32_t UB_B_OFFSET  = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+constexpr uint32_t FIFO_DEPTH     = 2u;
+constexpr uint32_t V2C_SLOT_SIZE  = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB
+constexpr uint32_t V2C_FIFO_BYTES = FIFO_DEPTH * V2C_SLOT_SIZE;           // 64 KB/core
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+// Use FlagID=2 (FFTS flags 2 and 3) to avoid collision with stream_c2v's
+// C2VPipe = TPipe<0, DIR_C2V> which occupies flags 0 (push) and 1 (free).
+using V2CPipe = TPipe<2, Direction::DIR_V2C, V2C_SLOT_SIZE, FIFO_DEPTH>;
+
+// Vec writes T/2 rows per sub-block (TILE_UP_DOWN).
+using HalfSlot = GlobalTensor<half, pto::Shape<1, 1, 1, HALF_TILE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+// Cube reads the full T×T slot (TILE_NO_SPLIT).
+using FullSlot = GlobalTensor<half, pto::Shape<1, 1, 1, TILE_SIZE, TILE_SIZE>,
+                               pto::Stride<1, 1, 1, TILE_SIZE, 1>>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_stream_v2c(
+    __gm__ half    *A,
+    __gm__ half    *D,
+    __gm__ uint8_t *fifo_mem,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    __gm__ void *core_fifo = fifo_mem + cid * V2C_FIFO_BYTES;
+    V2CPipe pipe(core_fifo, /*c2v_ub_base=*/0x0, /*v2c_l1_base=*/0x0);
+
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    TileL1 ws_l1;
+    TASSIGN(ws_l1, L1_WS_OFFSET);
+
+    // ── Cube: pop half tiles from FIFO, discard (bandwidth test) ─────────────
+    if constexpr (DAV_CUBE) {
+        FullSlot pop_slot;
+        for (int32_t r = 0; r < num_iters; ++r) {
+            TPOP<V2CPipe, FullSlot, TileSplitAxis::TILE_NO_SPLIT>(pipe, pop_slot);
+            //   Waits for data-ready; assigns the full T×T slot address.
+            TLOAD(ws_l1, pop_slot);
+            //   Explicit TLOAD: TileL1<half> ← GlobalTensor<half>
+            pipe_barrier(PIPE_ALL);  // MTE2: wait for DMA to complete before freeing slot
+            //   TFREE fires from MTE2 pipe after TLOAD DMA completes.
+            TFREE<V2CPipe, FullSlot, TileSplitAxis::TILE_NO_SPLIT>(pipe, pop_slot);
+            //   Emits free-space notification to Vec (conditional on SyncPeriod)
+        }
+    }
+
+    // ── Vec: load A+D, add, write to FIFO slot ────────────────────────────────
+    if constexpr (DAV_VEC) {
+        set_mask_norm();
+        set_vector_mask(-1, -1);
+
+        HalfSlot push_slot;
+        for (int32_t r = 0; r < num_iters; ++r) {
+            const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+            HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+            TLOAD(a_ub, a_global);
+
+            HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+            TLOAD(b_ub, d_global);
+
+            SetFlag<PIPE_MTE2, PIPE_V>(0);
+            WaitFlag<PIPE_MTE2, PIPE_V>(0);   // MTE2→V: both TLOADs done before TADD
+
+            TADD(a_ub, a_ub, b_ub);
+            SetFlag<PIPE_V, PIPE_MTE3>(0);
+            WaitFlag<PIPE_V, PIPE_MTE3>(0);   // V→MTE3: TADD done before TSTORE
+
+            // ── gm_pipe replaces raw_flag: ───────────────────────────────────
+            // raw_flag: (if r>0) wait_flag_dev(FLAG_C2V), TSTORE(ws_half, a_ub),
+            //           ffts_cross_core_sync(PIPE_MTE3, FLAG_V2C)
+            // gm_pipe:
+            TALLOC<V2CPipe, HalfSlot, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+            //   vid=0→slot_base+0, vid=1→slot_base+T/2×T×sizeof(half)
+            TSTORE(push_slot, a_ub);
+            //   Explicit TSTORE: GlobalTensor<half> ← TileVecUB<half> (same dtype)
+            pipe_barrier(PIPE_ALL);  // MTE3: wait for DMA to complete before TPUSH signals Cube
+            TPUSH<V2CPipe, HalfSlot, TileSplitAxis::TILE_UP_DOWN>(pipe, push_slot);
+            //   Sync-only: emits data-ready signal
+        }
+    }
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void stream_v2c_kernel(
+    __gm__ uint8_t *A, __gm__ uint8_t *D,
+    __gm__ uint8_t *fifo_mem, int32_t num_iters, uint64_t ffts_addr)
+{
+    run_stream_v2c(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(D),
+        fifo_mem, num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_v2c(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *D,
+                                 uint8_t *fifo_mem, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_v2c_kernel<<<block_dim, nullptr, stream>>>(A, D, fifo_mem, num_iters, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/jit_util_stream.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/jit_util_stream.py
new file mode 100644
index 00000000..4da51482
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/jit_util_stream.py
@@ -0,0 +1,138 @@
+"""JIT utilities for stream_c2v_v2c/pushpop kernels.
+
+C2V:  TPUSH AccTile<float> / TPOP VecTile<float>  →  float slot (64 KB/core/slot)
+V2C:  TPUSH VecTile<half>  / TPOP TileL1<half>    →  half  slot (32 KB/core/slot)
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+_PTO_INC = os.path.join(PTO_LIB_PATH, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+TILE_SIZE  = 128
+FIFO_DEPTH = 2
+
+# C2V uses float slots; V2C uses half slots.
+C2V_FIFO_ELEMS_PER_CORE = FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float32 elements
+V2C_FIFO_ELEMS_PER_CORE = FIFO_DEPTH * TILE_SIZE * TILE_SIZE   # float16 elements
+
+
+def _compile(cpp_basename: str, so_basename: str, verbose: bool = True) -> str:
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+    cpp = os.path.join(_HERE, cpp_basename)
+    so  = os.path.join(_HERE, so_basename)
+    cmd = ["bisheng", *flags, cpp, "-o", so]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {so}")
+    return so
+
+
+@lru_cache(maxsize=1)
+def load_stream_c2v(verbose: bool = True) -> "StreamC2VKernel":
+    so = _compile("stream_c2v.cpp", "stream_c2v.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call_stream_c2v.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # B
+        ctypes.c_void_p,  # fifo_mem  (float32)
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_c2v.restype = None
+    return StreamC2VKernel(lib, BLOCK_DIM)
+
+
+@lru_cache(maxsize=1)
+def load_stream_v2c(verbose: bool = True) -> "StreamV2CKernel":
+    so = _compile("stream_v2c.cpp", "stream_v2c.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(so))
+    lib.call_stream_v2c.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # D
+        ctypes.c_void_p,  # fifo_mem  (float16)
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_v2c.restype = None
+    return StreamV2CKernel(lib, BLOCK_DIM)
+
+
+class StreamC2VKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor,
+                 fifo_mem: torch.Tensor, num_iters: int) -> None:
+        """A: [BLOCK_DIM*T, T] fp16; B: [T, T] fp16;
+        fifo_mem: [BLOCK_DIM * C2V_FIFO_ELEMS_PER_CORE] float32."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_c2v(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
+
+
+class StreamV2CKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, D: torch.Tensor,
+                 fifo_mem: torch.Tensor, num_iters: int) -> None:
+        """A, D: [num_iters*BLOCK_DIM*T, T] fp16;
+        fifo_mem: [BLOCK_DIM * V2C_FIFO_ELEMS_PER_CORE] fp16."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_v2c(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(fifo_mem.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/run_stream_c2v_v2c.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/run_stream_c2v_v2c.py
new file mode 100644
index 00000000..6ce9c91f
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/run_stream_c2v_v2c.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+Bandwidth benchmark for stream_c2v_v2c/pushpop kernels.
+
+C2V variant: TPUSH AccTile<float> → float slot (64 KB/core/slot)
+V2C variant: TPUSH VecTile<half>  → half  slot (32 KB/core/slot)
+
+Effective bandwidth formula (matches raw_flag for comparison):
+  bw = 2 × num_cores × T² × SlotElementSize × num_iters / time
+  C2V: SlotElementSize = sizeof(float) = 4  (slot is 2× larger than raw_flag)
+  V2C: SlotElementSize = sizeof(half)  = 2  (same as raw_flag)
+
+Usage:
+    python run_stream_c2v_v2c.py
+    NPU_DEVICE=npu:5 python run_stream_c2v_v2c.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util_stream import (  # noqa: E402
+    load_stream_c2v, load_stream_v2c,
+    BLOCK_DIM, TILE_SIZE, FIFO_DEPTH,
+    C2V_FIFO_ELEMS_PER_CORE, V2C_FIFO_ELEMS_PER_CORE,
+)
+
+WARMUP  = 5
+REPEATS = 20
+
+
+def _time_kernel(fn, data_args: tuple, num_iters: int, fifos: list, offset: int) -> float:
+    """Time REPEATS calls, each using a fresh fifo from the pre-allocated pool."""
+    start = torch.npu.Event(enable_timing=True)
+    end   = torch.npu.Event(enable_timing=True)
+    start.record()
+    for i in range(REPEATS):
+        fn(*data_args, fifos[offset + i], num_iters)
+    end.record()
+    end.synchronize()
+    return start.elapsed_time(end) / REPEATS * 1e3  # ms → µs
+
+
+def run_c2v(kernel) -> None:
+    print("=" * 62)
+    print("stream_c2v  pushpop  (Cube TPUSH AccTile<float> → Vec TPOP)")
+    print("Slot: float32, 64 KB/core/slot  (2× raw_flag half slot)")
+    print("=" * 62)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    A  = torch.randn(wave_rows, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+    B  = torch.randn(TILE_SIZE, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+
+    # Smoke check with a fresh fifo
+    fifo_smoke = torch.zeros(BLOCK_DIM * C2V_FIFO_ELEMS_PER_CORE,
+                             dtype=torch.float32, device=_DEVICE)
+    kernel(A, B, fifo_smoke, 4)
+    torch.npu.synchronize()
+    print("  smoke (num_iters=4): OK\n")
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        # Pre-allocate one fresh fifo per call (warmup + repeats) so TPipe's
+        # internal head/tail state never accumulates across calls.
+        n_calls = WARMUP + REPEATS
+        fifos = [torch.zeros(BLOCK_DIM * C2V_FIFO_ELEMS_PER_CORE,
+                             dtype=torch.float32, device=_DEVICE)
+                 for _ in range(n_calls)]
+
+        for i in range(WARMUP):
+            kernel(A, B, fifos[i], num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, (A, B), num_iters=num_iters,
+                              fifos=fifos, offset=WARMUP)
+        # float slot: 4 bytes per element; ×2 for write+read round-trip
+        bw_gbs = 2 * BLOCK_DIM * TILE_SIZE * TILE_SIZE * 4 * num_iters / dur_us * 1e-3
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+def run_v2c(kernel) -> None:
+    print("=" * 62)
+    print("stream_v2c  pushpop  (Vec TPUSH VecTile<half> → Cube TPOP)")
+    print("Slot: float16, 32 KB/core/slot  (same as raw_flag)")
+    print("=" * 62)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+
+    # Smoke check
+    fifo_smoke = torch.zeros(BLOCK_DIM * V2C_FIFO_ELEMS_PER_CORE,
+                             dtype=torch.float16, device=_DEVICE)
+    A_smoke = torch.randn(4 * wave_rows, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+    D_smoke = torch.randn(4 * wave_rows, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+    kernel(A_smoke, D_smoke, fifo_smoke, 4)
+    torch.npu.synchronize()
+    print("  smoke (num_iters=4): OK\n")
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        total_rows = num_iters * wave_rows
+        A = torch.randn(total_rows, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+        D = torch.randn(total_rows, TILE_SIZE, dtype=torch.float16, device=_DEVICE)
+
+        n_calls = WARMUP + REPEATS
+        fifos = [torch.zeros(BLOCK_DIM * V2C_FIFO_ELEMS_PER_CORE,
+                             dtype=torch.float16, device=_DEVICE)
+                 for _ in range(n_calls)]
+
+        for i in range(WARMUP):
+            kernel(A, D, fifos[i], num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, (A, D), num_iters=num_iters,
+                              fifos=fifos, offset=WARMUP)
+        bw_gbs = 2 * BLOCK_DIM * TILE_SIZE * TILE_SIZE * 2 * num_iters / dur_us * 1e-3
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM={BLOCK_DIM}  FIFO_DEPTH={FIFO_DEPTH}\n")
+    print("Compiling stream_c2v ...")
+    c2v = load_stream_c2v(verbose=True)
+    print()
+    print("Compiling stream_v2c ...")
+    v2c = load_stream_v2c(verbose=True)
+    print()
+    run_c2v(c2v)
+    run_v2c(v2c)
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_c2v.cpp
new file mode 100644
index 00000000..b68c3adf
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_c2v.cpp
@@ -0,0 +1,211 @@
+// =============================================================================
+// stream_c2v.cpp — Cube→Vector bandwidth microbenchmark  [pushpop variant]
+//
+// Same algorithm as raw_flag/stream_c2v.cpp:
+//   • Cube runs one GEMM to fill c_l0, then spills it num_iters times.
+//   • Vec pops each tile from the FIFO and discards it (pure bandwidth test).
+//
+// ── API variant: TileData TPUSH / TPOP ──────────────────────────────────────
+//
+//  raw_flag equivalent          │  pushpop (this file)
+//  ─────────────────────────────┼────────────────────────────────────────────
+//  TSTORE(ws_half, c_l0)        │  TPUSH<C2VPipe, TileL0C, UP_DOWN>(pipe, c_l0)
+//  pipe_barrier(PIPE_ALL)       │    └─ internally: TSTORE(GlobalTensor<float>, c_l0)
+//  SetCrossFlag<FIX>(FLAG_C2V)  │       then data-ready signal
+//  ─────────────────────────────┼────────────────────────────────────────────
+//  WaitCrossFlag(FLAG_C2V)      │  TPOP<C2VPipe, VecTile<float>, UP_DOWN>(pipe, c_ub)
+//  TLOAD(c_ub, ws_half)         │    └─ internally: wait, TLOAD(GlobalTensor<float>, c_ub)
+//  SetCrossFlag<MTE3>(FLAG_V2C) │       then free-space notify
+//
+// Key difference from raw_flag:
+//   TPUSH stores AccTile::DType = float32 into the FIFO slot (no fp32→fp16).
+//   SlotSize = T²×sizeof(float) = 64 KB  (vs 32 KB half-slot in raw_flag).
+//   Vec receives a float VecTile.  TPipe manages double-buffering automatically.
+//
+// Slot type summary:
+//   raw_flag : half slot,  32 KB/slot  →  workspace half tensor
+//   pushpop  : float slot, 64 KB/slot  →  FIFO memory float tensor  ← this file
+//   gm_pipe  : half slot,  32 KB/slot  →  FIFO memory half tensor
+//
+// Memory budget (per core):
+//   L1  (512 KB): b_l1 32 KB + a_l1 32 KB = 64 KB  (initial GEMM setup)
+//   L0A ( 64 KB): a_l0 32 KB
+//   L0B ( 64 KB): b_l0 32 KB
+//   L0C (128 KB): c_l0 64 KB  (spilled every iteration, never overwritten)
+//   UB  (192 KB): c_ub_float — managed by TPOP via C2V_CONSUMER_BUF
+//                 Slot 0 at 0x00000 (32 KB), Slot 1 at 0x08000 (32 KB) = 64 KB used
+//   FIFO GM     : FIFO_DEPTH × SLOT_SIZE = 2 × 64 KB = 128 KB per core
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip L1/L0 buffer offsets (bytes) ──────────────────────────────────────
+constexpr uint32_t L1_B_OFFSET = 0u;                                      // B: 32 KB
+constexpr uint32_t L1_A_OFFSET = TILE_SIZE * TILE_SIZE * sizeof(half);    // A: 32 KB
+constexpr uint32_t L0_OFFSET   = 0u;                                      // shared origin
+
+// ── FIFO configuration ────────────────────────────────────────────────────────
+// Slot holds AccTile::DType = float32.  Two slots for double-buffered overlap.
+constexpr uint32_t FIFO_DEPTH      = 2u;
+constexpr uint32_t C2V_SLOT_SIZE   = TILE_SIZE * TILE_SIZE * sizeof(float); // 64 KB
+constexpr uint32_t C2V_FIFO_BYTES  = FIFO_DEPTH * C2V_SLOT_SIZE;            // 128 KB/core
+
+// UB base for Vec to receive TPOP'd C2V tiles (managed by TPipe internally).
+constexpr uint32_t C2V_UB_BASE     = 0x0;
+
+// ── Tile types ────────────────────────────────────────────────────────────────
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+using TileL0A   = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B   = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C   = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+// Vec receives float (matching AccTile::DType stored by TPUSH).
+using VecTileFloat = Tile<TileType::Vec, float, HALF_TILE, TILE_SIZE,
+                          BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                          SLayout::NoneBox, 512, PadValue::Null>;
+
+// ── FIFO pipe type ────────────────────────────────────────────────────────────
+// FlagID=0, DIR_C2V, slot=float64KB, depth=2.
+// TPUSH calls TSTORE(GlobalTensor<float>, c_l0) internally — no fp32→fp16.
+// TPOP  calls TLOAD(GlobalTensor<float>, c_ub) internally.
+using C2VPipe = TPipe<0, Direction::DIR_C2V, C2V_SLOT_SIZE, FIFO_DEPTH>;
+
+// ── Global tensor type for GEMM input ─────────────────────────────────────────
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// ── Intra-pipe sync helpers ────────────────────────────────────────────────────
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+// ─────────────────────────────────────────────────────────────────────────────
+AICORE void run_stream_c2v(
+    __gm__ half    *A,         // [num_cores × T, T]  initial GEMM input
+    __gm__ half    *B,         // [T, T]              weight matrix
+    __gm__ uint8_t *fifo_mem,  // [num_cores × C2V_FIFO_BYTES]  C2V FIFO buffer
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid = static_cast<int32_t>(get_block_idx());
+
+    set_ffts_base_addr(ffts_addr);
+
+    // Each core owns its own FIFO region to avoid inter-core aliasing.
+    __gm__ void *core_fifo = fifo_mem + cid * C2V_FIFO_BYTES;
+    C2VPipe pipe(core_fifo, /*c2v_ub_base=*/C2V_UB_BASE, /*v2c_l1_base=*/0x0);
+
+    // ── Allocate on-chip buffers ───────────────────────────────────────────────
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+
+    TileL0A a_l0;  TileL0B b_l0;  TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    // c_ub_float is assigned internally by TPOP (no manual TASSIGN needed).
+    VecTileFloat c_ub_float;
+
+    // ── Cube: one-time GEMM fills c_l0, then push loop ────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+    TileGlobal b_global(B);
+    TLOAD(b_l1, b_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(b_l0, b_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    TileGlobal a_global(A + cid * TILE_SIZE * TILE_SIZE);
+    TLOAD(a_l1, a_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(a_l0, a_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    TMATMUL(c_l0, a_l0, b_l0);
+    SetFlag<PIPE_M, PIPE_FIX>(0);
+    WaitFlag<PIPE_M, PIPE_FIX>(0);    // M→FIX: c_l0 ready before TPUSH reads it
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        // ── pushpop replaces raw_flag: ───────────────────────────────────────
+        // raw_flag: TSTORE(ws_half, c_l0)  (fp32→fp16)
+        //           pipe_barrier(PIPE_ALL)
+        //           ffts_cross_core_sync(PIPE_FIX, FLAG_C2V)
+        // pushpop:
+        TPUSH<C2VPipe, TileL0C, TileSplitAxis::TILE_UP_DOWN>(pipe, c_l0);
+        // └─ internally: TSTORE(GlobalTensor<float>, c_l0) + data-ready signal
+        //    Note: slot stores float32 (AccTile::DType), NOT half.
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: pop loop — receives float tiles, discards data ───────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        // ── pushpop replaces raw_flag: ───────────────────────────────────────
+        // raw_flag: wait_flag_dev(FLAG_C2V)
+        //           TLOAD(c_ub_half, ws_half)
+        //           ffts_cross_core_sync(PIPE_MTE3, FLAG_V2C)
+        // pushpop:
+        TPOP<C2VPipe, VecTileFloat, TileSplitAxis::TILE_UP_DOWN>(pipe, c_ub_float);
+        // └─ internally: wait, TLOAD(GlobalTensor<float>, c_ub_float),
+        //    free-space notify.  c_ub_float assigned to C2V_UB_BASE rotation.
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry point ─────────────────────────────────────────────────────────
+extern "C" __global__ AICORE void stream_c2v_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *B,
+    __gm__ uint8_t *fifo_mem,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    run_stream_c2v(
+        reinterpret_cast<__gm__ half    *>(A),
+        reinterpret_cast<__gm__ half    *>(B),
+        fifo_mem,
+        num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_c2v(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *B,
+                                 uint8_t *fifo_mem, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_c2v_kernel<<<block_dim, nullptr, stream>>>(A, B, fifo_mem, num_iters, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_v2c.cpp
new file mode 100644
index 00000000..d442be82
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/pushpop/stream_v2c.cpp
@@ -0,0 +1,195 @@
+// =============================================================================
+// stream_v2c.cpp — Vector→Cube bandwidth microbenchmark  [pushpop variant]
+//
+// Same algorithm as raw_flag/stream_v2c.cpp:
+//   Vec loads A+D from GM, adds them in UB, writes to FIFO; Cube pops into L1
+//   and discards the data (pure bandwidth test, no GEMM on Cube side).
+//
+// ── API variant: TileData TPUSH / TPOP ──────────────────────────────────────
+//
+//  raw_flag equivalent               │  pushpop (this file)
+//  ──────────────────────────────────┼────────────────────────────────────────
+//  TSTORE(ws_half, a_ub)             │  TPUSH<V2CPipe, VecTile, UP_DOWN>(pipe, a_ub)
+//  pipe_barrier(PIPE_ALL)            │    └─ internally: TSTORE(GlobalTensor<half>, a_ub)
+//  SetCrossFlag<MTE3>(FLAG_V2C)      │       then data-ready signal
+//  ──────────────────────────────────┼────────────────────────────────────────
+//  WaitCrossFlag(FLAG_V2C)           │  TPOP<V2CPipe, TileL1, UP_DOWN>(pipe, ws_l1)
+//  TLOAD(ws_l1, ws_half)             │    └─ internally: wait, TLOAD(GlobalTensor<half>, ws_l1)
+//  SetCrossFlag<MTE2>(FLAG_C2V)      │       then free-space notify
+//
+// Key difference from raw_flag:
+//   VecTile::DType = half — TPUSH stores half into the slot (same as raw_flag).
+//   SlotSize = T²×sizeof(half) = 32 KB (identical to raw_flag workspace size).
+//   TPipe manages double-buffering automatically; no explicit FLAG_C2V/FLAG_V2C.
+//
+// Memory budget (per core):
+//   L1 (512 KB): ws_l1 — managed by TPOP via V2C_CONSUMER_BUF
+//                Slot 0 at L1:0x00000 (32 KB), Slot 1 at L1:0x08000 (32 KB)
+//   UB (192 KB): a_ub 16 KB + b_ub 16 KB = 32 KB used
+//   FIFO GM    : FIFO_DEPTH × SLOT_SIZE = 2 × 32 KB = 64 KB per core
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+// ── On-chip UB buffer offsets (bytes) ─────────────────────────────────────────
+constexpr uint32_t UB_A_OFFSET = 0u;                                     // a_ub: 16 KB
+constexpr uint32_t UB_B_OFFSET = HALF_TILE * TILE_SIZE * sizeof(half);   // b_ub: 16 KB
+
+// ── FIFO configuration ────────────────────────────────────────────────────────
+// Slot holds VecTile::DType = half.  Two slots for double-buffered overlap.
+constexpr uint32_t FIFO_DEPTH      = 2u;
+constexpr uint32_t V2C_SLOT_SIZE   = TILE_SIZE * TILE_SIZE * sizeof(half); // 32 KB
+constexpr uint32_t V2C_FIFO_BYTES  = FIFO_DEPTH * V2C_SLOT_SIZE;           // 64 KB/core
+
+// L1 base for Cube to receive TPOP'd V2C tiles (managed by TPipe internally).
+constexpr uint32_t V2C_L1_BASE     = 0x0;
+
+// ── Tile types ────────────────────────────────────────────────────────────────
+// L1 consumer tile: Cube pops the full T×T half tile from the FIFO.
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+// Vec UB tiles: half, each sub-block owns HALF_TILE rows.
+using TileVecUB = Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+                       BLayout::RowMajor, HALF_TILE, TILE_SIZE,
+                       SLayout::NoneBox, 512, PadValue::Null>;
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+// ── FIFO pipe type ────────────────────────────────────────────────────────────
+// FlagID=0, DIR_V2C, slot=half32KB, depth=2.
+// TPUSH calls TSTORE(GlobalTensor<half>, a_ub) — same dtype, no conversion.
+// TPOP  calls TLOAD(GlobalTensor<half>, ws_l1) — loads half into L1.
+// Use FlagID=2 (FFTS flags 2 and 3) to avoid collision with stream_c2v's
+// C2VPipe = TPipe<0, DIR_C2V> which occupies flags 0 (push) and 1 (free).
+using V2CPipe = TPipe<2, Direction::DIR_V2C, V2C_SLOT_SIZE, FIFO_DEPTH>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+// ─────────────────────────────────────────────────────────────────────────────
+AICORE void run_stream_v2c(
+    __gm__ half    *A,         // [num_iters × num_cores × T, T]  Vec input 1
+    __gm__ half    *D,         // [num_iters × num_cores × T, T]  Vec input 2
+    __gm__ uint8_t *fifo_mem,  // [num_cores × V2C_FIFO_BYTES]  V2C FIFO buffer
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    // Each core owns its own FIFO region.
+    __gm__ void *core_fifo = fifo_mem + cid * V2C_FIFO_BYTES;
+    V2CPipe pipe(core_fifo, /*c2v_ub_base=*/0x0, /*v2c_l1_base=*/V2C_L1_BASE);
+
+    // ── Allocate UB buffers ────────────────────────────────────────────────────
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    // ws_l1 is assigned internally by TPOP (no manual TASSIGN needed).
+    TileL1 ws_l1;
+
+    // ── Cube: pop loop — receives half tiles from FIFO, discards data ─────────
+#if defined(__DAV_C220_CUBE__)
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        // ── pushpop replaces raw_flag: ───────────────────────────────────────
+        // raw_flag: wait_flag_dev(FLAG_V2C)
+        //           TLOAD(ws_l1, ws_half)
+        //           ffts_cross_core_sync(PIPE_MTE2, FLAG_C2V)  ← after TLOAD
+        // pushpop:
+        TPOP<V2CPipe, TileL1, TileSplitAxis::TILE_UP_DOWN>(pipe, ws_l1);
+        // └─ internally: wait, TLOAD(GlobalTensor<half>, ws_l1), free-space notify.
+        //    ws_l1 assigned to V2C_L1_BASE rotation (slot 0 or 1).
+        //    Rotating L1 addresses avoid read-after-write: no barrier needed here.
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: load A+D, add, push to FIFO ──────────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+        TLOAD(a_ub, a_global);
+
+        HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+        TLOAD(b_ub, d_global);
+
+        SetFlag<PIPE_MTE2, PIPE_V>(0);
+        WaitFlag<PIPE_MTE2, PIPE_V>(0);   // MTE2→V: both TLOADs done before TADD
+
+        TADD(a_ub, a_ub, b_ub);
+        SetFlag<PIPE_V, PIPE_MTE3>(0);
+        WaitFlag<PIPE_V, PIPE_MTE3>(0);   // V→MTE3: TADD done before TPUSH writes to GM
+
+        // ── pushpop replaces raw_flag: ───────────────────────────────────────
+        // raw_flag: (if r>0) wait_flag_dev(FLAG_C2V)
+        //           pipe_barrier(PIPE_ALL)
+        //           TSTORE(ws_half, a_ub)
+        //           pipe_barrier(PIPE_ALL)
+        //           ffts_cross_core_sync(PIPE_MTE3, FLAG_V2C)
+        // pushpop:
+        TPUSH<V2CPipe, TileVecUB, TileSplitAxis::TILE_UP_DOWN>(pipe, a_ub);
+        // └─ internally: (if needed) wait for free space, TSTORE(GlobalTensor<half>, a_ub),
+        //    then data-ready signal.  Double-buffer back-pressure is automatic.
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+extern "C" __global__ AICORE void stream_v2c_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *D,
+    __gm__ uint8_t *fifo_mem,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    run_stream_v2c(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(D),
+        fifo_mem,
+        num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_v2c(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *D,
+                                 uint8_t *fifo_mem, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_v2c_kernel<<<block_dim, nullptr, stream>>>(A, D, fifo_mem, num_iters, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/README.md b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/README.md
new file mode 100644
index 00000000..78d0082a
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/README.md
@@ -0,0 +1,67 @@
+# stream_c2v_v2c — Cube↔Vector bandwidth microbenchmarks
+
+Measures the **internal** throughput of the Cube↔Vector workspace handshake
+(`ffts_cross_core_sync` / `wait_flag_dev`).  Unlike the `matmul_add` kernels,
+which measure external HBM traffic (large input tensors), these kernels loop
+over the **same small workspace** many times and count only the round-trip
+workspace bytes.
+
+## Kernels
+
+| Kernel | Path | Cube work | Vec work |
+|--------|------|-----------|----------|
+| `stream_c2v` | Cube L0C → workspace → Vec UB | Initial GEMM fills L0C once; spills it every iter | Load workspace slice into UB |
+| `stream_v2c` | Vec UB → workspace → Cube L1  | Load workspace into L1, discard | Load A+D, add, write to workspace |
+
+Note: Removing the GEMM has negligible effect on throughput because the M pipe was never on the critical path.
+
+**Effective bandwidth** (both kernels use the same formula):
+```
+bw_eff = 2 × num_cores × T² × sizeof(fp16) × num_iters / time
+         ↑ workspace write + workspace read (round-trip)
+```
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `stream_c2v.cpp` | C2V kernel — runtime `num_iters` arg |
+| `stream_v2c.cpp` | V2C kernel — runtime `num_iters` arg, no GEMM on Cube side |
+| `jit_util_stream.py` | JIT compile + ctypes loaders for both |
+| `run_stream_c2v_v2c.py` | Smoke check + bandwidth sweep |
+
+## Reproduce
+
+```bash
+cd /workdir/pto-kernels-fork/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c
+
+python run_stream_c2v_v2c.py
+
+# Choose a different NPU (default: npu:7)
+NPU_DEVICE=npu:5 python run_stream_c2v_v2c.py
+```
+
+## Expected output (910B2, 24 Cube cores)
+
+```
+stream_c2v  (Cube L0C → workspace → Vec UB)
+ num_iters      dur_us     bw_GB/s
+        32       53.75        936.4
+        64       95.55       1053.6
+       256      355.84       1131.6
+      1024     1395.43       1154.2
+Peak: 1154.2 GB/s
+
+stream_v2c  (Vec UB → workspace → Cube L1)  [no Cube GEMM]
+ num_iters      dur_us     bw_GB/s
+        32       53.56        939.7
+        64       94.63       1063.8
+       128      182.56       1102.8
+      1024     1467.76       1097.3
+Peak: 1102.8 GB/s
+```
+
+C2V peak ~1154 GB/s (77% of HBM roofline); V2C peak ~1103 GB/s (74%).
+V2C is slightly lower because Vec must also load A and D from HBM before it can
+write to workspace — this external HBM traffic sits on the critical path even
+though it is not counted in the bandwidth formula.
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/jit_util_stream.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/jit_util_stream.py
new file mode 100644
index 00000000..b9395a55
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/jit_util_stream.py
@@ -0,0 +1,130 @@
+"""JIT compile + load utilities for stream_c2v and stream_v2c kernels."""
+from __future__ import annotations
+
+import ctypes
+import os
+import subprocess
+from functools import lru_cache
+
+import torch
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+
+ASCEND_TOOLKIT_HOME = os.environ.get("ASCEND_TOOLKIT_HOME") or os.environ.get(
+    "ASCEND_HOME_PATH", ""
+)
+if not ASCEND_TOOLKIT_HOME:
+    raise RuntimeError("Set ASCEND_TOOLKIT_HOME or ASCEND_HOME_PATH")
+
+PTO_LIB_PATH = os.environ.get("PTO_LIB_PATH", ASCEND_TOOLKIT_HOME)
+_PTO_INC = os.path.join(PTO_LIB_PATH, "include")
+_DRIVER_INC = "/usr/local/Ascend/driver/kernel/inc"
+
+_NPU_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+try:
+    BLOCK_DIM = int(
+        getattr(torch.npu.get_device_properties(_NPU_DEVICE), "cube_core_num", 20)
+    )
+except (RuntimeError, AssertionError):
+    BLOCK_DIM = 24
+
+
+def _compile(cpp_basename: str, so_basename: str, verbose: bool = True) -> str:
+    cpp_path = os.path.join(_HERE, cpp_basename)
+    lib_path = os.path.join(_HERE, so_basename)
+    flags = [
+        "-fPIC", "-shared", "-xcce", "-DMEMORY_BASE", "-O2", "-std=gnu++17",
+        "--cce-aicore-arch=dav-c220",
+        "-mllvm", "-cce-aicore-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-function-stack-size=0x8000",
+        "-mllvm", "-cce-aicore-record-overflow=true",
+        "-mllvm", "-cce-aicore-dcci-insert-for-scalar=false",
+        "-Wno-macro-redefined", "-Wno-ignored-attributes",
+        f"-I{_PTO_INC}",
+        f"-I{ASCEND_TOOLKIT_HOME}/include",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/runtime",
+        f"-I{ASCEND_TOOLKIT_HOME}/pkg_inc/profiling",
+    ]
+    if os.path.isdir(_DRIVER_INC):
+        flags.append(f"-I{_DRIVER_INC}")
+    cmd = ["bisheng", *flags, cpp_path, "-o", lib_path]
+    if verbose:
+        print("Compiling:", " ".join(cmd))
+    subprocess.run(cmd, check=True, timeout=300)
+    if verbose:
+        print(f"Compiled → {lib_path}")
+    return lib_path
+
+
+@lru_cache(maxsize=1)
+def load_stream_c2v(verbose: bool = True) -> "StreamC2VKernel":
+    lib_path = _compile("stream_c2v.cpp", "stream_c2v.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(lib_path))
+    # void call_stream_c2v(uint32_t block_dim, void *stream,
+    #                       uint8_t *A, uint8_t *B, uint8_t *workspace, int32_t num_iters)
+    lib.call_stream_c2v.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # B
+        ctypes.c_void_p,  # workspace
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_c2v.restype = None
+    return StreamC2VKernel(lib, BLOCK_DIM)
+
+
+@lru_cache(maxsize=1)
+def load_stream_v2c(verbose: bool = True) -> "StreamV2CKernel":
+    lib_path = _compile("stream_v2c.cpp", "stream_v2c.so", verbose=verbose)
+    lib = ctypes.CDLL(os.path.abspath(lib_path))
+    # void call_stream_v2c(uint32_t block_dim, void *stream,
+    #                       uint8_t *A, uint8_t *D,
+    #                       uint8_t *workspace, int32_t num_iters)
+    lib.call_stream_v2c.argtypes = [
+        ctypes.c_uint32,  # block_dim
+        ctypes.c_void_p,  # stream
+        ctypes.c_void_p,  # A
+        ctypes.c_void_p,  # D
+        ctypes.c_void_p,  # workspace
+        ctypes.c_int32,   # num_iters
+    ]
+    lib.call_stream_v2c.restype = None
+    return StreamV2CKernel(lib, BLOCK_DIM)
+
+
+class StreamC2VKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, B: torch.Tensor,
+                 workspace: torch.Tensor, num_iters: int) -> None:
+        """A: [num_cores*T, T], B: [T, T], workspace: [num_cores*T, T]."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_c2v(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(B.data_ptr()),
+            ctypes.c_void_p(workspace.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
+
+
+class StreamV2CKernel:
+    def __init__(self, lib: ctypes.CDLL, block_dim: int) -> None:
+        self._lib = lib
+        self._block_dim = block_dim
+
+    def __call__(self, A: torch.Tensor, D: torch.Tensor,
+                 workspace: torch.Tensor, num_iters: int) -> None:
+        """A, D: [num_iters*num_cores*T, T], workspace: [num_cores*T, T]."""
+        stream_ptr = ctypes.c_void_p(torch.npu.current_stream().npu_stream)
+        self._lib.call_stream_v2c(
+            self._block_dim, stream_ptr,
+            ctypes.c_void_p(A.data_ptr()),
+            ctypes.c_void_p(D.data_ptr()),
+            ctypes.c_void_p(workspace.data_ptr()),
+            ctypes.c_int32(num_iters),
+        )
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/run_stream_c2v_v2c.py b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/run_stream_c2v_v2c.py
new file mode 100644
index 00000000..6c0029c4
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/run_stream_c2v_v2c.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""
+Bandwidth benchmark for stream_c2v and stream_v2c kernels.
+
+Both kernels measure the sustained throughput of the Cube↔Vector workspace
+handshake path using `ffts_cross_core_sync` / `wait_flag_dev`.
+
+Effective bandwidth definition (same for both directions):
+  bw_eff = 2 × num_cores × T² × sizeof(fp16) × num_iters / time
+           ↑ workspace write + workspace read (round-trip)
+
+Usage:
+    python run_stream_c2v_v2c.py
+    NPU_DEVICE=npu:5 python run_stream_c2v_v2c.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+import torch_npu  # noqa: F401
+
+_DEVICE = os.environ.get("NPU_DEVICE", "npu:7")
+torch.npu.set_device(_DEVICE)
+print(f"Using device: {_DEVICE}")
+os.environ["NPU_DEVICE"] = _DEVICE
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from jit_util_stream import load_stream_c2v, load_stream_v2c, BLOCK_DIM  # noqa: E402
+
+TILE_SIZE  = 128
+DTYPE      = torch.float16
+KW         = dict(dtype=DTYPE, device=_DEVICE)
+
+WARMUP  = 5
+REPEATS = 20
+
+
+def workspace_roundtrip_bytes(num_iters: int) -> int:
+    """GM bytes transferred through workspace per kernel launch."""
+    return 2 * BLOCK_DIM * TILE_SIZE * TILE_SIZE * 2 * num_iters  # ×2: write + read
+
+
+def _time_kernel(fn, *args, num_iters: int) -> float:
+    """Return median duration in µs for one call of fn(*args)."""
+    start = torch.npu.Event(enable_timing=True)
+    end   = torch.npu.Event(enable_timing=True)
+    start.record()
+    for _ in range(REPEATS):
+        fn(*args, num_iters)
+    end.record()
+    end.synchronize()
+    return start.elapsed_time(end) / REPEATS * 1e3  # ms → µs
+
+
+# ── stream_c2v ────────────────────────────────────────────────────────────────
+
+def run_c2v(kernel) -> None:
+    print("=" * 60)
+    print("stream_c2v  (Cube L0C → workspace → Vec UB)")
+    print("=" * 60)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    A  = torch.randn(wave_rows, TILE_SIZE, **KW)
+    B  = torch.randn(TILE_SIZE, TILE_SIZE, **KW)
+    ws = torch.empty(wave_rows, TILE_SIZE, **KW)
+
+    # Smoke check: run once with a few iterations, no crash = pass
+    kernel(A, B, ws, 4)
+    torch.npu.synchronize()
+    print(f"  smoke (num_iters=4): OK")
+    print()
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        for _ in range(WARMUP):
+            kernel(A, B, ws, num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, A, B, ws, num_iters=num_iters)
+        bw_gbs = workspace_roundtrip_bytes(num_iters) / dur_us * 1e-3
+
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+# ── stream_v2c ────────────────────────────────────────────────────────────────
+
+def run_v2c(kernel) -> None:
+    print("=" * 60)
+    print("stream_v2c  (Vec UB → workspace → Cube L1)")
+    print("=" * 60)
+    header = f"{'num_iters':>10}  {'dur_us':>10}  {'bw_GB/s':>10}"
+    print(header)
+    print("-" * len(header))
+
+    wave_rows = BLOCK_DIM * TILE_SIZE
+    ws = torch.empty(wave_rows, TILE_SIZE, **KW)
+
+    # Smoke check
+    A_smoke = torch.randn(4 * wave_rows, TILE_SIZE, **KW)
+    D_smoke = torch.randn(4 * wave_rows, TILE_SIZE, **KW)
+    kernel(A_smoke, D_smoke, ws, 4)
+    torch.npu.synchronize()
+    print(f"  smoke (num_iters=4): OK")
+    print()
+
+    records = []
+    for num_iters in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
+        total_rows = num_iters * wave_rows
+        A = torch.randn(total_rows, TILE_SIZE, **KW)
+        D = torch.randn(total_rows, TILE_SIZE, **KW)
+
+        for _ in range(WARMUP):
+            kernel(A, D, ws, num_iters)
+        torch.npu.synchronize()
+
+        dur_us = _time_kernel(kernel, A, D, ws, num_iters=num_iters)
+        bw_gbs = workspace_roundtrip_bytes(num_iters) / dur_us * 1e-3
+
+        print(f"{num_iters:>10d}  {dur_us:>10.2f}  {bw_gbs:>10.1f}")
+        records.append((num_iters, dur_us, bw_gbs))
+
+    peak_bw = max(r[2] for r in records)
+    peak_ni = max(records, key=lambda r: r[2])[0]
+    print(f"\nPeak: {peak_bw:.1f} GB/s at num_iters={peak_ni}  "
+          f"(910B2 HBM roofline ≈ 1500 GB/s)\n")
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    print(f"BLOCK_DIM (num Cube cores): {BLOCK_DIM}\n")
+
+    print("Compiling stream_c2v ...")
+    c2v = load_stream_c2v(verbose=True)
+    print()
+
+    print("Compiling stream_v2c ...")
+    v2c = load_stream_v2c(verbose=True)
+    print()
+
+    run_c2v(c2v)
+    run_v2c(v2c)
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_c2v.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_c2v.cpp
new file mode 100644
index 00000000..979fa177
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_c2v.cpp
@@ -0,0 +1,207 @@
+// =============================================================================
+// stream_c2v.cpp — Cube→Vector bandwidth microbenchmark
+//
+// Measures the sustained throughput of the C2V path:
+//   Cube L0C  →  GM workspace  →  Vector UB
+//
+// Setup (once, outside the timed loop):
+//   Cube loads A[cid*T:] → L1 → L0A and B → L1 → L0B, then GEMM to fill c_l0.
+//
+// Inner loop (num_iters times, this is the timed section):
+//   Cube:  if r > 0: WaitCrossFlag(FLAG_V2C)
+//          TSTORE c_l0 → workspace[cid*T:]   (FIX pipe, fp32 → fp16)
+//          SetCrossFlag<PIPE_FIX>(FLAG_C2V)   → both Vec sub-blocks
+//   Vec:   WaitCrossFlag(FLAG_C2V)
+//          TLOAD workspace[ws_row:] → c_ub    (MTE2 pipe)
+//          SetCrossFlag<PIPE_MTE3>(FLAG_V2C)  → Cube
+//
+// c_l0 is filled once by the GEMM and re-spilled every iteration unchanged.
+// There is no global C write — the only GM traffic is the workspace round-trip.
+//
+// Effective bandwidth per iteration (per core):
+//   write  TILE_SIZE × TILE_SIZE × sizeof(fp16)   (Cube  → workspace)
+//   read   TILE_SIZE × TILE_SIZE × sizeof(fp16)   (Vec   ← workspace)
+//   total  2 × T² × 2  bytes  ×  num_cores  across all cores
+//
+// Input tensors:
+//   A         [num_cores * T, T]  fp16   (initial GEMM input, read once)
+//   B         [T, T]              fp16   (weight, read once)
+//   workspace [num_cores * T, T]  fp16   (C2V ping-pong buffer)
+//   num_iters int32               runtime loop count
+//
+// Memory budget (per core):
+//   L1 (512 KB): b_l1 32 KB + a_l1 32 KB = 64 KB used
+//   L0A (64 KB): a_l0  32 KB
+//   L0B (64 KB): b_l0  32 KB
+//   L0C (128 KB): c_l0 64 KB   (spilled every iteration, never overwritten)
+//   UB  (192 KB): c_ub 16 KB   (workspace result, discarded after load)
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_B_OFFSET  = 0u;
+constexpr uint32_t L1_A_OFFSET  = TILE_SIZE * TILE_SIZE * sizeof(half);  // 32 KB
+constexpr uint32_t L0_OFFSET    = 0u;
+constexpr uint32_t UB_C_OFFSET  = 0u;
+
+constexpr int32_t FLAG_C2V = 0;  // Cube → Vec: workspace tile written
+constexpr int32_t FLAG_V2C = 1;  // Vec → Cube: workspace tile consumed
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+using TileL0A = TileLeft<half,  TILE_SIZE, TILE_SIZE>;
+using TileL0B = TileRight<half, TILE_SIZE, TILE_SIZE>;
+using TileL0C = TileAcc<float,  TILE_SIZE, TILE_SIZE>;
+
+using TileVecUB =
+    Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+         BLayout::RowMajor, HALF_TILE, TILE_SIZE, SLayout::NoneBox, 512, PadValue::Null>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Pipe>
+AICORE inline void SetCrossFlag(int32_t flag) {
+    ffts_cross_core_sync(Pipe, 1 | (VEC_NUM << 4) | (flag << 8));
+}
+AICORE inline void WaitCrossFlag(int32_t flag) { wait_flag_dev(flag); }
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_stream_c2v(
+    __gm__ half  *A,          // [num_cores * T, T]  initial GEMM input (A[cid*T:])
+    __gm__ half  *B,          // [T, T]              weight matrix
+    __gm__ half  *workspace,  // [num_cores * T, T]  C2V ping-pong buffer
+    int32_t       num_iters,  // inner loop count (runtime)
+    uint64_t      ffts_addr)
+{
+    const int32_t cid = static_cast<int32_t>(get_block_idx());
+    const int32_t vid = static_cast<int32_t>(get_subblockid());
+
+    set_ffts_base_addr(ffts_addr);
+
+    TileL1  b_l1, a_l1;
+    TASSIGN(b_l1, L1_B_OFFSET);
+    TASSIGN(a_l1, L1_A_OFFSET);
+
+    TileL0A a_l0;  TileL0B b_l0;  TileL0C c_l0;
+    TASSIGN(a_l0, L0_OFFSET);
+    TASSIGN(b_l0, L0_OFFSET);
+    TASSIGN(c_l0, L0_OFFSET);
+
+    TileVecUB c_ub;
+    TASSIGN(c_ub, UB_C_OFFSET);
+
+    // ── Cube: one-time setup — GEMM fills c_l0 ────────────────────────────────
+#if defined(__DAV_C220_CUBE__)
+
+    TileGlobal b_global(B);
+    TLOAD(b_l1, b_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(b_l0, b_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    TileGlobal a_global(A + cid * TILE_SIZE * TILE_SIZE);
+    TLOAD(a_l1, a_global);
+    SetFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    WaitFlag<PIPE_MTE2, PIPE_MTE1>(0);
+    TMOV(a_l0, a_l1);
+    SetFlag<PIPE_MTE1, PIPE_M>(0);
+    WaitFlag<PIPE_MTE1, PIPE_M>(0);
+
+    // c_l0 is filled here and re-spilled every iteration without recomputing.
+    TMATMUL(c_l0, a_l0, b_l0);
+    SetFlag<PIPE_M, PIPE_FIX>(0);
+    WaitFlag<PIPE_M, PIPE_FIX>(0);    // M→FIX: c_l0 ready for TSTORE
+
+    // ── Cube: inner bandwidth loop ─────────────────────────────────────────────
+    TileGlobal ws_out(workspace + cid * TILE_SIZE * TILE_SIZE);
+    for (int32_t r = 0; r < num_iters; ++r) {
+        // Wait for Vec to finish reading workspace before overwriting it.
+        // (Skip round 0: Vec hasn't touched the slot yet.)
+        if (r > 0) {
+            WaitCrossFlag(FLAG_V2C);
+            // No local-pipe barrier needed: after cross-core wait returns,
+            // no pending FIX-pipe work from the previous iteration remains.
+        }
+        TSTORE(ws_out, c_l0);           // L0C → workspace  (FIX pipe, fp32 → fp16)
+        pipe_barrier(PIPE_ALL);         // FIX: wait for DMA to complete before signaling Vec
+        SetCrossFlag<PIPE_FIX>(FLAG_C2V);  // signal Vec: workspace tile is ready
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: inner bandwidth loop ──────────────────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    const int32_t ws_row = cid * TILE_SIZE + vid * HALF_TILE;
+    HalfTileGlobal ws_in(workspace + ws_row * TILE_SIZE);
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        WaitCrossFlag(FLAG_C2V);         // workspace tile is ready
+        pipe_barrier(PIPE_ALL);         // ensure all local pipes flushed before TLOAD
+        TLOAD(c_ub, ws_in);              // workspace → UB  (MTE2 pipe)
+        SetFlag<PIPE_MTE2, PIPE_MTE3>(0);
+        WaitFlag<PIPE_MTE2, PIPE_MTE3>(0);  // MTE2→MTE3: wait for DMA before signal
+        SetCrossFlag<PIPE_MTE3>(FLAG_V2C);  // signal Cube: workspace slot freed
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry point ─────────────────────────────────────────────────────────
+extern "C" __global__ AICORE void stream_c2v_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *B,
+    __gm__ uint8_t *workspace,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    run_stream_c2v(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(B),
+        reinterpret_cast<__gm__ half *>(workspace),
+        num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_c2v(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *B,
+                                 uint8_t *workspace, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_c2v_kernel<<<block_dim, nullptr, stream>>>(A, B, workspace, num_iters, ffts_addr);
+}
diff --git a/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_v2c.cpp b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_v2c.cpp
new file mode 100644
index 00000000..79648e1c
--- /dev/null
+++ b/examples/jit_cpp/cross_core_sync_demo/stream_c2v_v2c/raw_flag/stream_v2c.cpp
@@ -0,0 +1,201 @@
+// =============================================================================
+// stream_v2c.cpp — Vector→Cube bandwidth microbenchmark
+//
+// Measures the sustained throughput of the V2C path:
+//   Vector UB  →  GM workspace  →  Cube L1
+//
+// Unlike the TileLang reference (which required a GEMM on the Cube side for
+// compiler reasons), this PTO C++ version strips the Cube work down to the
+// bare minimum: load workspace into L1, then immediately free the slot.
+//
+// Inner loop (num_iters times, this is the timed section):
+//   Vec:   TLOAD A[row_v:] → a_ub, TLOAD D[row_v:] → b_ub
+//          TADD a_ub = a_ub + b_ub
+//          if r > 0: WaitCrossFlag(FLAG_C2V)
+//          TSTORE a_ub → workspace[ws_row:]     (MTE3 pipe)
+//          SetCrossFlag<PIPE_MTE3>(FLAG_V2C)     → Cube
+//   Cube:  WaitCrossFlag(FLAG_V2C)
+//          TLOAD workspace[cid*T:] → ws_l1      (MTE2 pipe, data discarded)
+//          SetCrossFlag<PIPE_MTE2>(FLAG_C2V)     → Vec  (fires after TLOAD)
+//
+// The SetCrossFlag<PIPE_MTE2> fires in the MTE2 instruction stream right after
+// the TLOAD, so it signals Vec the moment the workspace slot is captured in L1.
+// Vec can then write fresh data while Cube is already done with the slot.
+//
+// Effective bandwidth (same definition as stream_c2v for a fair comparison):
+//   write  num_cores × T² × sizeof(fp16)   (Vec  → workspace)
+//   read   num_cores × T² × sizeof(fp16)   (Cube ← workspace)
+//   total  2 × num_cores × T² × 2  bytes  per iteration
+//
+// Input tensors:
+//   A         [num_iters * num_cores * T, T]  fp16   (Vec input 1)
+//   D         [num_iters * num_cores * T, T]  fp16   (Vec input 2)
+//   workspace [num_cores * T, T]              fp16   (V2C ping-pong buffer)
+//   num_iters int32                           runtime loop count
+//
+// Memory budget (per core):
+//   L1  (512 KB): ws_l1 32 KB  (workspace read buffer, discarded each iteration)
+//   UB  (192 KB): a_ub 16 KB + b_ub 16 KB = 32 KB used
+//   L0A / L0B / L0C: unused
+// =============================================================================
+
+#define MEMORY_BASE
+#include <pto/pto-inst.hpp>
+#include "acl/acl.h"
+#include <runtime/rt_ffts.h>
+
+using namespace pto;
+
+#define TILE_SIZE 128
+#define HALF_TILE  64
+#define VEC_NUM     2
+
+#ifdef __CCE_AICORE__
+
+constexpr uint32_t L1_WS_OFFSET = 0u;           // workspace read buffer in L1
+constexpr uint32_t UB_A_OFFSET  = 0u;
+constexpr uint32_t UB_B_OFFSET  = HALF_TILE * TILE_SIZE * sizeof(half);  // 16 KB
+
+constexpr int32_t FLAG_C2V = 0;  // Cube → Vec: workspace slot consumed into L1
+constexpr int32_t FLAG_V2C = 1;  // Vec → Cube: workspace tile written to GM
+
+using TileL1 = Tile<TileType::Mat, half, TILE_SIZE, TILE_SIZE,
+                    BLayout::ColMajor, TILE_SIZE, TILE_SIZE,
+                    SLayout::RowMajor, 512, PadValue::Zero>;
+
+using TileVecUB =
+    Tile<TileType::Vec, half, HALF_TILE, TILE_SIZE,
+         BLayout::RowMajor, HALF_TILE, TILE_SIZE, SLayout::NoneBox, 512, PadValue::Null>;
+
+using TileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, TILE_SIZE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+using HalfTileGlobal =
+    GlobalTensor<half,
+                 TileShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 BaseShape2D<half, HALF_TILE, TILE_SIZE, Layout::ND>,
+                 Layout::ND>;
+
+template <pipe_t Pipe>
+AICORE inline void SetCrossFlag(int32_t flag) {
+    ffts_cross_core_sync(Pipe, 1 | (VEC_NUM << 4) | (flag << 8));
+}
+AICORE inline void WaitCrossFlag(int32_t flag) { wait_flag_dev(flag); }
+
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void SetFlag(uint32_t id) { set_flag(Src, Dst, static_cast<event_t>(id)); }
+template <pipe_t Src, pipe_t Dst>
+AICORE inline void WaitFlag(uint32_t id) { wait_flag(Src, Dst, static_cast<event_t>(id)); }
+
+AICORE void run_stream_v2c(
+    __gm__ half  *A,          // [num_iters * num_cores * T, T]  Vec input 1
+    __gm__ half  *D,          // [num_iters * num_cores * T, T]  Vec input 2
+    __gm__ half  *workspace,  // [num_cores * T, T]              V2C ping-pong buffer
+    int32_t       num_iters,
+    uint64_t      ffts_addr)
+{
+    const int32_t cid       = static_cast<int32_t>(get_block_idx());
+    const int32_t vid       = static_cast<int32_t>(get_subblockid());
+    const int32_t num_cores = static_cast<int32_t>(block_num);
+
+    set_ffts_base_addr(ffts_addr);
+
+    const int32_t wave_rows = num_cores * TILE_SIZE;
+
+    TileL1    ws_l1;
+    TASSIGN(ws_l1, L1_WS_OFFSET);
+
+    TileVecUB a_ub, b_ub;
+    TASSIGN(a_ub, UB_A_OFFSET);
+    TASSIGN(b_ub, UB_B_OFFSET);
+
+    // ── Cube: load workspace into L1, discard data, free slot immediately ──────
+#if defined(__DAV_C220_CUBE__)
+
+    TileGlobal ws_in(workspace + cid * TILE_SIZE * TILE_SIZE);
+    for (int32_t r = 0; r < num_iters; ++r) {
+        // Wait for both Vec sub-blocks to write their halves of the workspace tile.
+        WaitCrossFlag(FLAG_V2C);
+
+        // Capture workspace into L1 (the measured read).
+        TLOAD(ws_l1, ws_in);
+
+        // Signal Vec immediately from the MTE2 pipe: workspace slot is consumed
+        // (in L1) — Vec can overwrite it for the next round.
+        SetCrossFlag<PIPE_MTE2>(FLAG_C2V);
+
+        // Drain MTE2 before the next iteration's TLOAD touches ws_l1 again.
+        pipe_barrier(PIPE_MTE2);
+    }
+
+#endif  // __DAV_C220_CUBE__
+
+    // ── Vec: load A+D, add, write to workspace ─────────────────────────────────
+#if defined(__DAV_C220_VEC__)
+
+    set_mask_norm();
+    set_vector_mask(-1, -1);
+
+    const int32_t ws_row = cid * TILE_SIZE + vid * HALF_TILE;
+    HalfTileGlobal ws_out(workspace + ws_row * TILE_SIZE);
+
+    for (int32_t r = 0; r < num_iters; ++r) {
+        const int32_t row_v = r * wave_rows + cid * TILE_SIZE + vid * HALF_TILE;
+
+        // Load A and D — independent of the workspace handshake, so prefetch.
+        HalfTileGlobal a_global(A + row_v * TILE_SIZE);
+        TLOAD(a_ub, a_global);
+
+        HalfTileGlobal d_global(D + row_v * TILE_SIZE);
+        TLOAD(b_ub, d_global);
+
+        SetFlag<PIPE_MTE2, PIPE_V>(0);
+        WaitFlag<PIPE_MTE2, PIPE_V>(0);   // MTE2→V: both TLOADs done before TADD
+
+        TADD(a_ub, a_ub, b_ub);
+        SetFlag<PIPE_V, PIPE_MTE3>(0);
+        WaitFlag<PIPE_V, PIPE_MTE3>(0);   // V→MTE3: TADD done before TSTORE
+
+        // Wait for Cube to free the workspace slot (skip on round 0).
+        if (r > 0) {
+            WaitCrossFlag(FLAG_C2V);
+            // No local-pipe barrier needed after cross-core wait.
+        }
+
+        TSTORE(ws_out, a_ub);
+        pipe_barrier(PIPE_ALL);         // MTE3: wait for DMA to complete before signaling Cube
+        SetCrossFlag<PIPE_MTE3>(FLAG_V2C);
+    }
+
+#endif  // __DAV_C220_VEC__
+}
+
+#endif  // __CCE_AICORE__
+
+// ── Kernel entry point ─────────────────────────────────────────────────────────
+extern "C" __global__ AICORE void stream_v2c_kernel(
+    __gm__ uint8_t *A,
+    __gm__ uint8_t *D,
+    __gm__ uint8_t *workspace,
+    int32_t         num_iters,
+    uint64_t        ffts_addr)
+{
+    run_stream_v2c(
+        reinterpret_cast<__gm__ half *>(A),
+        reinterpret_cast<__gm__ half *>(D),
+        reinterpret_cast<__gm__ half *>(workspace),
+        num_iters, ffts_addr);
+}
+
+extern "C" void call_stream_v2c(uint32_t block_dim, void *stream,
+                                 uint8_t *A, uint8_t *D,
+                                 uint8_t *workspace, int32_t num_iters)
+{
+    uint32_t ffts_len  = 0;
+    uint64_t ffts_addr = 0;
+    rtGetC2cCtrlAddr(&ffts_addr, &ffts_len);
+    stream_v2c_kernel<<<block_dim, nullptr, stream>>>(A, D, workspace, num_iters, ffts_addr);
+}