huawei-csl · zhoubot · Mar 4, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 30, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,27 +1,11 @@
-name: CI
+name: Full Regression CI
 
 on:
   push:
     branches: [main]
-  pull_request:
-    branches: [main]
   workflow_dispatch:
 
 jobs:
-  pre-commit:
-    name: pre-commit
-    runs-on: ubuntu-24.04
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Run pre-commit checks
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install pre-commit
-          pre-commit run --all-files
-
   test:
     name: test (${{ matrix.arch }}, ${{ matrix.install-mode }})
     strategy:
@@ -58,6 +42,12 @@ jobs:
 
       - uses: actions/checkout@v4
 
+      - uses: actions/checkout@v4
+        with:
+          repository: ${{ env.RELEASE_REPO }}
+          ref: ${{ env.RELEASE_TAG }}
+          path: ptoas-src
+
       - name: Install Python packages
         run: |
           pip install --no-cache-dir torch==2.9.0 --index-url https://download.pytorch.org/whl/cpu
@@ -84,7 +74,7 @@ jobs:
 
       - name: Clone pto-isa headers
         run: |
-          git clone https://gitcode.com/cann/pto-isa.git /sources/pto-isa
+          git clone https://github.com/PTO-ISA/pto-isa.git /sources/pto-isa
           cd /sources/pto-isa && git checkout ${PTOISA_COMMIT}
 
       - name: Install ptodsl (${{ matrix.install-mode }})
@@ -95,8 +85,10 @@ jobs:
             pip install -e .
           fi
 
-      - name: Run frontend tests
-        run: pytest -v ./tests/frontend
+      - name: Run host API and regression tests
+        env:
+          PTOAS_VPTO_MANIFEST: ${{ github.workspace }}/ptoas-src/docs/vpto-manifest.json
+        run: pytest -v ./tests/api ./tests/frontend ./tests/regression
 
       - name: Run NPU build tests
         run: |

diff --git a/.github/workflows/fast-ci.yml b/.github/workflows/fast-ci.yml
@@ -0,0 +1,74 @@
+name: Fast CI
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  workflow_dispatch:
+
+env:
+  RELEASE_REPO: zhangstevenunity/PTOAS
+  RELEASE_VER: "0.9"  # quoted: a bare version scalar is a YAML float, so e.g. 1.0 or 1.10 would be retyped
+  RELEASE_TAG: v0.9
+
+jobs:
+  pre-commit:
+    name: pre-commit
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Run pre-commit checks
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install pre-commit
+          pre-commit run --all-files
+
+  host-tests:
+    name: host tests (${{ matrix.install-mode }})
+    runs-on: ubuntu-24.04
+    strategy:
+      fail-fast: false
+      matrix:
+        install-mode: [standard, editable]
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/checkout@v4
+        with:
+          repository: ${{ env.RELEASE_REPO }}
+          ref: ${{ env.RELEASE_TAG }}
+          path: ptoas-src
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Python packages
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install pytest
+
+      - name: Install ptoas wheel
+        run: |
+          WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_x86_64.whl
+          wget https://github.com/${RELEASE_REPO}/releases/download/${RELEASE_TAG}/${WHEEL_NAME}
+          python -m pip install ./${WHEEL_NAME}
+          python -c "import mlir.ir; from mlir.dialects import pto"
+
+      - name: Install ptodsl (${{ matrix.install-mode }})
+        run: |
+          if [ "${{ matrix.install-mode }}" = "standard" ]; then
+            python -m pip install .
+          else
+            python -m pip install -e .
+          fi
+
+      - name: Run host API and regression tests
+        env:
+          PTOAS_VPTO_MANIFEST: ${{ github.workspace }}/ptoas-src/docs/vpto-manifest.json
+        run: |
+          pytest -v ./tests/api ./tests/frontend ./tests/regression
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <div align="center">
 
 # PTO-DSL
-Pythonic interface and JIT compiler for [PTO-ISA](https://gitcode.com/cann/pto-isa)
+Pythonic interface and JIT compiler for [PTO-ISA](https://github.com/PTO-ISA/pto-isa)
 </div>
 
 PTO-DSL provides a programming abstraction similar to [cuTile](https://docs.nvidia.com/cuda/cutile-python/), but native to [NPU](https://www.hiascend.com/).
@@ -37,6 +37,37 @@ pip install -e .
 
 See [examples](./examples) and [tests](./tests)
 
+The preferred frontend style keeps the existing low-level ops available and adds a thin,
+object-centric layer on top of them for common tensor and tile flows:
+
+```python
+from ptodsl import pto, tile
+
+
+def vec_add(src0: "ptr_t", src1: "ptr_t", dst: "ptr_t", rows: "index_t", cols: "index_t"):
+    x = pto.make_tensor(src0, shape=[rows, cols], dtype=pto.float32)
+    y = pto.make_tensor(src1, shape=[rows, cols], dtype=pto.float32)
+    z = pto.make_tensor(dst, shape=[rows, cols], dtype=pto.float32)
+
+    x_tile = x.slice([0, 0], [32, 32])
+    y_tile = y.slice([0, 0], [32, 32])
+    z_tile = z.slice([0, 0], [32, 32])
+
+    with pto.vector_section():
+        tile_buf = pto.make_tile_buffer(pto.float32, [32, 32], space="VEC")
+        lhs = tile_buf.alloc()
+        rhs = tile_buf.alloc()
+        out = tile_buf.alloc()
+        lhs.load_from(x_tile)
+        rhs.load_from(y_tile)
+        tile.add(lhs, rhs, out)
+        out.store_to(z_tile)
+```
+
+The lower-level `PtrType`, `TensorType`, `SubTensorType`, `TileBufType`, `as_tensor`,
+`slice_view`, and `alloc_tile` APIs remain supported for cases where explicit control is
+preferred.
+
 ## Contribute
 
 See [contribute_guide.md](./contribute_guide.md)

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -61,3 +61,9 @@ RUN ptoas ./tmatmulk.pto -o ./tmatmulk.cpp
 
 RUN python ./abs.py > ./abs.pto
 RUN ptoas --enable-insert-sync ./abs.pto -o ./abs.cpp
+# certain operations need latest isa header, not CANN 8.5.0 default
+# header on 2026/02/14
+ARG PTOISA_COMMIT=672ee54cb8905bb9f9abbe80ec26ed2054b7a0cc
+WORKDIR /sources
+RUN git clone https://github.com/PTO-ISA/pto-isa.git \
+    && cd pto-isa && git checkout $PTOISA_COMMIT
diff --git a/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py b/examples/aot/matmul_mxfp8/matmul_mxfp8_builder.py
@@ -0,0 +1,84 @@
+from ptodsl import to_ir_module
+import ptodsl.language as pto
+
+
+def build(M=16, K=64, N=32, lhs_variant="e5m2", rhs_variant="e5m2"):  # returns the to_ir_module-decorated kernel
+    def meta_data():
+        mx = pto.make_mxfp8(lhs=lhs_variant, rhs=rhs_variant)  # mx supplies the lhs/rhs/scale/acc dtypes used below
+        scale_k = mx.scale_k(K)  # extent of the scale tensors along K
+
+        ptr_lhs = pto.PtrType(mx.lhs)
+        ptr_rhs = pto.PtrType(mx.rhs)
+        ptr_scale = pto.PtrType(mx.scale)
+        ptr_bias = pto.PtrType(mx.acc)
+
+        lhs_tensor = pto.TensorType(rank=2, dtype=mx.lhs)
+        rhs_tensor = pto.TensorType(rank=2, dtype=mx.rhs)
+        lhs_scale_tensor = pto.TensorType(rank=2, dtype=mx.scale)
+        rhs_scale_tensor = pto.TensorType(rank=2, dtype=mx.scale)
+        bias_tensor = pto.TensorType(rank=2, dtype=mx.acc)
+
+        lhs_tile_view = pto.SubTensorType(shape=[M, K], dtype=mx.lhs)
+        rhs_tile_view = pto.SubTensorType(shape=[K, N], dtype=mx.rhs)
+        lhs_scale_tile_view = pto.SubTensorType(shape=[M, scale_k], dtype=mx.scale)
+        rhs_scale_tile_view = pto.SubTensorType(shape=[scale_k, N], dtype=mx.scale)
+        bias_tile_view = pto.SubTensorType(shape=[1, N], dtype=mx.acc)
+
+        lhs_tile = pto.TileBufType(shape=[M, K], dtype=mx.lhs, memory_space="LEFT")
+        rhs_tile = pto.TileBufType(shape=[K, N], dtype=mx.rhs, memory_space="RIGHT")
+        lhs_scale_tile = pto.LeftScaleTileBufType(shape=[M, scale_k], dtype=mx.scale)
+        rhs_scale_tile = pto.RightScaleTileBufType(shape=[scale_k, N], dtype=mx.scale)
+        bias_tile = pto.TileBufType(shape=[1, N], dtype=mx.acc, memory_space="BIAS")
+        acc_tile = pto.TileBufType(shape=[M, N], dtype=mx.acc, memory_space="ACC")
+
+        return locals()  # every local name defined above is returned as a dict
+
+    const = pto.const  # short alias used for the index constants below
+
+    @to_ir_module(meta_data=meta_data)
+    def matmul_mxfp8(
+        a_ptr: "ptr_lhs",  # string annotations match names returned by meta_data()
+        a_scale_ptr: "ptr_scale",
+        b_ptr: "ptr_rhs",
+        b_scale_ptr: "ptr_scale",
+        bias_ptr: "ptr_bias",
+    ) -> None:
+        c0 = const(0)
+        c1 = const(1)
+        cM = const(M)
+        cK = const(K)
+        cN = const(N)
+        cScaleK = const(scale_k)  # NOTE(review): scale_k is local to meta_data(); presumably injected by to_ir_module - confirm
+
+        tv_a = pto.as_tensor(lhs_tensor, ptr=a_ptr, shape=[cM, cK], strides=[cK, c1])
+        tv_b = pto.as_tensor(rhs_tensor, ptr=b_ptr, shape=[cK, cN], strides=[cN, c1])
+        tv_scale_a = pto.as_tensor(lhs_scale_tensor, ptr=a_scale_ptr, shape=[cM, cScaleK], strides=[cScaleK, c1])
+        tv_scale_b = pto.as_tensor(rhs_scale_tensor, ptr=b_scale_ptr, shape=[cScaleK, cN], strides=[cN, c1])
+        tv_bias = pto.as_tensor(bias_tensor, ptr=bias_ptr, shape=[c1, cN], strides=[cN, c1])
+
+        sv_a = pto.slice_view(lhs_tile_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK])
+        sv_b = pto.slice_view(rhs_tile_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN])
+        sv_scale_a = pto.slice_view(lhs_scale_tile_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK])
+        sv_scale_b = pto.slice_view(rhs_scale_tile_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN])
+        sv_bias = pto.slice_view(bias_tile_view, source=tv_bias, offsets=[c0, c0], sizes=[c1, cN])
+
+        with pto.cube_section():
+            a_tile = pto.alloc_tile(lhs_tile)
+            b_tile = pto.alloc_tile(rhs_tile)
+            a_scale_tile = pto.alloc_tile(lhs_scale_tile)
+            b_scale_tile = pto.alloc_tile(rhs_scale_tile)
+            bias_tile_buf = pto.alloc_tile(bias_tile)
+            acc_tile_buf = pto.alloc_tile(acc_tile)
+
+            pto.load(sv_a, a_tile)
+            pto.load(sv_b, b_tile)
+            pto.load(sv_scale_a, a_scale_tile)
+            pto.load(sv_scale_b, b_scale_tile)
+            pto.load(sv_bias, bias_tile_buf)
+            pto.matmul_mx_bias(a_tile, a_scale_tile, b_tile, b_scale_tile, bias_tile_buf, acc_tile_buf)  # NOTE(review): no store of acc_tile_buf follows
+
+    return matmul_mxfp8
+
+
+if __name__ == "__main__":
+    print(build())
diff --git a/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py b/examples/aot/matmul_mxfp8/mxfp8_ppt_example.py
@@ -0,0 +1,80 @@
+from ptodsl import to_ir_module
+import ptodsl.language as pto
+
+
+M, K, N = 16, 64, 32
+
+
+def meta_data():
+    # 1) Choose the MXFP8 combination. The default is lhs=e5m2, rhs=e5m2, scale=e8m0, acc=f32.
+    mx = pto.make_mxfp8(lhs="e5m2", rhs="e5m2")
+    scale_k = mx.scale_k(K)  # the MXFP8 scale tensor is compressed 32:1 along the K dimension
+
+    # 2) Global input pointer types
+    a_ptr = pto.PtrType(mx.lhs)
+    b_ptr = pto.PtrType(mx.rhs)
+    scale_ptr = pto.PtrType(mx.scale)
+
+    # 3) TensorView types
+    a_tensor = pto.TensorType(rank=2, dtype=mx.lhs)
+    b_tensor = pto.TensorType(rank=2, dtype=mx.rhs)
+    scale_a_tensor = pto.TensorType(rank=2, dtype=mx.scale)
+    scale_b_tensor = pto.TensorType(rank=2, dtype=mx.scale)
+
+    # 4) TileView / TileBuf types
+    a_view = pto.SubTensorType(shape=[M, K], dtype=mx.lhs)
+    b_view = pto.SubTensorType(shape=[K, N], dtype=mx.rhs)
+    scale_a_view = pto.SubTensorType(shape=[M, scale_k], dtype=mx.scale)
+    scale_b_view = pto.SubTensorType(shape=[scale_k, N], dtype=mx.scale)
+
+    a_tile = pto.TileBufType(shape=[M, K], dtype=mx.lhs, memory_space="LEFT")
+    b_tile = pto.TileBufType(shape=[K, N], dtype=mx.rhs, memory_space="RIGHT")
+    scale_a_tile = pto.LeftScaleTileBufType(shape=[M, scale_k], dtype=mx.scale)
+    scale_b_tile = pto.RightScaleTileBufType(shape=[scale_k, N], dtype=mx.scale)
+    acc_tile = pto.TileBufType(shape=[M, N], dtype=mx.acc, memory_space="ACC")
+
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data)
+def matmul_mxfp8_core(
+    a: "a_ptr",
+    scale_a: "scale_ptr",
+    b: "b_ptr",
+    scale_b: "scale_ptr",
+) -> None:
+    c0 = pto.const(0)
+    c1 = pto.const(1)
+    cM = pto.const(M)
+    cK = pto.const(K)
+    cN = pto.const(N)
+    cScaleK = pto.const(scale_k)  # NOTE(review): scale_k is local to meta_data(); presumably injected by to_ir_module - confirm
+
+    tv_a = pto.as_tensor(a_tensor, ptr=a, shape=[cM, cK], strides=[cK, c1])
+    tv_b = pto.as_tensor(b_tensor, ptr=b, shape=[cK, cN], strides=[cN, c1])
+    tv_scale_a = pto.as_tensor(scale_a_tensor, ptr=scale_a, shape=[cM, cScaleK], strides=[cScaleK, c1])
+    tv_scale_b = pto.as_tensor(scale_b_tensor, ptr=scale_b, shape=[cScaleK, cN], strides=[cN, c1])
+
+    sv_a = pto.slice_view(a_view, source=tv_a, offsets=[c0, c0], sizes=[cM, cK])
+    sv_b = pto.slice_view(b_view, source=tv_b, offsets=[c0, c0], sizes=[cK, cN])
+    sv_scale_a = pto.slice_view(scale_a_view, source=tv_scale_a, offsets=[c0, c0], sizes=[cM, cScaleK])
+    sv_scale_b = pto.slice_view(scale_b_view, source=tv_scale_b, offsets=[c0, c0], sizes=[cScaleK, cN])
+
+    with pto.cube_section():
+        ta = pto.alloc_tile(a_tile)
+        tb = pto.alloc_tile(b_tile)
+        tsa = pto.alloc_tile(scale_a_tile)
+        tsb = pto.alloc_tile(scale_b_tile)
+        tc = pto.alloc_tile(acc_tile)
+
+        pto.load(sv_a, ta)
+        pto.load(sv_b, tb)
+        pto.load(sv_scale_a, tsa)
+        pto.load(sv_scale_b, tsb)
+
+        # Core call: MXFP8 data tile + scale tile -> Acc tile
+        pto.matmul_mx(ta, tsa, tb, tsb, tc)
+
+
+if __name__ == "__main__":
+    print(matmul_mxfp8_core)
diff --git a/examples/aot/template_arithmetic/constexpr_tile_builder.py b/examples/aot/template_arithmetic/constexpr_tile_builder.py
@@ -0,0 +1,36 @@
+from ptodsl import Constexpr, const_expr, pto, range_constexpr, to_ir_module
+from ptodsl import scalar as s
+
+
+const = s.const
+
+
+def meta_data(TILE_K, UNROLL=2):  # UNROLL is not referenced in this body
+    dtype = pto.float32  # element dtype of the tile type below
+    return {
+        "index_dtype": pto.int32,
+        "tile_type": pto.TileBufType(
+            shape=[1, TILE_K // 2],
+            valid_shape=[1, TILE_K // 2],
+            dtype=dtype,
+            memory_space="VEC",
+        ),
+    }
+
+
+@to_ir_module(meta_data=meta_data)
+def constexpr_tile_kernel(
+    n: "index_dtype",  # not referenced in the body
+    TILE_K: Constexpr[int],
+    UNROLL: Constexpr[int] = 2,
+) -> None:
+    with pto.vector_section():
+        if const_expr(TILE_K % 128 == 0):  # presumably resolved while the module is built - confirm
+            for _ in range_constexpr(UNROLL):
+                pto.alloc_tile(tile_type)  # one allocation per iteration; the result is discarded
+        else:
+            pto.alloc_tile(tile_type)  # single allocation; the result is discarded
+
+
+if __name__ == "__main__":
+    print(constexpr_tile_kernel(TILE_K=128, UNROLL=3))