57 changes: 38 additions & 19 deletions src/target/codegen_npuir_dev.cc
@@ -2555,8 +2555,8 @@ void CodeGenTileLangNPUIRDEV::VcumsumCodegen(const CallNode *op) {
     return;
   }
   auto booleanAttr = mlir::BoolAttr::get(builder.getContext(), false);
-  auto newCumsumOp = builder.create<mlir::hivm::VCumsumOp>(
-      loc, result_tensors, src, dst,
+  auto newCumsumOp = builder.create<mlir::hfusion::CumsumOp>(
+      loc, result_tensors, src,
       builder.getDenseI64ArrayAttr(npuirop.cum_dims), booleanAttr);
   SetVarValue(npuirop.dst, newCumsumOp->getResult(0));
 }
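For reference, a minimal host-side sketch of the semantics the new lowering is assumed to implement: hfusion.CumsumOp is taken here to be an inclusive scan along each dimension in cum_dims (matching torch.cumsum), with the false BoolAttr assumed to toggle a non-default scan mode. This is an editorial sketch validated only by test_cumsum_dev.py below, not the op's definition:

# Assumption: hfusion.CumsumOp is an inclusive cumulative sum along each
# dim in cum_dims; checked against torch.cumsum by test_cumsum_dev.py.
import torch

def cumsum_ref(x: torch.Tensor, cum_dims) -> torch.Tensor:
    out = x
    for d in cum_dims:
        out = torch.cumsum(out, dim=d)  # inclusive scan along dim d
    return out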
@@ -2647,30 +2647,49 @@ void CodeGenTileLangNPUIRDEV::VinterleaveCodegen(const CallNode *op) {
     srcs.push_back(src);
   }
   mlir::ValueRange srcs_vr(srcs);
-  Value dst = GenSubviewFromRegion(npuirop.dst, npuirop.dst_range);
-  builder.create<mlir::hivm::VInterleaveOp>(
-      builder.getUnknownLoc(), TypeRange{}, srcs_vr, dst,
-      static_cast<int64_t>(npuirop.channel_nums));
+  mlir::Value dst_tensor =
+      GenExtractSliceFromRegion(npuirop.dst, npuirop.dst_range);
+
+  auto interleaveOp = builder.create<mlir::hfusion::InterleaveOp>(
+      builder.getUnknownLoc(), dst_tensor.getType(), srcs_vr);
+
+  mlir::Value result = ReshapeCastAndInsertSlice(
+      interleaveOp->getResult(0), GetVarValue(npuirop.dst), npuirop.dst_range);
+
+  SetVarValue(npuirop.dst, result);
 }

 void CodeGenTileLangNPUIRDEV::VdeinterleaveCodegen(const CallNode *op) {
   tvm::tl::NpuirDeinterleave npuirop(op->args, this->vmap);
   Value src = GenSubviewFromRegion(npuirop.src, npuirop.src_range);
-  llvm::SmallVector<Value> dsts;
   size_t n_dsts = npuirop.dsts.size();
+
   for (size_t i = 0; i < n_dsts; i++) {
-    Value dst = GenSubviewFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
-    dsts.push_back(dst);
-  }
-  mlir::ValueRange dsts_vr(dsts);
-  auto channel_nums = mlir::IntegerAttr::get(
-      builder.getI64Type(), static_cast<int64_t>(npuirop.channel_nums));
-  mlir::hivm::DeinterleaveModeAttr index_mode =
-      mlir::hivm::DeinterleaveModeAttr::get(
-          &context, NPUIR_STR_DEINTERLEAVEMODE[npuirop.index_mode]);
-  builder.create<mlir::hivm::VDeinterleaveOp>(builder.getUnknownLoc(),
-                                              TypeRange{}, src, dsts_vr,
-                                              channel_nums, index_mode);
+    mlir::Value dst_tensor =
+        GenExtractSliceFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
+
+    int64_t current_channel_idx = i;
+    auto channelIdxAttr = builder.getI64IntegerAttr(current_channel_idx);
+
+    auto deinterleaveOp = builder.create<mlir::hfusion::DeinterleaveOp>(
+        builder.getUnknownLoc(), dst_tensor.getType(), src, channelIdxAttr);
+
+    mlir::Value result = ReshapeCastAndInsertSlice(
+        deinterleaveOp->getResult(0), GetVarValue(npuirop.dsts[i]),
+        npuirop.dsts_range[i]);
+
+    SetVarValue(npuirop.dsts[i], result);
+  }
 }

 /// Generate hivm.hir.varange for tl.npuir_arange.
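The interleave and deinterleave lowerings above follow the same tensor-based pattern: GenExtractSliceFromRegion yields a typed destination tensor, the hfusion op is built against that type, and ReshapeCastAndInsertSlice plus SetVarValue wire the result back into the destination buffer. A minimal sketch of the interleave semantics assumed here (channel i of the inputs lands at lane i of every group of channel_nums output lanes; an assumption drawn from the test below, not from the hfusion op definition):

# Assumption: interleave packs C same-shaped (..., N) sources into (..., N*C),
# channel i occupying output lanes i::C (matches test_interleave_dev.py).
import torch

def interleave_ref(srcs):
    # stack channels on a trailing axis, then flatten the last two axes
    return torch.stack(srcs, dim=-1).flatten(-2)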
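And the inverse, mirroring the per-channel DeinterleaveOp emitted in the loop above, where channelIdxAttr selects which channel a given destination receives (again a hedged sketch, validated only by test_deinterleave_dev.py):

# Assumption: deinterleave splits (..., N*C) into C tensors of shape (..., N),
# channel i taking input lanes i::C.
import torch

def deinterleave_ref(src: torch.Tensor, channel_nums: int):
    return [src[..., i::channel_nums] for i in range(channel_nums)]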
53 changes: 53 additions & 0 deletions testing/npuir/hfusion_ops/test_cumsum_dev.py
@@ -0,0 +1,53 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
K = 64
N = 32


def vec_cumsum(M, K, N, dtype="float16"):

    @T.prim_func
    def main(
            A: T.Tensor((M, K, N), dtype),
            B: T.Tensor((M, K, N), dtype),
    ):
        with T.Kernel(1, is_npu=True) as (cid, _):
            a = T.alloc_shared((M, K, N), dtype)
            s = T.alloc_shared((M, K, N), dtype)

            T.copy(A, a)
            T.cumsum(a, s, dim=2)

            T.copy(s, B)

    return main


def test_cumsum():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vec_cumsum(M, K, N)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v1 = torch.randn(size=[M, K, N], dtype=getattr(torch, dtype)).npu()
    v2 = torch.randn(size=[M, K, N], dtype=getattr(torch, dtype)).npu()  # output buffer, overwritten by the kernel

    v_ref = torch.cumsum(v1, dim=2)
    compiled_kernel(v1, v2)

    torch.testing.assert_close(v_ref, v2, rtol=1e-2, atol=1e-2)
    print("Cumsum Pass!")


if __name__ == "__main__":
    test_cumsum()
60 changes: 60 additions & 0 deletions testing/npuir/hfusion_ops/test_deinterleave_dev.py
@@ -0,0 +1,60 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
N = 32


def vdeinterleave_kernel(M, N, dtype):
    BLOCK_SIZE = 1

    @T.prim_func
    def main(
            C: T.Tensor((M, N * 2), dtype),
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
            C_ub = T.alloc_shared((M, N * 2), dtype)
            A_ub = T.alloc_shared((M, N), dtype)
            B_ub = T.alloc_shared((M, N), dtype)

            T.copy(C, C_ub)
            T.deinterleave(C_ub, A_ub, B_ub, channel_nums=2)
            T.copy(A_ub, A)
            T.copy(B_ub, B)

    return main


def test_vdeinterleave_kernel():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vdeinterleave_kernel(M, N, dtype)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v3 = torch.randn(size=[M, N * 2], dtype=getattr(torch, dtype)).npu()
    v1 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()  # output buffers,
    v2 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()  # overwritten by the kernel

    # Reference: channel 0 takes the even lanes, channel 1 the odd lanes.
    ref_a = v3[..., 0::2]
    ref_b = v3[..., 1::2]

    compiled_kernel(v3, v1, v2)

    torch.testing.assert_close(v1, ref_a, rtol=1e-2, atol=1e-2)
    torch.testing.assert_close(v2, ref_b, rtol=1e-2, atol=1e-2)
    print("Deinterleave Pass!")


if __name__ == "__main__":
    test_vdeinterleave_kernel()
55 changes: 55 additions & 0 deletions testing/npuir/hfusion_ops/test_interleave_dev.py
@@ -0,0 +1,55 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
N = 32


def vinterleave_kernel(M, N, dtype):
    BLOCK_SIZE = 1

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
            C: T.Tensor((M, N * 2), dtype),
    ):
        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
            A_ub = T.alloc_shared((M, N), dtype)
            B_ub = T.alloc_shared((M, N), dtype)
            C_ub = T.alloc_shared((M, N * 2), dtype)

            T.copy(A, A_ub)
            T.copy(B, B_ub)
            T.interleave(A_ub, B_ub, C_ub, channel_nums=2)
            T.copy(C_ub, C)

    return main


def test_vinterleave_kernel():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vinterleave_kernel(M, N, dtype)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v1 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()
    v2 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()
    v3 = torch.randn(size=[M, N * 2], dtype=getattr(torch, dtype)).npu()  # output buffer, overwritten by the kernel

    # Reference: lane-interleave v1 and v2 -> a0, b0, a1, b1, ...
    v_ref = torch.stack([v1, v2], dim=-1).flatten(-2)
    compiled_kernel(v1, v2, v3)

    torch.testing.assert_close(v_ref, v3, rtol=1e-2, atol=1e-2)
    print("Interleave Pass!")


if __name__ == "__main__":
    test_vinterleave_kernel()