diff --git a/src/target/codegen_npuir_dev.cc b/src/target/codegen_npuir_dev.cc
index 27b49f70a..c2b8dc66c 100644
--- a/src/target/codegen_npuir_dev.cc
+++ b/src/target/codegen_npuir_dev.cc
@@ -2555,8 +2555,8 @@ void CodeGenTileLangNPUIRDEV::VcumsumCodegen(const CallNode *op) {
     return;
   }
   auto booleanAttr = mlir::BoolAttr::get(builder.getContext(), false);
-  auto newCumsumOp = builder.create(
-      loc, result_tensors, src, dst,
+  auto newCumsumOp = builder.create(
+      loc, result_tensors, src,
       builder.getDenseI64ArrayAttr(npuirop.cum_dims), booleanAttr);
   SetVarValue(npuirop.dst, newCumsumOp->getResult(0));
 }
@@ -2647,30 +2647,49 @@ void CodeGenTileLangNPUIRDEV::VinterleaveCodegen(const CallNode *op) {
     srcs.push_back(src);
   }
   mlir::ValueRange srcs_vr(srcs);
-  Value dst = GenSubviewFromRegion(npuirop.dst, npuirop.dst_range);
-  builder.create(
-      builder.getUnknownLoc(), TypeRange{}, srcs_vr, dst,
-      static_cast<int64_t>(npuirop.channel_nums));
+  mlir::Value dst_tensor = GenExtractSliceFromRegion(npuirop.dst, npuirop.dst_range);
+
+  auto interleaveOp = builder.create(
+      builder.getUnknownLoc(),
+      dst_tensor.getType(),
+      srcs_vr
+  );
+
+  mlir::Value result = ReshapeCastAndInsertSlice(
+      interleaveOp->getResult(0),
+      GetVarValue(npuirop.dst),
+      npuirop.dst_range
+  );
+
+  SetVarValue(npuirop.dst, result);
 }
 
 void CodeGenTileLangNPUIRDEV::VdeinterleaveCodegen(const CallNode *op) {
   tvm::tl::NpuirDeinterleave npuirop(op->args, this->vmap);
   Value src = GenSubviewFromRegion(npuirop.src, npuirop.src_range);
-  llvm::SmallVector<Value> dsts;
   size_t n_dsts = npuirop.dsts.size();
+
   for (size_t i = 0; i < n_dsts; i++) {
-    Value dst = GenSubviewFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
-    dsts.push_back(dst);
-  }
-  mlir::ValueRange dsts_vr(dsts);
-  auto channel_nums = mlir::IntegerAttr::get(
-      builder.getI64Type(), static_cast<int64_t>(npuirop.channel_nums));
-  mlir::hivm::DeinterleaveModeAttr index_mode =
-      mlir::hivm::DeinterleaveModeAttr::get(
-          &context, NPUIR_STR_DEINTERLEAVEMODE[npuirop.index_mode]);
-  builder.create(builder.getUnknownLoc(),
-                 TypeRange{}, src, dsts_vr,
-                 channel_nums, index_mode);
+    mlir::Value dst_tensor = GenExtractSliceFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
+
+    int64_t current_channel_idx = i;
+    auto channelIdxAttr = builder.getI64IntegerAttr(current_channel_idx);
+
+    auto deinterleaveOp = builder.create(
+        builder.getUnknownLoc(),
+        dst_tensor.getType(),
+        src,
+        channelIdxAttr
+    );
+
+    mlir::Value result = ReshapeCastAndInsertSlice(
+        deinterleaveOp->getResult(0),
+        GetVarValue(npuirop.dsts[i]),
+        npuirop.dsts_range[i]
+    );
+
+    SetVarValue(npuirop.dsts[i], result);
+  }
 }
 
 /// Generate hivm.hir.varange for tl.npuir_arange.
diff --git a/testing/npuir/hfusion_ops/test_cumsum_dev.py b/testing/npuir/hfusion_ops/test_cumsum_dev.py
new file mode 100644
index 000000000..9e949e76d
--- /dev/null
+++ b/testing/npuir/hfusion_ops/test_cumsum_dev.py
@@ -0,0 +1,53 @@
+# Copyright (c) Tile-AI Corporation.
+# Licensed under the MIT License.
+import os
+
+import tilelang
+import tilelang.language as T
+
+import torch
+import torch_npu
+
+tilelang.cache.clear_cache()
+dtype = "float16"
+M = 2
+K = 64
+N = 32
+
+
+def vec_cumsum(M, K, N, dtype="float16"):
+    @T.prim_func
+    def main(A: T.Tensor((M, K, N), dtype),
+             B: T.Tensor((M, K, N), dtype),
+             ):
+        with T.Kernel(1, is_npu=True) as (cid, _):
+            a = T.alloc_shared((M, K, N), dtype)
+            s = T.alloc_shared((M, K, N), dtype)
+
+            T.copy(A, a)
+            T.cumsum(a, s, dim=2)
+
+            T.copy(s, B)
+
+    return main
+
+
+def test_cumsum():
+    torch.npu.set_device(0)
+    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'
+
+    func = vec_cumsum(M, K, N)
+    compiled_kernel = tilelang.compile(func, target="npuir")
+
+    v1 = torch.randn(size=[M, K, N], dtype=eval("torch." + dtype)).npu()
+    v2 = torch.randn(size=[M, K, N], dtype=eval("torch." + dtype)).npu()
+
+    v_ref = torch.cumsum(v1, dim=2)
+    compiled_kernel(v1, v2)
+
+    torch.testing.assert_close(v_ref, v2, rtol=1e-2, atol=1e-2)
+    print("Cumsum Pass!")
+
+
+if __name__ == "__main__":
+    test_cumsum()
\ No newline at end of file
diff --git a/testing/npuir/hfusion_ops/test_deinterleave_dev.py b/testing/npuir/hfusion_ops/test_deinterleave_dev.py
new file mode 100644
index 000000000..51dbe3bc3
--- /dev/null
+++ b/testing/npuir/hfusion_ops/test_deinterleave_dev.py
@@ -0,0 +1,60 @@
+# Copyright (c) Tile-AI Corporation.
+# Licensed under the MIT License.
+import os
+
+import tilelang
+import tilelang.language as T
+
+import torch
+import torch_npu
+
+tilelang.cache.clear_cache()
+dtype = "float16"
+M = 2
+N = 32
+
+def vdeinterleave_kernel(M, N, dtype):
+    BLOCK_SIZE = 1
+
+    @T.prim_func
+    def main(
+            C: T.Tensor((M, N * 2), dtype),
+            A: T.Tensor((M, N), dtype),
+            B: T.Tensor((M, N), dtype)
+    ):
+        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
+            C_ub = T.alloc_shared((M, N * 2), dtype)
+            A_ub = T.alloc_shared((M, N), dtype)
+            B_ub = T.alloc_shared((M, N), dtype)
+
+            T.copy(C, C_ub)
+            T.deinterleave(C_ub, A_ub, B_ub, channel_nums=2)
+            T.copy(A_ub, A)
+            T.copy(B_ub, B)
+
+    return main
+
+def test_vdeinterleave_kernel():
+    torch.npu.set_device(0)
+    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'
+
+    func = vdeinterleave_kernel(M, N, dtype)
+    compiled_kernel = tilelang.compile(func, target="npuir")
+
+    v3 = torch.randn(size=[M, N * 2], dtype=eval("torch." + dtype)).npu()
+    v1 = torch.randn(size=[M, N], dtype=eval("torch." + dtype)).npu()
+    v2 = torch.randn(size=[M, N], dtype=eval("torch." + dtype)).npu()
+
+    v_unflatten = v3.unflatten(-1, (N, 2))
+    ref_a, ref_b = v_unflatten.split(1, dim=-1)
+    ref_a = ref_a.squeeze(-1).to(dtype=eval("torch." + dtype))
+    ref_b = ref_b.squeeze(-1).to(dtype=eval("torch." + dtype))
+
+    compiled_kernel(v3, v1, v2)
+
+    torch.testing.assert_close(v1, ref_a, rtol=1e-2, atol=1e-2)
+    torch.testing.assert_close(v2, ref_b, rtol=1e-2, atol=1e-2)
+    print("Deinterleave Pass!")
+
+if __name__ == "__main__":
+    test_vdeinterleave_kernel()
\ No newline at end of file
diff --git a/testing/npuir/hfusion_ops/test_interleave_dev.py b/testing/npuir/hfusion_ops/test_interleave_dev.py
new file mode 100644
index 000000000..46b81c625
--- /dev/null
+++ b/testing/npuir/hfusion_ops/test_interleave_dev.py
@@ -0,0 +1,55 @@
+# Copyright (c) Tile-AI Corporation.
+# Licensed under the MIT License.
+import os
+
+import tilelang
+import tilelang.language as T
+
+import torch
+import torch_npu
+
+tilelang.cache.clear_cache()
+dtype = "float16"
+M = 2
+N = 32
+
+def vinterleave_kernel(M, N, dtype):
+    BLOCK_SIZE = 1
+
+    @T.prim_func
+    def main(A: T.Tensor((M, N), dtype),
+             B: T.Tensor((M, N), dtype),
+             C: T.Tensor((M, N * 2), dtype)):
+
+        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
+
+            A_ub = T.alloc_shared((M, N), dtype)
+            B_ub = T.alloc_shared((M, N), dtype)
+            C_ub = T.alloc_shared((M, N * 2), dtype)
+
+            T.copy(A, A_ub)
+            T.copy(B, B_ub)
+            T.interleave(A_ub, B_ub, C_ub, channel_nums=2)
+            T.copy(C_ub, C)
+
+    return main
+
+def test_vinterleave_kernel():
+    torch.npu.set_device(0)
+    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'
+
+    func = vinterleave_kernel(M, N, dtype)
+    compiled_kernel = tilelang.compile(func, target="npuir")
+
+    v1 = torch.randn(size=[M, N], dtype=eval("torch." + dtype)).npu()
+    v2 = torch.randn(size=[M, N], dtype=eval("torch." + dtype)).npu()
+    v3 = torch.randn(size=[M, N * 2], dtype=eval("torch." + dtype)).npu()
+
+    v_ref = torch.cat([v1.unsqueeze(-1), v2.unsqueeze(-1)], dim=-1).flatten(-2)
+    compiled_kernel(v1, v2, v3)
+
+    torch.testing.assert_close(v_ref, v3, rtol=1e-2, atol=1e-2)
+    print("Interleave Pass!")
+
+if __name__ == "__main__":
+    test_vinterleave_kernel()
\ No newline at end of file