57 changes: 38 additions & 19 deletions src/target/codegen_npuir_dev.cc
@@ -2555,8 +2555,8 @@ void CodeGenTileLangNPUIRDEV::VcumsumCodegen(const CallNode *op) {
     return;
   }
   auto booleanAttr = mlir::BoolAttr::get(builder.getContext(), false);
-  auto newCumsumOp = builder.create<mlir::hivm::VCumsumOp>(
-      loc, result_tensors, src, dst,
+  auto newCumsumOp = builder.create<mlir::hfusion::CumsumOp>(
+      loc, result_tensors, src,
       builder.getDenseI64ArrayAttr(npuirop.cum_dims), booleanAttr);
   SetVarValue(npuirop.dst, newCumsumOp->getResult(0));
 }
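For reference, a minimal host-side sketch of the semantics the new lowering is assumed to implement: hfusion.CumsumOp is taken here to be an inclusive scan along each dimension in cum_dims (matching torch.cumsum), with the false BoolAttr assumed to toggle a non-default scan mode. This is an editorial sketch validated only by test_cumsum_dev.py below, not the op's definition:

# Assumption: hfusion.CumsumOp is an inclusive cumulative sum along each
# dim in cum_dims; checked against torch.cumsum by test_cumsum_dev.py.
import torch

def cumsum_ref(x: torch.Tensor, cum_dims) -> torch.Tensor:
    out = x
    for d in cum_dims:
        out = torch.cumsum(out, dim=d)  # inclusive scan along dim d
    return out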
@@ -2647,30 +2647,49 @@ void CodeGenTileLangNPUIRDEV::VinterleaveCodegen(const CallNode *op) {
     srcs.push_back(src);
   }
   mlir::ValueRange srcs_vr(srcs);
-  Value dst = GenSubviewFromRegion(npuirop.dst, npuirop.dst_range);
-  builder.create<mlir::hivm::VInterleaveOp>(
-      builder.getUnknownLoc(), TypeRange{}, srcs_vr, dst,
-      static_cast<int64_t>(npuirop.channel_nums));
+  mlir::Value dst_tensor =
+      GenExtractSliceFromRegion(npuirop.dst, npuirop.dst_range);
+
+  auto interleaveOp = builder.create<mlir::hfusion::InterleaveOp>(
+      builder.getUnknownLoc(), dst_tensor.getType(), srcs_vr);
+
+  mlir::Value result = ReshapeCastAndInsertSlice(
+      interleaveOp->getResult(0), GetVarValue(npuirop.dst), npuirop.dst_range);
+
+  SetVarValue(npuirop.dst, result);
 }

 void CodeGenTileLangNPUIRDEV::VdeinterleaveCodegen(const CallNode *op) {
   tvm::tl::NpuirDeinterleave npuirop(op->args, this->vmap);
   Value src = GenSubviewFromRegion(npuirop.src, npuirop.src_range);
-  llvm::SmallVector<Value> dsts;
   size_t n_dsts = npuirop.dsts.size();
+
   for (size_t i = 0; i < n_dsts; i++) {
-    Value dst = GenSubviewFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
-    dsts.push_back(dst);
-  }
-  mlir::ValueRange dsts_vr(dsts);
-  auto channel_nums = mlir::IntegerAttr::get(
-      builder.getI64Type(), static_cast<int64_t>(npuirop.channel_nums));
-  mlir::hivm::DeinterleaveModeAttr index_mode =
-      mlir::hivm::DeinterleaveModeAttr::get(
-          &context, NPUIR_STR_DEINTERLEAVEMODE[npuirop.index_mode]);
-  builder.create<mlir::hivm::VDeinterleaveOp>(builder.getUnknownLoc(),
-                                              TypeRange{}, src, dsts_vr,
-                                              channel_nums, index_mode);
+    mlir::Value dst_tensor =
+        GenExtractSliceFromRegion(npuirop.dsts[i], npuirop.dsts_range[i]);
+
+    int64_t current_channel_idx = i;
+    auto channelIdxAttr = builder.getI64IntegerAttr(current_channel_idx);
+
+    auto deinterleaveOp = builder.create<mlir::hfusion::DeinterleaveOp>(
+        builder.getUnknownLoc(), dst_tensor.getType(), src, channelIdxAttr);
+
+    mlir::Value result = ReshapeCastAndInsertSlice(
+        deinterleaveOp->getResult(0), GetVarValue(npuirop.dsts[i]),
+        npuirop.dsts_range[i]);
+
+    SetVarValue(npuirop.dsts[i], result);
+  }
 }

 /// Generate hivm.hir.varange for tl.npuir_arange.
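The interleave and deinterleave lowerings above follow the same tensor-based pattern: GenExtractSliceFromRegion yields a typed destination tensor, the hfusion op is built against that type, and ReshapeCastAndInsertSlice plus SetVarValue wire the result back into the destination buffer. A minimal sketch of the interleave semantics assumed here (channel i of the inputs lands at lane i of every group of channel_nums output lanes; an assumption drawn from the test below, not from the hfusion op definition):

# Assumption: interleave packs C same-shaped (..., N) sources into (..., N*C),
# channel i occupying output lanes i::C (matches test_interleave_dev.py).
import torch

def interleave_ref(srcs):
    # stack channels on a trailing axis, then flatten the last two axes
    return torch.stack(srcs, dim=-1).flatten(-2)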
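And the inverse, mirroring the per-channel DeinterleaveOp emitted in the loop above, where channelIdxAttr selects which channel a given destination receives (again a hedged sketch, validated only by test_deinterleave_dev.py):

# Assumption: deinterleave splits (..., N*C) into C tensors of shape (..., N),
# channel i taking input lanes i::C.
import torch

def deinterleave_ref(src: torch.Tensor, channel_nums: int):
    return [src[..., i::channel_nums] for i in range(channel_nums)]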
53 changes: 53 additions & 0 deletions testing/npuir/hfusion_ops/test_cumsum_dev.py
@@ -0,0 +1,53 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
K = 64
N = 32


def vec_cumsum(M, K, N, dtype="float16"):

    @T.prim_func
    def main(
            A: T.Tensor((M, K, N), dtype),
            B: T.Tensor((M, K, N), dtype),
    ):
        with T.Kernel(1, is_npu=True) as (cid, _):
            a = T.alloc_shared((M, K, N), dtype)
            s = T.alloc_shared((M, K, N), dtype)

            T.copy(A, a)
            T.cumsum(a, s, dim=2)

            T.copy(s, B)

    return main


def test_cumsum():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vec_cumsum(M, K, N)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v1 = torch.randn(size=[M, K, N], dtype=getattr(torch, dtype)).npu()
    v2 = torch.randn(size=[M, K, N], dtype=getattr(torch, dtype)).npu()  # output buffer, overwritten by the kernel

    v_ref = torch.cumsum(v1, dim=2)
    compiled_kernel(v1, v2)

    torch.testing.assert_close(v_ref, v2, rtol=1e-2, atol=1e-2)
    print("Cumsum Pass!")


if __name__ == "__main__":
    test_cumsum()
60 changes: 60 additions & 0 deletions testing/npuir/hfusion_ops/test_deinterleave_dev.py
@@ -0,0 +1,60 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
N = 32


def vdeinterleave_kernel(M, N, dtype):
    BLOCK_SIZE = 1

    @T.prim_func
    def main(
            C: T.Tensor((M, N * 2), dtype),
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
            C_ub = T.alloc_shared((M, N * 2), dtype)
            A_ub = T.alloc_shared((M, N), dtype)
            B_ub = T.alloc_shared((M, N), dtype)

            T.copy(C, C_ub)
            T.deinterleave(C_ub, A_ub, B_ub, channel_nums=2)
            T.copy(A_ub, A)
            T.copy(B_ub, B)

    return main


def test_vdeinterleave_kernel():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vdeinterleave_kernel(M, N, dtype)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v3 = torch.randn(size=[M, N * 2], dtype=getattr(torch, dtype)).npu()
    v1 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()  # output buffers,
    v2 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()  # overwritten by the kernel

    # Reference: channel 0 takes the even lanes, channel 1 the odd lanes.
    ref_a = v3[..., 0::2]
    ref_b = v3[..., 1::2]

    compiled_kernel(v3, v1, v2)

    torch.testing.assert_close(v1, ref_a, rtol=1e-2, atol=1e-2)
    torch.testing.assert_close(v2, ref_b, rtol=1e-2, atol=1e-2)
    print("Deinterleave Pass!")


if __name__ == "__main__":
    test_vdeinterleave_kernel()
55 changes: 55 additions & 0 deletions testing/npuir/hfusion_ops/test_interleave_dev.py
@@ -0,0 +1,55 @@
# Copyright (c) Tile-AI Corporation.
# Licensed under the MIT License.
import os

import tilelang
import tilelang.language as T

import torch
import torch_npu

tilelang.cache.clear_cache()
dtype = "float16"
M = 2
N = 32


def vinterleave_kernel(M, N, dtype):
    BLOCK_SIZE = 1

    @T.prim_func
    def main(
            A: T.Tensor((M, N), dtype),
            B: T.Tensor((M, N), dtype),
            C: T.Tensor((M, N * 2), dtype),
    ):
        with T.Kernel(BLOCK_SIZE, is_npu=True) as (cid, _):
            A_ub = T.alloc_shared((M, N), dtype)
            B_ub = T.alloc_shared((M, N), dtype)
            C_ub = T.alloc_shared((M, N * 2), dtype)

            T.copy(A, A_ub)
            T.copy(B, B_ub)
            T.interleave(A_ub, B_ub, C_ub, channel_nums=2)
            T.copy(C_ub, C)

    return main


def test_vinterleave_kernel():
    torch.npu.set_device(0)
    os.environ['TILELANG_ASCEND_MODE'] = 'Developer'

    func = vinterleave_kernel(M, N, dtype)
    compiled_kernel = tilelang.compile(func, target="npuir")

    v1 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()
    v2 = torch.randn(size=[M, N], dtype=getattr(torch, dtype)).npu()
    v3 = torch.randn(size=[M, N * 2], dtype=getattr(torch, dtype)).npu()  # output buffer, overwritten by the kernel

    # Reference: lane-interleave v1 and v2 -> a0, b0, a1, b1, ...
    v_ref = torch.stack([v1, v2], dim=-1).flatten(-2)
    compiled_kernel(v1, v2, v3)

    torch.testing.assert_close(v_ref, v3, rtol=1e-2, atol=1e-2)
    print("Interleave Pass!")


if __name__ == "__main__":
    test_vinterleave_kernel()