Skip to content

Commit c7373fe

Browse files
authored
Merge pull request #417 from InfiniTensor/issue/416
issue/416: p800 rearrange kernel
2 parents 0e1c558 + f0c357b commit c7373fe

File tree

3 files changed

+269
-0
lines changed

3 files changed

+269
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#ifndef __REARRANGE_KUNLUN_H__
2+
#define __REARRANGE_KUNLUN_H__
3+
4+
#include "../../../tensor.h"
5+
#include "../rearrange.h"
6+
#include <numeric>
7+
8+
namespace op::rearrange::kunlun {
9+
10+
struct RearrangeInfo {
11+
std::vector<size_t> shape;
12+
std::vector<ptrdiff_t> src_strides;
13+
std::vector<ptrdiff_t> dst_strides;
14+
infiniDtype_t dtype;
15+
16+
// Device space Size for shape, src_strides, dst_strides
17+
size_t workspace_size;
18+
19+
size_t nelements() const {
20+
return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
21+
}
22+
size_t ndim() const { return shape.size(); }
23+
size_t workspaceSize() const { return workspace_size; }
24+
25+
static utils::Result<RearrangeInfo> create(
26+
infiniopTensorDescriptor_t y_desc,
27+
infiniopTensorDescriptor_t x_desc) {
28+
auto dtype = y_desc->dtype();
29+
auto ndim = y_desc->ndim();
30+
31+
CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
32+
CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
33+
34+
auto y_shape = y_desc->shape();
35+
auto y_strides = y_desc->strides();
36+
auto x_shape = x_desc->shape();
37+
auto x_strides = x_desc->strides();
38+
39+
CHECK_SAME_SHAPE(x_shape, y_shape);
40+
auto workspace_size_ = sizeof(size_t) * ndim + sizeof(ptrdiff_t) * ndim * 2;
41+
42+
return utils::Result<RearrangeInfo>(RearrangeInfo{
43+
y_shape,
44+
x_strides,
45+
y_strides,
46+
dtype,
47+
workspace_size_,
48+
});
49+
}
50+
};
51+
52+
// Rearrange operator descriptor for the Kunlun backend. Holds the
// validated shape/stride metadata and backend-private device state.
class Descriptor final : public InfiniopDescriptor {
    // Backend-private state (handle internals + device workspace);
    // defined in the corresponding .cc file.
    struct Opaque;
    Opaque *_opaque;

    RearrangeInfo _info;

    // Private: instances are built through the static create() factory.
    Descriptor(
        Opaque *opaque,
        infiniDevice_t device_type,
        int device_id,
        RearrangeInfo info)
        : InfiniopDescriptor{device_type, device_id},
          _opaque(opaque),
          _info(info) {}

public:
    ~Descriptor();

    // Validates the x/y descriptors, allocates device workspace, and
    // writes the new descriptor to *desc_ptr on success.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc);

    // Copies x into y according to the recorded strides, enqueued on
    // `stream`.
    infiniStatus_t calculate(
        void *y,
        const void *x,
        void *stream) const;
};
81+
} // namespace op::rearrange::kunlun
82+
83+
#endif // __REARRANGE_KUNLUN_H__
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#include "../../../devices/kunlun/kunlun_common.h"
2+
#include "../../../devices/kunlun/kunlun_handle.h"
3+
#include "../../../devices/kunlun/kunlun_kernel_common.h"
4+
#include "rearrange_kunlun.h"
5+
#include <memory>
6+
7+
using namespace device::kunlun::kernel;
8+
9+
/**
 * @brief Rearrange (strided copy) kernel: reads elements of x via x's
 *        strides and writes them to y via y's strides.
 * @tparam BUFF_SIZE number of elements staged in local memory per batch
 * @tparam Tdata element type of the input and output tensors
 * @param y the output tensor (global memory)
 * @param x the input tensor (global memory)
 * @param shape tensor shape, array of _size_t of length ndim (global memory)
 * @param x_stride element strides of x, array of _ptrdiff_t of length ndim
 * @param y_stride element strides of y, array of _ptrdiff_t of length ndim
 * @param ndim number of tensor dimensions
 * @param total_size total number of elements to copy
 */
template <unsigned int BUFF_SIZE, typename Tdata>
__global__ void rearrangeKernel(
    Tdata *y,
    const Tdata *x,
    const void *shape,
    const void *x_stride,
    const void *y_stride,
    uint32_t ndim,
    uint32_t total_size) {

    int cid = core_id();
    int ncores = core_num();
    if (cid >= ncores) {
        return;
    }
    // Flat worker id / worker count across all clusters.
    int thread_id = ncores * cluster_id() + cid;
    int nthreads = ncores * cluster_num();

    // Per-core staging buffer plus local copies of the shape/stride arrays.
    // NOTE(review): the shape_lm/x_stride_lm/y_stride_lm arrays have a
    // runtime length (ndim) — presumably the XPU compiler supports
    // runtime-sized __local__ arrays; confirm against the toolchain.
    __local__ Tdata x_local[BUFF_SIZE];
    __local__ _size_t shape_lm[ndim];
    __local__ _ptrdiff_t x_stride_lm[ndim];
    __local__ _ptrdiff_t y_stride_lm[ndim];

    // Stage the metadata from global to local memory, then fence so the
    // async copies are visible before indexToOffset() reads them.
    GM2LM_ASYNC(shape, shape_lm, ndim * sizeof(_size_t));
    GM2LM_ASYNC(x_stride, x_stride_lm, ndim * sizeof(_ptrdiff_t));
    GM2LM_ASYNC(y_stride, y_stride_lm, ndim * sizeof(_ptrdiff_t));
    mfence();

    // Each worker handles contiguous chunks of up to BUFF_SIZE logical
    // indices per outer iteration.
    int len_per_loop = min(BUFF_SIZE, roundup_div(total_size, nthreads));

    for (int start = thread_id * len_per_loop; start < total_size; start += nthreads * len_per_loop) {
        int len = min(len_per_loop, total_size - start);
        // Gather: map each logical index to x's element offset and pull it
        // into the local buffer.
        for (int idx = start; idx < start + len; ++idx) {
            int in_idx = indexToOffset(idx, ndim, shape_lm, x_stride_lm);
            GM2LM_ASYNC(x + in_idx, x_local + idx - start, sizeof(Tdata));
        }
        // Wait for all gathers before scattering from the same buffer.
        mfence();
        // Scatter: map each logical index to y's element offset and write.
        for (int idx = start; idx < start + len; ++idx) {
            int out_idx = indexToOffset(idx, ndim, shape_lm, y_stride_lm);
            LM2GM_ASYNC(x_local + idx - start, y + out_idx, sizeof(Tdata));
        }
        // NOTE(review): cores of one cluster can run different numbers of
        // outer iterations (a core whose `start` passes total_size exits
        // early), so sync_cluster() here may be reached a divergent number
        // of times — confirm the runtime tolerates this.
        sync_cluster();
    }
}
64+
65+
namespace op::rearrange::kunlun {
66+
67+
// Backend-private state for Descriptor: the shared Kunlun handle internals
// plus the device workspace that holds the shape/stride arrays consumed by
// the kernel.
struct Descriptor::Opaque {
    std::shared_ptr<device::kunlun::Handle::Internal> internal;
    // Device buffer of RearrangeInfo::workspaceSize() bytes, allocated in
    // Descriptor::create(); freed here so the descriptor owns its lifetime.
    void *workspace;
    ~Opaque() {
        if (workspace) {
            xpu_free(workspace);
        }
    }
};
76+
77+
// Releases the backend-private state; ~Opaque frees the device workspace.
Descriptor::~Descriptor() {
    delete _opaque;
}
80+
81+
// Builds a rearrange descriptor: validates the tensor descriptors, reserves
// device (L3) memory for the shape/stride metadata, and hands both to a new
// Descriptor. On success *desc_ptr owns the result.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto info_result = RearrangeInfo::create(y_desc, x_desc);
    CHECK_RESULT(info_result);
    auto info = info_result.take();

    // Reserve L3 device memory sized for [shape | src_strides | dst_strides];
    // ownership passes to the Opaque below and is released in its destructor.
    void *workspace = nullptr;
    CHECK_KUNLUN(xpu_malloc(&workspace, info.workspaceSize(), XPU_MEM_L3));

    auto *opaque = new Opaque{
        reinterpret_cast<device::kunlun::Handle *>(handle)->internal(),
        workspace};
    *desc_ptr = new Descriptor(opaque, handle->device, handle->device_id, std::move(info));
    return INFINI_STATUS_SUCCESS;
}
104+
105+
// Dispatches rearrangeKernel for the given dtype. `workspace` is the device
// buffer laid out as [shape | src_strides | dst_strides] (already populated
// by the caller). Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported
// dtypes.
template <unsigned int BUFF_SIZE>
infiniStatus_t launchKernel(
    void *y,
    const void *x,
    void *workspace,
    size_t ndim,
    size_t total_size,
    infiniDtype_t dtype,
    kunlunStream_t stream) {

    // Re-derive the three sub-array pointers from the packed workspace;
    // this layout must match RearrangeInfo's workspace_size computation
    // and the copies done in Descriptor::calculate().
    __global_ptr__ size_t *d_shape = reinterpret_cast<__global_ptr__ size_t *>(workspace);
    __global_ptr__ ptrdiff_t *d_src_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_shape + ndim);
    __global_ptr__ ptrdiff_t *d_dst_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_src_strides + ndim);

// NOTE(review): <<<12, 64, stream>>> hard-codes the cluster/core counts —
// presumably the P800 topology; confirm against the handle's device info.
#define LAUNCH_KERNEL(Tdata) \
    rearrangeKernel<BUFF_SIZE, Tdata> \
        <<<12, 64, stream>>>( \
            reinterpret_cast<__global_ptr__ Tdata *>(y), \
            reinterpret_cast<__global_ptr__ const Tdata *>(x), \
            reinterpret_cast<__global_ptr__ void *>(d_shape), \
            reinterpret_cast<__global_ptr__ void *>(d_src_strides), \
            reinterpret_cast<__global_ptr__ void *>(d_dst_strides), \
            static_cast<uint32_t>(ndim), \
            static_cast<uint32_t>(total_size));

    switch (dtype) {
    case INFINI_DTYPE_F32:
        LAUNCH_KERNEL(float);
        break;
    case INFINI_DTYPE_BF16:
        LAUNCH_KERNEL(bfloat16_t);
        break;
    case INFINI_DTYPE_F16:
        LAUNCH_KERNEL(half);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL

    return INFINI_STATUS_SUCCESS;
}
147+
148+
// Executes the rearrange: stages the shape/stride metadata into the device
// workspace, then launches the strided-copy kernel on `stream`.
infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {

    const size_t ndim = _info.ndim();
    const size_t total_size = _info.nelements();
    const infiniDtype_t dtype = _info.dtype;

    // Carve the preallocated workspace into [shape | src_strides | dst_strides],
    // mirroring the layout launchKernel() expects.
    void *workspace = _opaque->workspace;
    __global_ptr__ size_t *shape_dev = reinterpret_cast<__global_ptr__ size_t *>(workspace);
    __global_ptr__ ptrdiff_t *src_strides_dev = reinterpret_cast<__global_ptr__ ptrdiff_t *>(shape_dev + ndim);
    __global_ptr__ ptrdiff_t *dst_strides_dev = reinterpret_cast<__global_ptr__ ptrdiff_t *>(src_strides_dev + ndim);

    // Upload host-side metadata; the copies and the kernel below are all
    // issued on the same stream.
    CHECK_KUNLUN(xpu_memcpy_async(shape_dev, _info.shape.data(), sizeof(size_t) * ndim, XPU_HOST_TO_DEVICE, stream));
    CHECK_KUNLUN(xpu_memcpy_async(src_strides_dev, _info.src_strides.data(), sizeof(ptrdiff_t) * ndim, XPU_HOST_TO_DEVICE, stream));
    CHECK_KUNLUN(xpu_memcpy_async(dst_strides_dev, _info.dst_strides.data(), sizeof(ptrdiff_t) * ndim, XPU_HOST_TO_DEVICE, stream));

    CHECK_STATUS(launchKernel<64>(y, x, workspace, ndim, total_size, dtype,
                                  reinterpret_cast<kunlunStream_t>(stream)));
    return INFINI_STATUS_SUCCESS;
}
173+
174+
} // namespace op::rearrange::kunlun

src/infiniop/ops/rearrange/operator.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
#ifdef ENABLE_MOORE_API
2121
#include "moore/rearrange_moore.h"
2222
#endif
23+
#ifdef ENABLE_KUNLUN_API
24+
#include "kunlun/rearrange_kunlun.h"
25+
#endif
2326

2427
__C infiniStatus_t infiniopCreateRearrangeDescriptor(
2528
infiniopHandle_t handle,
@@ -57,6 +60,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
5760
#endif
5861
#ifdef ENABLE_MOORE_API
5962
CREATE(INFINI_DEVICE_MOORE, moore);
63+
#endif
64+
#ifdef ENABLE_KUNLUN_API
65+
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
6066
#endif
6167
default:
6268
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -99,6 +105,9 @@ __C infiniStatus_t infiniopRearrange(
99105
#ifdef ENABLE_MOORE_API
100106
CALCULATE(INFINI_DEVICE_MOORE, moore);
101107
#endif
108+
#ifdef ENABLE_KUNLUN_API
109+
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
110+
#endif
102111

103112
default:
104113
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -138,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
138147
#ifdef ENABLE_MOORE_API
139148
DELETE(INFINI_DEVICE_MOORE, moore);
140149
#endif
150+
#ifdef ENABLE_KUNLUN_API
151+
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
152+
#endif
141153

142154
default:
143155
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

0 commit comments

Comments
 (0)