#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_handle.h"
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "rearrange_kunlun.h"
#include <memory>

using namespace device::kunlun::kernel;

/**
 * @brief Rearrange kernel: copies elements from x to y according to their strides
 * @tparam BUFF_SIZE number of elements staged in local memory per loop iteration
 * @tparam Tdata the data type of the input and output tensors
 * @param y the output tensor
 * @param x the input tensor
 * @param shape the shape of the tensors
 * @param x_stride the strides of the input tensor
 * @param y_stride the strides of the output tensor
 * @param ndim the number of dimensions
 * @param total_size the total number of elements
 */
template <unsigned int BUFF_SIZE, typename Tdata>
__global__ void rearrangeKernel(
    Tdata *y,
    const Tdata *x,
    const void *shape,
    const void *x_stride,
    const void *y_stride,
    uint32_t ndim,
    uint32_t total_size) {

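    // Flatten (cluster, core) into a global thread index so the copy
    // can be partitioned across every core of every cluster.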
    int cid = core_id();
    int ncores = core_num();
    if (cid >= ncores) {
        return;
    }
    int thread_id = ncores * cluster_id() + cid;
    int nthreads = ncores * cluster_num();

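    // Per-core local-memory staging buffer plus local copies of the shape/stride metadata.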
    __local__ Tdata x_local[BUFF_SIZE];
    __local__ _size_t shape_lm[ndim];
    __local__ _ptrdiff_t x_stride_lm[ndim];
    __local__ _ptrdiff_t y_stride_lm[ndim];

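    // Copy shape and strides from global memory into local memory, then
    // fence so the metadata is ready before offsets are computed.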
    GM2LM_ASYNC(shape, shape_lm, ndim * sizeof(_size_t));
    GM2LM_ASYNC(x_stride, x_stride_lm, ndim * sizeof(_ptrdiff_t));
    GM2LM_ASYNC(y_stride, y_stride_lm, ndim * sizeof(_ptrdiff_t));
    mfence();

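    // Each iteration handles at most BUFF_SIZE elements; distribute the
    // total work as evenly as possible across all threads.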
    int len_per_loop = min(BUFF_SIZE, roundup_div(total_size, nthreads));

    for (int start = thread_id * len_per_loop; start < total_size; start += nthreads * len_per_loop) {
        int len = min(len_per_loop, total_size - start);
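        // Gather: translate each logical index into a source offset and
        // stage the element in local memory.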
        for (int idx = start; idx < start + len; ++idx) {
            int in_idx = indexToOffset(idx, ndim, shape_lm, x_stride_lm);
            GM2LM_ASYNC(x + in_idx, x_local + idx - start, sizeof(Tdata));
        }
        mfence();
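        // Scatter: write the staged elements to their destination offsets in y.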
        for (int idx = start; idx < start + len; ++idx) {
            int out_idx = indexToOffset(idx, ndim, shape_lm, y_stride_lm);
            LM2GM_ASYNC(x_local + idx - start, y + out_idx, sizeof(Tdata));
        }
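        // Synchronize all cores in the cluster before starting the next chunk.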
        sync_cluster();
    }
}

namespace op::rearrange::kunlun {

struct Descriptor::Opaque {
    std::shared_ptr<device::kunlun::Handle::Internal> internal;
    void *workspace;
    ~Opaque() {
        if (workspace) {
            xpu_free(workspace);
        }
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto result = RearrangeInfo::create(y_desc, x_desc);
    CHECK_RESULT(result);
    auto info = result.take();

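    // Allocate a device workspace (L3 memory) that will hold the shape and
    // stride arrays consumed by the kernel.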
    void *workspace = nullptr;
    size_t workspace_size = info.workspaceSize();

    CHECK_KUNLUN(xpu_malloc(&workspace, workspace_size, XPU_MEM_L3));

    *desc_ptr = new Descriptor(
        new Opaque{
            reinterpret_cast<device::kunlun::Handle *>(handle)->internal(),
            workspace},
        handle->device,
        handle->device_id,
        std::move(info));
    return INFINI_STATUS_SUCCESS;
}

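/**
 * @brief Select the element type from dtype and launch rearrangeKernel on the stream
 * @tparam BUFF_SIZE number of elements staged in local memory per core
 * @param workspace device buffer holding the shape, source strides and destination strides
 */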
template <unsigned int BUFF_SIZE>
infiniStatus_t launchKernel(
    void *y,
    const void *x,
    void *workspace,
    size_t ndim,
    size_t total_size,
    infiniDtype_t dtype,
    kunlunStream_t stream) {

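    // Workspace layout: ndim shape entries, then ndim source strides,
    // then ndim destination strides.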
    __global_ptr__ size_t *d_shape = reinterpret_cast<__global_ptr__ size_t *>(workspace);
    __global_ptr__ ptrdiff_t *d_src_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_shape + ndim);
    __global_ptr__ ptrdiff_t *d_dst_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_src_strides + ndim);

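// Launch configuration: 12 clusters with 64 cores per cluster.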
#define LAUNCH_KERNEL(Tdata)                                           \
    rearrangeKernel<BUFF_SIZE, Tdata>                                   \
        <<<12, 64, stream>>>(                                           \
            reinterpret_cast<__global_ptr__ Tdata *>(y),                \
            reinterpret_cast<__global_ptr__ const Tdata *>(x),          \
            reinterpret_cast<__global_ptr__ void *>(d_shape),           \
            reinterpret_cast<__global_ptr__ void *>(d_src_strides),     \
            reinterpret_cast<__global_ptr__ void *>(d_dst_strides),     \
            static_cast<uint32_t>(ndim),                                \
            static_cast<uint32_t>(total_size));

    switch (dtype) {
    case INFINI_DTYPE_F32:
        LAUNCH_KERNEL(float);
        break;
    case INFINI_DTYPE_BF16:
        LAUNCH_KERNEL(bfloat16_t);
        break;
    case INFINI_DTYPE_F16:
        LAUNCH_KERNEL(half);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {

    size_t ndim = _info.ndim();
    size_t total_size = _info.nelements();
    infiniDtype_t dtype = _info.dtype;

    // Get workspace from opaque
    void *workspace = _opaque->workspace;
    __global_ptr__ size_t *d_shape = reinterpret_cast<__global_ptr__ size_t *>(workspace);
    __global_ptr__ ptrdiff_t *d_src_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_shape + ndim);
    __global_ptr__ ptrdiff_t *d_dst_strides = reinterpret_cast<__global_ptr__ ptrdiff_t *>(d_src_strides + ndim);

    // Copy shape, src_strides, dst_strides to device memory
    CHECK_KUNLUN(xpu_memcpy_async(d_shape, _info.shape.data(), sizeof(size_t) * ndim, XPU_HOST_TO_DEVICE, stream));
    CHECK_KUNLUN(xpu_memcpy_async(d_src_strides, _info.src_strides.data(), sizeof(ptrdiff_t) * ndim, XPU_HOST_TO_DEVICE, stream));
    CHECK_KUNLUN(xpu_memcpy_async(d_dst_strides, _info.dst_strides.data(), sizeof(ptrdiff_t) * ndim, XPU_HOST_TO_DEVICE, stream));

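    // Launch with a 64-element local staging buffer per core.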
    CHECK_STATUS(launchKernel<64>(y, x, workspace,
                                  ndim, total_size, dtype,
                                  reinterpret_cast<kunlunStream_t>(stream)));
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::rearrange::kunlun