
Commit c0d1b0d

Merge pull request #419 from InfiniTensor/issue/418
issue/418: Fix the error on p800 caused by handwritten operators referencing pointers in shared memory (sm)
2 parents 9ad23fa + 20488ee commit c0d1b0d

4 files changed: +17 −34 lines
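For context, the change removes a memcpy-based single-element load from shared memory and replaces every call site with a plain dereference or index. A minimal host-side C++ sketch of the two load styles, with ordinary pointers standing in for Kunlun's __shared_ptr__-qualified pointers, a placeholder 16-bit struct standing in for half/bfloat16_t, and the type check simplified to a size check; all names other than loadsm are illustrative only:

#include <cstdint>
#include <cstring>
#include <iostream>

// Stand-in for a 16-bit element type such as half or bfloat16_t (illustrative only).
struct half16 {
    std::uint16_t bits;
};

// Pattern removed by this commit: route small types through a byte-wise copy.
template <typename T>
inline T loadsm_copy(const T *p) {
    T v;
    if constexpr (sizeof(T) < sizeof(int)) {
        std::memcpy(&v, p, sizeof(T)); // copy bytes instead of a typed load
    } else {
        v = *p;
    }
    return v;
}

// Pattern adopted by this commit: let the compiler emit the load directly.
template <typename T>
inline T loadsm_direct(const T *p) {
    return *p;
}

int main() {
    half16 buf[4] = {{1}, {2}, {3}, {4}};
    std::cout << loadsm_copy(buf + 2).bits << " "
              << loadsm_direct(buf + 2).bits << "\n"; // both print 3
    return 0;
}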

src/infiniop/devices/kunlun/kunlun_kernel_common.h

Lines changed: 3 additions & 19 deletions
@@ -37,22 +37,6 @@ inline __device__ float lowerBitMask(int i) {
     return (1 << (i + 1)) - 1;
 }
 
-/**
- * @brief Load data from shared memory
- * @param p: pointer to shared memory
- * @return loaded value
- */
-template <typename T>
-__device__ inline T loadsm(__shared_ptr__ const T *p) {
-    T v;
-    if constexpr (std::is_same<T, half>::value
-                  || std::is_same<T, bfloat16_t>::value) {
-        __builtin_memcpy(&v, p, sizeof(T));
-    } else {
-        v = *p;
-    }
-    return v;
-}
 // Load len data from shared memory
 template <typename T>
 __device__ inline void loadsm(__shared_ptr__ const T *p, T *v, int len) {
@@ -89,7 +73,7 @@ inline __device__ T atomicAdd(__shared_ptr__ T *ptr, T value) {
 template <>
 inline __device__ half atomicAdd<half>(__shared_ptr__ half *ptr, half value) {
     ticket_lock_mix();
-    __half old = loadsm(ptr);
+    half old = *ptr;
     float of = __half2float(old);
     float vf = __half2float(value);
     float sumf = of + vf;
@@ -103,7 +87,7 @@ inline __device__ half atomicAdd<half>(__shared_ptr__ half *ptr, half value) {
 template <>
 inline __device__ bfloat16_t atomicAdd<bfloat16_t>(__shared_ptr__ bfloat16_t *ptr, bfloat16_t value) {
     ticket_lock_mix();
-    bfloat16_t old = loadsm(ptr);
+    bfloat16_t old = *ptr;
     float of = __bfloat162float(old);
     float vf = __bfloat162float(value);
     float sumf = of + vf;
@@ -122,7 +106,7 @@ inline __device__ bfloat16_t atomicAdd<bfloat16_t>(__shared_ptr__ bfloat16_t *pt
 template <typename T>
 inline __device__ T atomicMax(__shared_ptr__ T *ptr, T value) {
     ticket_lock_mix();
-    T old = loadsm(ptr);
+    T old = *ptr;
     if constexpr (std::is_same<T, bfloat16_t>::value) {
         float of = __bfloat162float(old);
         float vf = __bfloat162float(value);

src/infiniop/ops/causal_softmax/kunlun/kernel.h

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ __device__ void causalSoftmaxBlock(
         // height: 3 col_id->
         if (width + size_t(row_id) >= col + height) {
             if constexpr (std::is_same_v<Tdata, half>) {
-                y[col] = hexp(loadsm(x + col) - loadsm(&max_));
+                y[col] = hexp(x[col] - max_);
             } else if constexpr (std::is_same_v<Tdata, bfloat16_t>) {
                 y[col] = __float2bfloat16(exp(__bfloat162float(x[col]) - __bfloat162float(max_)));
             } else {
@@ -54,7 +54,7 @@ __device__ void causalSoftmaxBlock(
     // Apply softmax
     for (size_t col = core_id(); col < width; col += BLOCK_SIZE) {
         if (sum_ != 0) {
-            y[col] = to<Tdata>(to<Tcompute>(loadsm(y + col)) / sum_);
+            y[col] = to<Tdata>(to<Tcompute>(y[col]) / sum_);
         } else {
            y[col] = Tdata(0);
         }

src/infiniop/ops/rms_norm/kunlun/kernel.h

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@ __device__ void rmsnormBlock(
 
     // Copy contiguous x, w into local mem (load from shared memory safely)
     for (size_t i = core_id(); i < dim; i += BLOCK_SIZE) {
-        Tdata xi = loadsm(x + i);
-        Tweight wi = loadsm(w + i);
+        Tdata xi = x[i];
+        Tweight wi = w[i];
         y[i] = static_cast<Tdata>(to<Tcompute>(xi) * to<Tcompute>(wi) * rms);
     }
     sync_cluster();

src/infiniop/reduce/kunlun/reduce_kunlun.h

Lines changed: 10 additions & 11 deletions
@@ -13,20 +13,20 @@ __device__ inline Tcompute sumSquared(__shared_ptr__ const Tdata *data_ptr, size
     Tcompute ss = 0;
 
     for (size_t i = core_id(); i < count; i += BLOCK_SIZE) {
-        Tdata xi = loadsm(data_ptr + i);
+        Tdata xi = data_ptr[i];
         ss += to<Tcompute>(xi) * to<Tcompute>(xi);
     }
 
     __shared__ Tcompute temp_storage;
     if (core_id() == 0) {
-        temp_storage = 0;
+        temp_storage = to<Tcompute>(0.f);
     }
     sync_cluster();
 
     atomicAdd(&temp_storage, ss);
     sync_cluster();
 
-    return loadsm(&temp_storage);
+    return temp_storage;
 }
 
 // Sum(x) on contiguous data of length count
@@ -35,43 +35,42 @@ __device__ inline Tcompute sum(__shared_ptr__ const Tdata *data_ptr, size_t coun
     Tcompute ss = 0;
 
     for (size_t i = core_id(); i < count; i += BLOCK_SIZE) {
-        Tdata xi = loadsm(data_ptr + i);
+        Tdata xi = data_ptr[i];
         ss += to<Tcompute>(xi);
     }
 
     __shared__ Tcompute temp_storage;
     if (core_id() == 0) {
-        temp_storage = 0;
+        temp_storage = to<Tcompute>(0.f);
     }
     sync_cluster();
 
     atomicAdd(&temp_storage, ss);
     sync_cluster();
 
-    return loadsm(&temp_storage);
+    return temp_storage;
 }
 
 // Max(x) on contiguous data of length count
 template <unsigned int BLOCK_SIZE, typename Tdata>
 __device__ inline Tdata max(__shared_ptr__ const Tdata *data_ptr, size_t count) {
-    Tdata max_val = loadsm(data_ptr);
+    Tdata max_val = data_ptr[0];
 
     for (size_t i = core_id(); i < count; i += BLOCK_SIZE) {
-        // Tdata xi = loadsm(data_ptr + i);
-        Tdata xi = loadsm(data_ptr + i);
+        Tdata xi = data_ptr[i];
         max_val = fmax(max_val, to<Tdata>(xi));
     }
 
     __shared__ Tdata temp_storage;
     if (core_id() == 0) {
-        temp_storage = loadsm(data_ptr);
+        temp_storage = data_ptr[0];
     }
     sync_cluster();
 
     atomicMax(&temp_storage, max_val);
     sync_cluster();
 
-    return loadsm(&temp_storage);
+    return temp_storage;
 }
 
 } // namespace op::common_kunlun::reduce_op
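For readers unfamiliar with the reduction structure these hunks touch: each core accumulates a strided partial result, core 0 zero-initializes a shared accumulator, every core folds its partial sum in with atomicAdd, and after a cluster sync all cores read the shared value back (now via a plain read rather than loadsm). A rough host-side C++ analogue, with std::thread standing in for cluster cores and a mutex standing in for the shared-memory atomicAdd; the names below are illustrative only, not the library's API:

#include <cstddef>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Host-side analogue of sumSquared: each "core" accumulates a strided partial
// sum of squares, then folds it into one shared accumulator.
float sum_squared(const std::vector<float> &data, unsigned block_size) {
    float temp_storage = 0.0f; // plays the role of the __shared__ accumulator
    std::mutex lock;           // plays the role of atomicAdd on shared memory
    std::vector<std::thread> cores;
    for (unsigned core_id = 0; core_id < block_size; ++core_id) {
        cores.emplace_back([&, core_id] {
            float ss = 0.0f;
            for (std::size_t i = core_id; i < data.size(); i += block_size) {
                ss += data[i] * data[i];
            }
            std::lock_guard<std::mutex> guard(lock);
            temp_storage += ss;
        });
    }
    for (auto &t : cores) {
        t.join();
    }
    return temp_storage;
}

int main() {
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::cout << sum_squared(x, 2) << "\n"; // 1 + 4 + 9 + 16 = 30
    return 0;
}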
