fix(kernel): 解决 attention 访存错误的问题

YdrMaster · YdrMaster · commit 1b200d0455f0 · 2024-02-02T12:53:06.000+08:00
Signed-off-by: YdrMaster <ydrml@hotmail.com>
diff --git a/src/04kernel/src/kernels/attention/cuda_kernel.cu b/src/04kernel/src/kernels/attention/cuda_kernel.cu
@@ -12,25 +12,27 @@ namespace refactor::kernel {
     //  seqLen: 此次处理的词数
     //   posId: 在 kv cache 中的位置
     //  attLen = pastSeqLen + seqLen
-    static __forceinline__ __device__ bool
-    causualMask(int tokenId, int seqLen,
-                int posId, int attLen) {
-        // tokenId ↓ |<---attLen---->|
-        //         0 | * * ... *     |
-        //         1 | * * ... * *   |
-        //         2 | * * ... * * * |
-        // seqLen: 3 |---------------|
-        return attLen + tokenId >= posId + seqLen;
-    }
+    struct AttentionCausualMask {
+        __forceinline__ __device__ bool
+        operator()(int tokenId, int seqLen,
+                   int posId, int attLen) {
+            // tokenId ↓ |<---attLen---->|
+            //         0 | * * ... *     |
+            //         1 | * * ... * *   |
+            //         2 | * * ... * * * |
+            // seqLen: 3 |---------------|
+            return attLen + tokenId >= posId + seqLen;
+        }
+    };
 
     // gridDim.x = batch * nHead
     // gridDim.y = seqLen
     // blockDim.x = min(1024, attLen)
     // sizeof(shared) = attLen * sizeof(float)
-    template<class T>
+    template<class T, class Mask>
     static __global__ void softmax(
         T *__restrict__ att,
-        bool (*mask)(int, int, int, int),
+        Mask mask,
         uint32_t attLen,
         uint32_t bufLen) {
         // 找到这个线程块对应的 attention 区域
@@ -161,7 +163,7 @@ namespace refactor::kernel {
                                   std::min(1024u, attLen),
                                   attLen * sizeof(float),
                                   stream>>>(
-                            att, causualMask, attLen, bufLen);
+                            att, AttentionCausualMask(), attLen, bufLen);
                         {
                             half alpha = 1, beta = 0;
                             cublasLtMatmul(
diff --git a/src/04kernel/test/kernels/attention/test_cuda.cpp b/src/04kernel/test/kernels/attention/test_cuda.cpp
@@ -2,6 +2,7 @@
 
 #include "../../../src/kernels/attention/cuda_kernel.hh"
 #include "hardware/device_manager.h"
+#include "kernel/cuda/functions.cuh"
 #include <gtest/gtest.h>
 #include <numeric>
 
@@ -43,6 +44,7 @@ TEST(kernel, AttentionCudaNoKvCache) {
         void *outputs[]{*oGpu};
         routine(res, *workspace, inputs, outputs);
     }
+    cuda::sync();
 }
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`
`3`	`3`	`#include "../../../src/kernels/attention/cuda_kernel.hh"`
`4`	`4`	`#include "hardware/device_manager.h"`
	`5`	`+#include "kernel/cuda/functions.cuh"`
`5`	`6`	`#include <gtest/gtest.h>`
`6`	`7`	`#include <numeric>`
`7`	`8`
`@@ -43,6 +44,7 @@ TEST(kernel, AttentionCudaNoKvCache) {`
`43`	`44`	`void *outputs[]{*oGpu};`
`44`	`45`	`routine(res, *workspace, inputs, outputs);`
`45`	`46`	`}`
	`47`	`+ cuda::sync();`
`46`	`48`	`}`
`47`	`49`
`48`	`50`	`#endif`