#define FLOAT4(value) (reinterpret_cast<float4*>(&(value))[0])
#define HALF2(value) (reinterpret_cast<half2*>(&(value))[0])
#define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162*>(&(value))[0])
+#define LDST128BITS(value) (reinterpret_cast<float4*>(&(value))[0])

// -------------------------------------- FP32 --------------------------------------
// Warp Reduce Sum
@@ -123,7 +124,7 @@ __global__ void dot_prod_f16_f32_kernel(half* a, half* b, float* y, int N) {
  if (tid == 0) atomicAdd(y, prod);
}

-template<const int NUM_THREADS = 256>
+template<const int NUM_THREADS = 256/2>
__global__ void dot_prod_f16x2_f32_kernel(half* a, half* b, float* y, int N) {
  int tid = threadIdx.x;
  int idx = (blockIdx.x * NUM_THREADS + tid) * 2; // 2 half elements per thread
@@ -148,6 +149,38 @@ __global__ void dot_prod_f16x2_f32_kernel(half* a, half* b, float* y, int N) {
  if (tid == 0) atomicAdd(y, prod);
}

+template<const int NUM_THREADS = 256/8>
+__global__ void dot_prod_f16x8_pack_f32_kernel(half* a, half* b, float* y, int N) {
+  int tid = threadIdx.x;
+  int idx = (blockIdx.x * NUM_THREADS + tid) * 8; // 8 half elements per thread
+  constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;
+  __shared__ float reduce_smem[NUM_WARPS];
+  // per-thread temporary storage (addressable, .local space in PTX)
+  half pack_a[8], pack_b[8]; // 8 x 16 bits = 128 bits
+  LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]); // load 128 bits
+  LDST128BITS(pack_b[0]) = LDST128BITS(b[idx]); // load 128 bits
+  const half z = __float2half(0.0f);
+
+  half prod_f16 = z;
+  #pragma unroll
+  for (int i = 0; i < 8; i += 2) {
+    half2 v = __hmul2(HALF2(pack_a[i]), HALF2(pack_b[i]));
+    prod_f16 += (((idx + i) < N) ? (v.x + v.y) : z);
+  }
+
+  int warp = tid / WARP_SIZE;
+  int lane = tid % WARP_SIZE;
+  // perform a warp-synchronous reduction.
+  float prod = warp_reduce_sum_f16_f32<WARP_SIZE>(prod_f16);
+  // warp leaders store their partial sums to shared memory.
+  if (lane == 0) reduce_smem[warp] = prod;
+  __syncthreads(); // make sure all partial sums are in shared memory.
+  // the first warp computes the final sum.
+  prod = (lane < NUM_WARPS) ? reduce_smem[lane] : 0.0f;
+  if (warp == 0) prod = warp_reduce_sum_f32<NUM_WARPS>(prod);
+  if (tid == 0) atomicAdd(y, prod);
+}
+
// --------------------- PyTorch bindings for custom kernel -----------------------
#define STRINGFY(str) #str
#define TORCH_BINDING_COMMON_EXTENSION(func) \
@@ -159,8 +192,42 @@ if(((T).options().dtype() != (th_type))) { \
  throw std::runtime_error("values must be " #th_type); \
}

-#define CHECK_TORCH_TENSOR_SHAPE(T, S0) \
-if (((T).size(0) != (S0))) { throw std::runtime_error("Tensor size mismatch!"); }
+#define LANUCH_DOT_PROD_KERNEL(NT, packed_type, acc_type, element_type) \
+  dot_prod_##packed_type##_##acc_type##_kernel<(NT)><<<grid, block>>>( \
+    reinterpret_cast<element_type*>(a.data_ptr()), \
+    reinterpret_cast<element_type*>(b.data_ptr()), \
+    prod.data_ptr<float>(), N);
+
+#define DISPATCH_DOT_PROD_KERNEL(K, packed_type, acc_type, element_type, n_elements) \
+  const int NT = (K)/(n_elements); \
+  dim3 block(NT); \
+  dim3 grid((S)); \
+  switch (NT) \
+  { \
+    case 32: \
+      LANUCH_DOT_PROD_KERNEL(32, packed_type, acc_type, element_type) \
+      break; \
+    case 64: \
+      LANUCH_DOT_PROD_KERNEL(64, packed_type, acc_type, element_type) \
+      break; \
+    case 128: \
+      LANUCH_DOT_PROD_KERNEL(128, packed_type, acc_type, element_type) \
+      break; \
+    case 256: \
+      LANUCH_DOT_PROD_KERNEL(256, packed_type, acc_type, element_type) \
+      break; \
+    case 512: \
+      LANUCH_DOT_PROD_KERNEL(512, packed_type, acc_type, element_type) \
+      break; \
+    case 1024: \
+      LANUCH_DOT_PROD_KERNEL(1024, packed_type, acc_type, element_type) \
+      break; \
+    default: \
+      throw std::runtime_error( \
+        "only support (K)/(n_elements): 32/64/128/256/512/1024"); \
+      break; \
+  }
+

#define TORCH_BINDING_DOT_PROD(packed_type, acc_type, th_type, element_type, n_elements) \
torch::Tensor dot_prod_##packed_type##_##acc_type(torch::Tensor a, torch::Tensor b) { \
@@ -169,30 +236,49 @@ torch::Tensor dot_prod_##packed_type##_##acc_type(torch::Tensor a, torch::Tensor
  auto options = torch::TensorOptions().dtype(torch::kFloat32).device( \
    torch::kCUDA, 0); \
  auto prod = torch::zeros({1}, options); \
-  const int N = a.size(0); \
-  CHECK_TORCH_TENSOR_SHAPE(b, N) \
-  static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
-  const int NUM_BLOCKS = (N + 256 - 1) / 256; \
-  dim3 block(NUM_THREADS_PER_BLOCK); \
-  dim3 grid(NUM_BLOCKS); \
-  dot_prod_##packed_type##_##acc_type##_kernel< \
-    NUM_THREADS_PER_BLOCK><<<grid, block>>>( \
+  const int ndim = a.dim(); \
+  if (ndim != 2) { \
+    int N = 1; \
+    for (int i = 0; i < ndim; ++i) { N *= a.size(i); } \
+    dim3 block(256); \
+    dim3 grid(((N + 256 - 1) / 256) / (n_elements)); \
+    dot_prod_##packed_type##_##acc_type##_kernel< \
+      256><<<grid, block>>>( \
    reinterpret_cast<element_type*>(a.data_ptr()), \
    reinterpret_cast<element_type*>(b.data_ptr()), \
    prod.data_ptr<float>(), N); \
+  } else { \
+    const int S = a.size(0); \
+    const int K = a.size(1); \
+    const int N = S * K; \
+    if ((K/(n_elements)) <= 1024) { \
+      DISPATCH_DOT_PROD_KERNEL(K, packed_type, acc_type, element_type, n_elements) \
+    } else { \
+      int N = 1; \
+      for (int i = 0; i < ndim; ++i) { N *= a.size(i); } \
+      dim3 block(256); \
+      dim3 grid(((N + 256 - 1) / 256) / (n_elements)); \
+      dot_prod_##packed_type##_##acc_type##_kernel< \
+        256><<<grid, block>>>( \
+          reinterpret_cast<element_type*>(a.data_ptr()), \
+          reinterpret_cast<element_type*>(b.data_ptr()), \
+          prod.data_ptr<float>(), N); \
+    } \
+  } \
  return prod; \
}

// packed_type, acc_type, th_type, element_type, n_elements_per_pack
-TORCH_BINDING_DOT_PROD(f32,   f32, torch::kFloat32, float, 1)
-TORCH_BINDING_DOT_PROD(f32x4, f32, torch::kFloat32, float, 4)
-TORCH_BINDING_DOT_PROD(f16,   f32, torch::kHalf,    half,  1)
-TORCH_BINDING_DOT_PROD(f16x2, f32, torch::kHalf,    half,  2)
-
+TORCH_BINDING_DOT_PROD(f32,        f32, torch::kFloat32, float, 1)
+TORCH_BINDING_DOT_PROD(f32x4,      f32, torch::kFloat32, float, 4)
+TORCH_BINDING_DOT_PROD(f16,        f32, torch::kHalf,    half,  1)
+TORCH_BINDING_DOT_PROD(f16x2,      f32, torch::kHalf,    half,  2)
+TORCH_BINDING_DOT_PROD(f16x8_pack, f32, torch::kHalf,    half,  8)

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  TORCH_BINDING_COMMON_EXTENSION(dot_prod_f32_f32)
  TORCH_BINDING_COMMON_EXTENSION(dot_prod_f32x4_f32)
  TORCH_BINDING_COMMON_EXTENSION(dot_prod_f16_f32)
  TORCH_BINDING_COMMON_EXTENSION(dot_prod_f16x2_f32)
+  TORCH_BINDING_COMMON_EXTENSION(dot_prod_f16x8_pack_f32)
}
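
For reference, a minimal Python usage sketch (not part of this diff; the source file name, module name, and build flags are assumptions) that JIT-builds the extension and calls the new f16x8 packed binding:

import torch
from torch.utils.cpp_extension import load

# Hypothetical build step; "dot_product.cu" is an assumed name for the file this diff touches.
lib = load(name="dot_prod_lib", sources=["dot_product.cu"],
           extra_cuda_cflags=["-O3"])

# Contiguous fp16 inputs; the f16x8 kernel reads 8 halves (128 bits) per thread.
a = torch.randn(1024, 1024, device="cuda", dtype=torch.half).contiguous()
b = torch.randn(1024, 1024, device="cuda", dtype=torch.half).contiguous()

out = lib.dot_prod_f16x8_pack_f32(a, b)  # 1-element fp32 CUDA tensor
ref = (a.float() * b.float()).sum()
print(out.item(), ref.item())            # should agree up to fp16 rounding error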