
Commit 299d375

Merge pull request #540 from karpathy/move/fused_classifier
move fused classifier
2 parents dd4191f + 90a7bfe commit 299d375

File tree

llmc/fused_classifier.cuh
train_gpt2.cu

2 files changed: +153 −133 lines changed


llmc/fused_classifier.cuh

+143
@@ -0,0 +1,143 @@
+/*
+Fused Classifier:
+- Forwards the Cross Entropy Loss
+- Never materializes the full normalized logits, only at the target label
+- (fusion) Also kicks off the backward pass, because everything is already loaded
+*/
+// llmc internal imports
+#include "cuda_common.h"
+#include "cuda_utils.cuh"
+
+// ----------------------------------------------------------------------------
+// CUDA kernels
+
+struct SoftmaxParams {
+    float Scale;
+    float Offset;
+};
+
+__device__ SoftmaxParams prepare_softmax_blockwide3(int64_t idx, const floatX* inp, int V, int P) {
+    // same but not float4
+    // one row of inp, i.e. inp[idx, :] of shape (V,)
+
+    const floatX* x = inp + idx * P;
+    float thread_maxval = -INFINITY;
+    float thread_sumval = 0.0f;
+    int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x;
+
+    // special-case loop to handle the unaligned elements at the end of the array
+    // this lets us skip the bounds check in the main loop below, which improves performance
+    while ((i+1)*x128::size > V) {
+        for(int k = 0; k < x128::size; ++k) {
+            if (i*x128::size+k >= V) {
+                break; // bounds checking against real V (rather than padded P)
+            }
+            float v = (float)x[i*x128::size+k];
+            float old_maxval = thread_maxval;
+            thread_maxval = fmaxf(thread_maxval, v);
+            thread_sumval *= expf((old_maxval - thread_maxval));
+            thread_sumval += expf(v - thread_maxval);
+        }
+        i -= blockDim.x;
+    }
+
+    // main loop for the bulk of the iterations (no bounds checking required!)
+    for (; i >= 0; i -= blockDim.x) {
+        x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop
+        for(int k = 0; k < x128::size; ++k) {
+            float v = (float)packed_x[k];
+            float old_maxval = thread_maxval;
+            thread_maxval = fmaxf(thread_maxval, v);
+            thread_sumval *= expf((old_maxval - thread_maxval));
+            thread_sumval += expf(v - thread_maxval);
+        }
+    }
+
+    // Block Max Reduction -> Maths -> Block Sum Reduction
+    float block_maxval = blockReduce<warpReduceMax>(thread_maxval, false, -INFINITY);
+    thread_sumval *= expf(thread_maxval - block_maxval);
+    float block_sumval = blockReduce<warpReduceSum>(thread_sumval);
+
+    // return the softmax parameters
+    return SoftmaxParams{1.f / block_sumval, block_maxval};
+}
+
+// will _update_ logits to logit gradients
+// uses template to decide whether to write logits and probs
+// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts
+template <bool WriteLogits = true, bool WriteProbs = false>
+__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS)
+                fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs,
+                                         const float dloss, const int* targets,
+                                         int B, int T, int V, int P) {
+    // note: idx is small enough that it easily fits into 32 bit;
+    // by making it a long here, we ensure that any offsets calculated with it (e.g., idx * P)
+    // are done in 64 bit
+    int64_t idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data
+    int ix = targets[idx];
+
+    // softmax (reading B * T * V, same logits read again below, hopefully still in cache)
+    SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P);
+
+    // calculate the probability needed for the loss and update (single-threaded)
+    if(threadIdx.x == 0) {
+        float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale;
+        losses[idx] = (floatX)(-logf(prob));
+    }
+
+    // calculate the gradients directly, saves bandwidth from probs during training
+    // but also supports writing probs for inference-only and debugging
+    const floatX* logits_vec = logits + idx * P;
+    for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) {
+        // this is the 2nd read of logits after the one in prepare_softmax2
+        // it will be overwritten by the logits gradients which is when we reduce cache persistence
+        x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs
+        x128 packed_probs;
+        for(int k = 0; k < x128::size; ++k) {
+            int element = i*x128::size + k;
+            float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale;
+            packed_probs[k] = (floatX)prob;
+            float indicator = (element == ix) ? 1.0f : 0.0f;
+            packed_logits_vec[k] = (floatX)((prob - indicator) * dloss);
+        }
+        if (WriteLogits){
+            // reduce cache persistence for the overwritten logits
+            // to maximise probability that logits remain in cache between prepare_softmax and here
+            store128cs(logits + idx * P + i * x128::size, packed_logits_vec);
+        }
+        if (WriteProbs) {
+            store128(probs + idx * P + i * x128::size, packed_probs);
+        }
+    }
+
+    // handle remaining elements after the last multiple of x128::size
+    // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements
+    int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size
+    for (int i = threadIdx.x + unaligned_start; i < V; i++) {
+        float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale;
+        float indicator = (i == ix) ? 1.0f : 0.0f;
+        float dlogit = (prob - indicator) * dloss;
+        if (WriteLogits){
+            __stcs(logits + idx * P + i, (floatX)dlogit);
+        }
+        if (WriteProbs) {
+            probs[idx * P + i] = (floatX)prob;
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// kernel launchers
+
+// replaces logits with logit gradients
+template <typename Type>
+void fused_classifier(Type* logits, Type* losses,
+                      const float dloss, const int* targets,
+                      int B, int T, int V, int P) {
+    NVTX_RANGE_FN();
+    const int block_size = 1024;
+    const int N = B * T;
+    const int grid_size = N;
+    fused_classifier_kernel5<<<grid_size, block_size>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
+    cudaCheck(cudaGetLastError());
+}
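
For reference, here is the per-row math that the new kernel fuses into a single pass, written as a host-side sketch in plain C. This is illustrative only (it is not part of the commit, and the function name reference_classifier_row is hypothetical); it mirrors the online-softmax update in prepare_softmax_blockwide3 and the (prob - indicator) * dloss gradient in fused_classifier_kernel5.

// reference sketch: per-row cross-entropy loss and in-place logit gradient,
// i.e. what fused_classifier_kernel5 computes for one (b, t) position
#include <math.h>

void reference_classifier_row(float* logits, float* loss, int target, float dloss, int V) {
    // online softmax: track a running max and rescale the running sum
    // whenever a new max is found (same recurrence as the kernel's thread loop)
    float maxval = -INFINITY, sumval = 0.0f;
    for (int i = 0; i < V; i++) {
        float old_maxval = maxval;
        maxval = fmaxf(maxval, logits[i]);
        sumval = sumval * expf(old_maxval - maxval) + expf(logits[i] - maxval);
    }
    float scale = 1.0f / sumval;   // plays the role of SoftmaxParams.Scale
    float offset = maxval;         // plays the role of SoftmaxParams.Offset
    // forward: loss = -log(softmax[target]); only the target's probability is needed
    *loss = -logf(expf(logits[target] - offset) * scale);
    // fused backward: dlogit_i = (softmax_i - 1{i == target}) * dloss, written in place
    for (int i = 0; i < V; i++) {
        float prob = expf(logits[i] - offset) * scale;
        float indicator = (i == target) ? 1.0f : 0.0f;
        logits[i] = (prob - indicator) * dloss;
    }
}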

train_gpt2.cu

+10 −133
@@ -10,11 +10,7 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage.
 #include <string_view>
 #include <sys/stat.h>
 #include <sys/types.h>
-#ifdef MULTI_GPU
-#include <mpi.h>
-#include <nccl.h>
-#endif
-// our own utilities
+// ----------- CPU utilities -----------
 // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck
 // defines: create_dir_if_not_exists, find_max_step
 #include "llmc/utils.h"
@@ -31,6 +27,7 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage.
 #include "llmc/logger.h"
 // defines: get_flops_promised
 #include "llmc/mfu.h"
+// ----------- GPU utilities -----------
 // defines:
 // WARP_SIZE, MAX_1024_THREADS_BLOCKS, CEIL_DIV, cudaCheck, PRECISION_MODE
 // NVTX_RANGE_FN
@@ -42,6 +39,7 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage.
 // defines: CUBLAS_LOWP, cublasCheck, cublaslt_workspace_size, cublaslt_workspace
 // defines: cublas_compute, cublaslt_handle, cublas_handle
 #include "llmc/cublas_common.h"
+// ----------- Layer implementations in CUDA -----------
 // defines: encoder_forward, encoder_backward
 #include "llmc/encoder.cuh"
 // defines: gelu_forward, gelu_backward_inplace
@@ -53,6 +51,13 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage.
 // defines: attention_forward, attention_backward
 #include "llmc/attention.cuh"
 #endif
+// defines: fused_classifier
+#include "llmc/fused_classifier.cuh"
+// ----------- Multi-GPU support -----------
+#ifdef MULTI_GPU
+#include <mpi.h>
+#include <nccl.h>
+#endif
 
 // ----------------------------------------------------------------------------
 // global var containing information about the GPU this process is running on
@@ -730,121 +735,6 @@ __global__ void global_norm_squared_kernel(float* out, const T* data, size_t cou
 }
 }
 
-struct SoftmaxParams {
-    float Scale;
-    float Offset;
-};
-
-__device__ SoftmaxParams prepare_softmax_blockwide3(int64_t idx, const floatX* inp, int V, int P) {
-    // same but not float4
-    // one row of inp, i.e. inp[idx, :] of shape (V,)
-
-    const floatX* x = inp + idx * P;
-    float thread_maxval = -INFINITY;
-    float thread_sumval = 0.0f;
-    int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x;
-
-    // special-case loop to handle the unaligned elements at the end of the array
-    // this lets us skip the bounds check in the main loop below, which improves performance
-    while ((i+1)*x128::size > V) {
-        for(int k = 0; k < x128::size; ++k) {
-            if (i*x128::size+k >= V) {
-                break; // bounds checking against real V (rather than padded P)
-            }
-            float v = (float)x[i*x128::size+k];
-            float old_maxval = thread_maxval;
-            thread_maxval = fmaxf(thread_maxval, v);
-            thread_sumval *= expf((old_maxval - thread_maxval));
-            thread_sumval += expf(v - thread_maxval);
-        }
-        i -= blockDim.x;
-    }
-
-    // main loop for the bulk of the iterations (no bounds checking required!)
-    for (; i >= 0; i -= blockDim.x) {
-        x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop
-        for(int k = 0; k < x128::size; ++k) {
-            float v = (float)packed_x[k];
-            float old_maxval = thread_maxval;
-            thread_maxval = fmaxf(thread_maxval, v);
-            thread_sumval *= expf((old_maxval - thread_maxval));
-            thread_sumval += expf(v - thread_maxval);
-        }
-    }
-
-    // Block Max Reduction -> Maths -> Block Sum Reduction
-    float block_maxval = blockReduce<warpReduceMax>(thread_maxval, false, -INFINITY);
-    thread_sumval *= expf(thread_maxval - block_maxval);
-    float block_sumval = blockReduce<warpReduceSum>(thread_sumval);
-
-    // return the softmax parameters
-    return SoftmaxParams{1.f / block_sumval, block_maxval};
-}
-
-// will _update_ logits to logit gradients
-// uses template to decide whether to write logits and probs
-// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts
-template <bool WriteLogits = true, bool WriteProbs = false>
-__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS)
-                fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs,
-                                         const float dloss, const int* targets,
-                                         int B, int T, int V, int P) {
-    // note: idx is small enough that it easily fits into 32 bit;
-    // by making it a long here, we ensure that any offsets calculated with it (e.g., idx * P)
-    // are done is 64 bit
-    int64_t idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data
-    int ix = targets[idx];
-
-    // softmax (reading B * T * V, same logits read again below, hopefully still in cache)
-    SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P);
-
-    // calculate the probability needed for the loss and update (single-threaded)
-    if(threadIdx.x == 0) {
-        float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale;
-        losses[idx] = (floatX)(-logf(prob));
-    }
-
-    // calculate the gradients directly, saves bandwidth from probs during training
-    // but also supports writing probs for inference-only and debugging
-    const floatX* logits_vec = logits + idx * P;
-    for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) {
-        // this is the 2nd read of logits after the one in prepare_softmax2
-        // it will be overwritten by the logits gradients which is when we reduce cache persistence
-        x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs
-        x128 packed_probs;
-        for(int k = 0; k < x128::size; ++k) {
-            int element = i*x128::size + k;
-            float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale;
-            packed_probs[k] = (floatX)prob;
-            float indicator = (element == ix) ? 1.0f : 0.0f;
-            packed_logits_vec[k] = (floatX)((prob - indicator) * dloss);
-        }
-        if (WriteLogits){
-            // reduce cache persistence for the overwritten logits
-            // to maximise probability that logits remain in cache between prepare_softmax and here
-            store128cs(logits + idx * P + i * x128::size, packed_logits_vec);
-        }
-        if (WriteProbs) {
-            store128(probs + idx * P + i * x128::size, packed_probs);
-        }
-    }
-
-    // handle remaining elements after the last multiple of x128::size
-    // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements
-    int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size
-    for (int i = threadIdx.x + unaligned_start; i < V; i++) {
-        float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale;
-        float indicator = (i == ix) ? 1.0f : 0.0f;
-        float dlogit = (prob - indicator) * dloss;
-        if (WriteLogits){
-            __stcs(logits + idx * P + i, (floatX)dlogit);
-        }
-        if (WriteProbs) {
-            probs[idx * P + i] = (floatX)prob;
-        }
-    }
-}
-
 // device functions and the kernel to cast data between types
 template<typename Td, typename Ts>
 __device__ Td cast_value(Ts val);
@@ -1050,19 +940,6 @@ void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scr
 cudaCheck(cudaGetLastError());
 }
 
-// replaces logits with logit gradients
-template <typename Type>
-void fused_classifier(Type* logits, Type* losses,
-                      const float dloss, const int* targets,
-                      int B, int T, int V, int P) {
-    NVTX_RANGE_FN();
-    const int block_size = 1024;
-    const int N = B * T;
-    const int grid_size = N;
-    fused_classifier_kernel5<<<grid_size, block_size>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
-    cudaCheck(cudaGetLastError());
-}
-
 template<typename T>
 void global_norm_squared(float* out, const T* values, size_t count) {
     const int block_size = 512;
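
For orientation, a hypothetical call site for the moved launcher is sketched below; the variable names (logits, losses, targets, Vp) are illustrative and not taken verbatim from train_gpt2.cu. The launcher deduces Type from the pointers, launches one 1024-thread block per (b, t) token, and overwrites the logits with their gradients in place, so the classifier needs no separate backward kernel.

// hypothetical call site (names illustrative):
//   logits  : device floatX, shape (B, T, Vp), output of the final matmul
//   losses  : device floatX, shape (B, T)
//   targets : device int, shape (B, T), token ids
//   V is the real vocab size, Vp the padded vocab size used by the matmuls
float dloss = 1.0f / (B * T); // gradient of the mean loss w.r.t. each per-token loss
fused_classifier(logits, losses, dloss, targets, B, T, V, Vp);
// logits now holds d(loss)/d(logits) and can feed the backward matmul directly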
