@@ -1407,7 +1407,7 @@ void attention_forward(floatX* out, floatX* qkvr, floatX* att,
 
     // multiply all elements of preatt elementwise by scale
     float scale = 1.0 / sqrtf(HS);
-    int grid_size = CEIL_DIV(B * NH * T * 32, block_size);
+    int grid_size = CEIL_DIV(B * NH * T * WARP_SIZE, block_size);
     softmax_forward_kernel5<<<grid_size, block_size>>>(att, scale, preatt, B * NH, T);
 
     // new approach: first cuBLAS another batched matmul
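For context on this hunk: the kernel launches one warp per attention row, so the grid has to cover B * NH * T warps; swapping the literal 32 for WARP_SIZE just spells that out. Below is a minimal standalone sketch of the launch arithmetic, assuming the usual ceiling-division macro and a warp size of 32; B, NH, T and block_size are made-up example values, not from the commit.

// sketch only: illustrates the grid-size arithmetic, not code from this commit
#include <stdio.h>

#define WARP_SIZE 32
#define CEIL_DIV(m, n) (((m) + (n) - 1) / (n))

int main(void) {
    int B = 4, NH = 12, T = 1024;   // example batch size, head count, sequence length
    int block_size = 512;           // threads per block; a multiple of WARP_SIZE
    // one warp per (b, nh, t) row of the attention matrix => B * NH * T warps in total
    int grid_size = CEIL_DIV(B * NH * T * WARP_SIZE, block_size);
    printf("launch %d blocks of %d threads\n", grid_size, block_size);
    return 0;
}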
@@ -1683,10 +1683,8 @@ void fill_in_parameter_sizes(size_t* param_sizes, size_t* param_sizeof, GPT2Conf
 // allocate memory for the parameters and point the individual tensors to the right places
 void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elements, size_t* param_sizeof) {
     // calculate the total number of parameters and bytes across all tensors
-    size_t num_parameters = 0;
     size_t num_parameters_bytes = 0;
     for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) {
-        num_parameters += param_elements[i];
         num_parameters_bytes += param_elements[i] * param_sizeof[i];
     }
     // malloc all parameters all at once on the device
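A side note on why num_parameters drops out here: with mixed-precision parameter tensors only the byte total matters for the single device allocation, since each tensor can have a different element size. A rough sketch of the allocate-once-and-point pattern that malloc_and_point_parameters follows; the names, the tensor count, and the omitted error checking are simplifications of mine, not the file's actual code.

// sketch only: one cudaMalloc sized in bytes, then tensors carved out of it
#include <cuda_runtime.h>
#include <stddef.h>

#define NUM_TENSORS 3

void* alloc_and_point(char* tensor_ptrs[NUM_TENSORS],
                      const size_t elements[NUM_TENSORS],
                      const size_t elem_sizeof[NUM_TENSORS]) {
    size_t total_bytes = 0;
    for (int i = 0; i < NUM_TENSORS; i++) {
        total_bytes += elements[i] * elem_sizeof[i];  // bytes, not element counts
    }
    void* block = NULL;
    cudaMalloc(&block, total_bytes);                  // error checking omitted in this sketch
    char* cursor = (char*)block;
    for (int i = 0; i < NUM_TENSORS; i++) {
        tensor_ptrs[i] = cursor;                      // tensor i starts here
        cursor += elements[i] * elem_sizeof[i];
    }
    return block;
}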
@@ -2433,7 +2431,7 @@ float multi_gpu_cpu_float_sum(float value) {
 
 // Averages out the loss and gradients across all GPUs. No-op when multi-GPU is disabled.
 // todo - this version only works if all the parameters are the same size (floatX)
-void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) {
+void gpt2_multi_gpu_grad_reduce(GPT2* model, MultiGpuConfig* multi_gpu_config) {
 #ifdef MULTI_GPU
     NVTX_RANGE_FN();
     if (multi_gpu_config->num_processes == 1) { return; }
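The new name describes what the function actually does: a gradient reduction across ranks rather than an accumulation. A minimal sketch of the two NCCL collectives it chooses between, under assumptions that are mine rather than the commit's: plain float gradients, an evenly divisible parameter count, ncclAvg available (NCCL 2.10+), and comm/rank/world passed in directly instead of via MultiGpuConfig.

// sketch only: the reduce step for plain data parallelism vs. ZeRO stage 1
#include <cuda_runtime.h>
#include <nccl.h>
#include <stddef.h>

void grad_reduce_sketch(float* grads, size_t n, int zero_stage,
                        ncclComm_t comm, int rank, int world, cudaStream_t stream) {
    if (world == 1) { return; }  // single process => nothing to reduce
    if (zero_stage == 0) {
        // every rank ends up with the full, averaged gradient vector (in place)
        ncclAllReduce(grads, grads, n, ncclFloat, ncclAvg, comm, stream);
    } else {
        // ZeRO-1: each rank keeps only the averaged gradients of its own shard
        size_t shard = n / world;
        ncclReduceScatter(grads, grads + (size_t)rank * shard, shard,
                          ncclFloat, ncclAvg, comm, stream);
    }
}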
@@ -2490,12 +2488,12 @@ float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, fl
     // repurposing this buffer (which isn't needed now) to write grad norm into it
     float* grad_norm_squared = (float*)model->acts.output;
     if (multi_gpu_config->zero_stage == 1) {
-        // ^1 because of the ncclReduceScatter() in gpt2_multi_gpu_accumulate,
+        // ^1 because of the ncclReduceScatter() in gpt2_multi_gpu_grad_reduce,
         // grads_memory only contains the averaged gradients at the local shard
         // so we only calculate the grad norm at the grads_memory belonging to the local shard
         global_norm_squared(grad_norm_squared, grads_memory + shard_offset, shard_num_parameters);
     } else {
-        // the ncclAllReduce() in gpt2_multi_gpu_accumulate has averaged the gradients across all GPUs
+        // the ncclAllReduce() in gpt2_multi_gpu_grad_reduce has averaged the gradients across all GPUs
         // so each GPU can compute the squared norm over the whole grad vector, with no added comms needed
         global_norm_squared(grad_norm_squared, grads_memory, model->num_parameters);
     }
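The shard-only norm in the ZeRO-1 branch works because the squared L2 norm is additive over disjoint shards; the per-rank pieces only need one more scalar reduction, which is presumably what a helper like multi_gpu_cpu_float_sum above exists for. A tiny CPU-only check of that additivity, purely illustrative:

// illustrative only: ||g||^2 over the full vector equals the sum of the shard norms
#include <stdio.h>

int main(void) {
    float g[6] = {1.0f, -2.0f, 3.0f, 0.5f, -1.5f, 2.0f};  // pretend gradient vector
    int world = 2, shard = 3;                             // 2 ranks, 3 elements each
    float full = 0.0f, shard_sum = 0.0f;
    for (int i = 0; i < 6; i++) { full += g[i] * g[i]; }
    for (int r = 0; r < world; r++) {
        float local = 0.0f;                               // what one rank would compute
        for (int i = 0; i < shard; i++) { local += g[r * shard + i] * g[r * shard + i]; }
        shard_sum += local;                               // in training: a cross-rank sum
    }
    printf("full = %f, summed shards = %f\n", full, shard_sum);
    return 0;
}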
@@ -2583,7 +2581,7 @@ float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, fl
     return grad_norm_cpu;
 }
 
-void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config)
+void gpt2_multi_gpu_param_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config)
 {
 #ifdef MULTI_GPU
     if (multi_gpu_config->num_processes == 1) { return; } // 1 process => noop
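Likewise, the gather rename spells out the other half of ZeRO-1: once the optimizer has updated only the local parameter shard, the full parameter vector is rebuilt on every rank with an all-gather. A companion sketch under the same assumptions as the reduce sketch above:

// sketch only: in-place all-gather of parameter shards after the sharded update
#include <cuda_runtime.h>
#include <nccl.h>
#include <stddef.h>

void param_gather_sketch(float* params, size_t n,
                         ncclComm_t comm, int rank, int world, cudaStream_t stream) {
    if (world == 1) { return; }      // single process => noop
    size_t shard = n / world;        // assumes n divides evenly across ranks
    // rank r contributes the slice it updated, starting at r * shard
    ncclAllGather(params + (size_t)rank * shard, params, shard, ncclFloat, comm, stream);
}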
@@ -3160,7 +3158,7 @@ int main(int argc, char *argv[]) {
         // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced
         model.mean_loss = lossf;
         // update the parameters
-        gpt2_multi_gpu_accumulate(&model, &multi_gpu_config);
+        gpt2_multi_gpu_grad_reduce(&model, &multi_gpu_config);
         // learning rate schedule: warmup linearly to max LR, then cosine decay to LR * final_learning_rate_frac
         float step_learning_rate = learning_rate;
         if (step < warmup_iterations) {
@@ -3175,7 +3173,7 @@ int main(int argc, char *argv[]) {
         }
         // update the model parameters
         float grad_norm = gpt2_update(&model, step_learning_rate, 0.9f, 0.95f, 1e-8f, weight_decay, 1.0f, step+1, &multi_gpu_config);
-        gpt2_multi_gpu_gather(&model, &multi_gpu_config);
+        gpt2_multi_gpu_param_gather(&model, &multi_gpu_config);
         // zero out the gradients for the next iteration
         gpt2_zero_grad(&model);
         cudaCheck(cudaEventRecord(end));