diff --git a/train.py b/train.py index 6994fb9b..61308681 100644 --- a/train.py +++ b/train.py @@ -614,7 +614,7 @@ def get_weight_decay(progress): # Final summary t_end = time.time() startup_time = t_start_training - t_start -steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10) / total_training_time / H100_BF16_PEAK_FLOPS if total_training_time > 0 else 0 +steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 11) / total_training_time / H100_BF16_PEAK_FLOPS if total_training_time > 0 else 0 peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024 print("---")