From 663cda72029818b003e0f9bd8845f948e38acd67 Mon Sep 17 00:00:00 2001
From: Bishal
Date: Mon, 9 Mar 2026 15:32:05 -0400
Subject: [PATCH] fix off-by-one in steady-state MFU calculation

The timing gate `if step > 10` excludes steps 0-10 (11 iterations) from
total_training_time. The MFU formula used (step - 10) as the timed step
count, overcounting by one. Fix to (step - 11).

This is a reporting-only change: no effect on training behavior,
schedules, or the time budget break condition.
---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 6994fb9b..61308681 100644
--- a/train.py
+++ b/train.py
@@ -614,7 +614,7 @@ def get_weight_decay(progress):
 # Final summary
 t_end = time.time()
 startup_time = t_start_training - t_start
-steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10) / total_training_time / H100_BF16_PEAK_FLOPS if total_training_time > 0 else 0
+steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 11) / total_training_time / H100_BF16_PEAK_FLOPS if total_training_time > 0 else 0
 peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
 
 print("---")