NVIDIA-BioNeMo · gagank1 · Apr 4, 2026 · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026
@@ -177,6 +177,25 @@ python train_fsdp2.py \
 A final model suitable for uploading to the Hugging Face Hub can be exported at the end of training by setting
 `checkpoint.save_final_model=true`.
 
+## MFU Tracking
+
+Enable per-step Model FLOPs Utilization (MFU) logging during training by adding `log_mfu=true`:
+
+```bash
+torchrun --nproc_per_node=1 train_fsdp2.py --config-name encodon_1b log_mfu=true
+```
+
+This adds two metrics at each logging interval, emitted alongside existing metrics via WANDB and
+stdout:
+
+- `train/tflops_per_gpu` — achieved BF16 TFLOPS per GPU
+- `train/mfu_pct` — MFU as a percentage of the GPU's peak dense BF16 TFLOPS (omitted when the GPU model is not in the built-in peak-TFLOPS table)
+
+The FLOPs formula auto-detects model architecture from the model config (MHA, standard FFN,
+vocabulary size) and scales with the token count each rank processes (BSHD padding included).
+It therefore handles gradient accumulation, data parallelism, BSHD, and THD (sequence packing)
+without per-strategy code paths. The implementation lives in `perf_logger.py`.
+
 ## Developer Guide
 
 ### Running Tests

@@ -65,3 +65,4 @@ quant_stats_config:
 fp8_layers: null
 fp4_layers: null
 use_fp32_master_weights: null
+log_mfu: false
@@ -36,6 +36,69 @@
 PAD_TOKEN_ID = 3
 
 
+# Dense BF16 tensor core peak TFLOPS (without sparsity). Product pages often list
+# the 2x sparse number; dense = sparse / 2. Sources: NVIDIA datasheets for each GPU.
+#
+# Each key is matched as a case-insensitive substring of the CUDA device name by
+# _detect_peak_tflops_bf16(). Some keys are substrings of others ("H200" in "GH200",
+# "B200" in "GB200", "B300" in "GB300"); each such pair shares the same value, so
+# either match yields the same peak.
+# NOTE(review): a device name containing "L40S" also contains "L40" and would be
+# given the L40 figure -- confirm whether L40S needs its own entry.
+_GPU_PEAK_TFLOPS_BF16 = {
+    "H100": 989.0,
+    "H200": 989.0,
+    "A100": 312.0,
+    "A6000": 155.0,
+    "L40": 181.0,
+    "GH200": 989.0,
+    "B200": 2250.0,
+    "GB200": 2250.0,
+    "B300": 2500.0,
+    "GB300": 2500.0,
+}
+
+# Model types that use gated MLP (SwiGLU/GeGLU) with 3 projections vs. standard FFN with 2.
+# Compared against the config's "model_type" in _compute_per_token_flops(); any model
+# type not listed here is counted as a 2-projection FFN.
+_GATED_MLP_MODEL_TYPES = frozenset({"llama", "mistral", "qwen2"})
+
+
+def _detect_peak_tflops_bf16() -> tuple[float | None, str]:
+    """Look up the dense BF16 peak TFLOPS for the local GPU.
+
+    The name of CUDA device 0 is compared case-insensitively against the keys of
+    ``_GPU_PEAK_TFLOPS_BF16``. When several keys occur in the name (e.g. "H200" and
+    "GH200" in a GH200 device name), the longest key is used so the result does
+    not depend on the table's insertion order.
+
+    Returns:
+        ``(peak_tflops, device_name)``. ``peak_tflops`` is ``None`` when CUDA is
+        unavailable (``device_name`` is then ``"unknown"``) or when no table key
+        occurs in the device name.
+    """
+    if not torch.cuda.is_available():
+        return None, "unknown"
+    name = torch.cuda.get_device_name(0)
+    name_lower = name.lower()  # loop-invariant, so lower-case the name only once
+    matches = [key for key in _GPU_PEAK_TFLOPS_BF16 if key.lower() in name_lower]
+    if not matches:
+        return None, name
+    # Prefer the most specific (longest) key so that a short key which is a
+    # substring of a longer one cannot shadow it.
+    return _GPU_PEAK_TFLOPS_BF16[max(matches, key=len)], name
+
+
+def _compute_per_token_flops(model_config_dict: dict, seq_len: int) -> int:
+    """Return training FLOPs per token for a transformer (forward + backward = 3x forward).
+
+    First-principles matmul count per layer: Q/K/V/O projections (GQA-aware),
+    attention logits and values (the S^2 cost expressed per token as 4*S*q_dim),
+    and a 2- or 3-projection MLP (gated MLP detected via ``model_type``). The LM
+    head is added once when ``vocab_size`` is positive.
+
+    The caller multiplies the result by the number of tokens processed in the
+    step, so the per-step total follows each rank's share of tokens. The
+    attention term charges every token for ``seq_len`` positions; it is not
+    reduced for causal masking or for shorter sequences inside a packed batch.
+
+    Args:
+        model_config_dict: HF-style config dict. Requires ``hidden_size``,
+            ``num_attention_heads``, ``intermediate_size`` and ``num_hidden_layers``.
+            Optional: ``num_key_value_heads`` (defaults to ``num_attention_heads``),
+            ``head_dim`` (defaults to ``hidden_size // num_attention_heads``),
+            ``vocab_size`` (defaults to 0, i.e. no LM head) and ``model_type``.
+            Optional numeric keys that are present but ``None`` are treated as absent.
+        seq_len: Sequence length used for the attention term.
+
+    Returns:
+        Forward-plus-backward FLOPs per token as an integer.
+
+    Raises:
+        KeyError: If a required config key is missing.
+    """
+    h = model_config_dict["hidden_size"]
+    n_heads = model_config_dict["num_attention_heads"]
+    # `or` also covers configs that store an explicit None for an optional field.
+    n_kv = model_config_dict.get("num_key_value_heads") or n_heads
+    # Honor an explicit head_dim: some configs decouple it from hidden_size // n_heads.
+    # Without one, the attention width is hidden_size itself.
+    head_dim = model_config_dict.get("head_dim")
+    if head_dim:
+        q_dim = n_heads * head_dim
+    else:
+        head_dim, q_dim = h // n_heads, h
+    kv_dim = n_kv * head_dim
+    ffn = model_config_dict["intermediate_size"]
+    vocab = model_config_dict.get("vocab_size") or 0
+    num_layers = model_config_dict["num_hidden_layers"]
+    model_type = model_config_dict.get("model_type", "")
+    num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
+
+    per_layer = (
+        2 * h * q_dim  # Q projection
+        + 4 * h * kv_dim  # K + V projections (GQA-aware)
+        + 2 * q_dim * h  # O projection
+        + 4 * seq_len * q_dim  # attention logits + values (S^2 -> S per token)
+        + 2 * num_mlp_proj * h * ffn  # MLP (2 or 3 projections)
+    )
+    lm_head = 2 * h * vocab if vocab > 0 else 0
+    per_token_fwd = num_layers * per_layer + lm_head
+    return 3 * per_token_fwd
+
+
 class PerfLogger:
     """Performance logger for CodonFM training.
 
@@ -44,17 +107,39 @@ class PerfLogger:
     Args:
         dist_config: The distributed configuration.
         args: The Hydra arguments.
+        model_config_dict: Optional HF-style model config dict. When supplied together with
+            ``args.log_mfu`` set to True, the logger computes per-step Model FLOPs Utilization
+            (``train/mfu_pct``) and throughput (``train/tflops_per_gpu``) on each logging step.
     """
 
-    def __init__(self, dist_config: DistributedConfig, args: DictConfig):
+    def __init__(self, dist_config: DistributedConfig, args: DictConfig, model_config_dict: dict | None = None):
         """Initialize the logger."""
         self._dist_config = dist_config
         self._run_config = OmegaConf.to_container(args, resolve=True, throw_on_missing=True)
 
-        self.min_loss = torch.tensor(float("inf"), device=torch.device(f"cuda:{dist_config.local_rank}"))
+        self._device = torch.device(f"cuda:{dist_config.local_rank}")
+        self.min_loss = torch.tensor(float("inf"), device=self._device)
 
         self.logging_frequency = args.logger.frequency
 
+        # MFU setup: compute per-token FLOPs and peak TFLOPS once at init. Actual FLOPs per
+        # step are derived at log time from the tracked token count (padding included), which
+        # already reflects each rank's share under DP and sequence packing.
+        self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
+        self._per_token_flops = 0
+        self._peak_tflops: float | None = None
+        if self._log_mfu:
+            self._per_token_flops = _compute_per_token_flops(model_config_dict, args.dataset.max_seq_length)
+            self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
+            if dist_config.local_rank == 0:
+                logger.info(
+                    "MFU tracking enabled: GPU=%s, peak=%s TFLOPS BF16, per-token FLOPs=%.3e, seq_len=%d",
+                    gpu_name,
+                    f"{self._peak_tflops:.1f}" if self._peak_tflops else "unknown",
+                    float(self._per_token_flops),
+                    args.dataset.max_seq_length,
+                )
+
         metrics_dict = {
             "train/loss": torchmetrics.MeanMetric(),
             "train/grad_norm": torchmetrics.MeanMetric(),
@@ -66,9 +151,13 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
             "train/gpu_memory_allocated_max_gb": torchmetrics.MaxMetric(),
             "train/gpu_memory_allocated_mean_gb": torchmetrics.MeanMetric(),
         }
+        if self._log_mfu:
+            metrics_dict["train/tflops_per_gpu"] = torchmetrics.MeanMetric()
+            if self._peak_tflops is not None:
+                metrics_dict["train/mfu_pct"] = torchmetrics.MeanMetric()
 
         self.metrics = torchmetrics.MetricCollection(metrics_dict)
-        self.metrics.to(torch.device(f"cuda:{dist_config.local_rank}"))
+        self.metrics.to(self._device)
         self.previous_step_time = time.perf_counter()
 
         if self._dist_config.is_main_process():
@@ -79,7 +168,6 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
         self.quant_stats_config = args.quant_stats_config.enabled
 
         # Gradient accumulation tracking
-        self._device = torch.device(f"cuda:{dist_config.local_rank}")
         self.num_tokens = 0
         self.num_unpadded_tokens = torch.tensor(0, dtype=torch.int64, device=self._device)
         self.running_loss = torch.tensor(0.0, device=self._device)
@@ -155,6 +243,19 @@ def log_step(
                 self.metrics["train/tokens_per_second_per_gpu"].update(self.num_tokens / step_time)
                 self.metrics["train/unpadded_tokens_per_second_per_gpu"].update(self.num_unpadded_tokens / step_time)
 
+                if self._log_mfu:
+                    # PaLM/Megatron/MosaicML convention: count the configured-shape token budget
+                    # (input_ids.numel() = B * S_padded for BSHD, or total packed tokens for THD),
+                    # not attention_mask.sum(). The hardware executes matmuls over every position
+                    # regardless of masking, and this matches published MFU numbers.
+                    # num_tokens is accumulated over the grad-acc micro-batches of one optimizer
+                    # step (the last step in the logging window). step_time is per-step average.
+                    flops_per_step = self._per_token_flops * self.num_tokens
+                    tflops_per_gpu = flops_per_step / step_time / 1e12
+                    self.metrics["train/tflops_per_gpu"].update(tflops_per_gpu)
+                    if self._peak_tflops is not None:
+                        self.metrics["train/mfu_pct"].update(tflops_per_gpu / self._peak_tflops * 100.0)
+
                 memory_allocated = torch.cuda.memory_allocated() / (1024**3)
                 self.metrics["train/gpu_memory_allocated_max_gb"].update(memory_allocated)
                 self.metrics["train/gpu_memory_allocated_mean_gb"].update(memory_allocated)

@@ -163,7 +163,11 @@ def main(args: DictConfig) -> float | None:
             start_step = 0
             epoch = 0
 
-        perf_logger = PerfLogger(dist_config, args)
+        perf_logger = PerfLogger(
+            dist_config,
+            args,
+            model_config_dict=config.to_dict() if args.get("log_mfu", False) else None,
+        )
 
         # Training loop
         step = start_step

@@ -374,6 +374,25 @@ output = model(**inputs)
 
 - [ESM-2 Training with Accelerate](../esm2_accelerate_te/README.md)
 
+## MFU Tracking
+
+Enable per-step Model FLOPs Utilization (MFU) logging during training by adding `log_mfu=true`:
+
+```bash
+torchrun --nproc_per_node=2 train_fsdp2.py --config-name L1_3B log_mfu=true
+```
+
+This adds two metrics at each logging interval, emitted alongside existing metrics via WANDB and
+stdout:
+
+- `train/tflops_per_gpu` — achieved BF16 TFLOPS per GPU
+- `train/mfu_pct` — MFU as a percentage of the GPU's peak dense BF16 TFLOPS (omitted when the GPU model is not in the built-in peak-TFLOPS table)
+
+The FLOPs formula auto-detects model architecture from the HF config (MHA vs. GQA, gated vs.
+standard FFN, LM head presence) and scales with the token count each rank processes (BSHD padding
+included). It therefore handles data parallelism, context parallelism, BSHD, and THD (sequence
+packing) without per-strategy code paths. The implementation lives in `perf_logger.py`.
+
 ## Developer Guide
 
 ### Running Tests

@@ -12,6 +12,8 @@ use_torch_compile: false
 
 cp_size: 1
 
+log_mfu: false
+
 use_sequence_packing: false
 dataset:
   tokenizer_name: ${config_name_or_path}

@@ -32,18 +32,84 @@
 logger = logging.getLogger(__name__)
 
 
+# Dense BF16 tensor core peak TFLOPS (without sparsity). Product pages often list
+# the 2x sparse number; dense = sparse / 2. Sources: NVIDIA datasheets for each GPU.
+#
+# Each key is matched as a case-insensitive substring of the CUDA device name by
+# _detect_peak_tflops_bf16(). Some keys are substrings of others ("H200" in "GH200",
+# "B200" in "GB200", "B300" in "GB300"); each such pair shares the same value, so
+# either match yields the same peak.
+# NOTE(review): a device name containing "L40S" also contains "L40" and would be
+# given the L40 figure -- confirm whether L40S needs its own entry.
+_GPU_PEAK_TFLOPS_BF16 = {
+    "H100": 989.0,
+    "H200": 989.0,
+    "A100": 312.0,
+    "A6000": 155.0,
+    "L40": 181.0,
+    "GH200": 989.0,
+    "B200": 2250.0,
+    "GB200": 2250.0,
+    "B300": 2500.0,
+    "GB300": 2500.0,
+}
+
+# Model types that use gated MLP (SwiGLU/GeGLU) with 3 projections vs. standard FFN with 2.
+# Compared against the config's "model_type" in _compute_per_token_flops(); any model
+# type not listed here is counted as a 2-projection FFN.
+_GATED_MLP_MODEL_TYPES = frozenset({"llama", "mistral", "qwen2"})
+
+
+def _detect_peak_tflops_bf16() -> tuple[float | None, str]:
+    """Look up the dense BF16 peak TFLOPS for the local GPU.
+
+    The name of CUDA device 0 is compared case-insensitively against the keys of
+    ``_GPU_PEAK_TFLOPS_BF16``. When several keys occur in the name (e.g. "H200" and
+    "GH200" in a GH200 device name), the longest key is used so the result does
+    not depend on the table's insertion order.
+
+    Returns:
+        ``(peak_tflops, device_name)``. ``peak_tflops`` is ``None`` when CUDA is
+        unavailable (``device_name`` is then ``"unknown"``) or when no table key
+        occurs in the device name.
+    """
+    if not torch.cuda.is_available():
+        return None, "unknown"
+    name = torch.cuda.get_device_name(0)
+    name_lower = name.lower()  # loop-invariant, so lower-case the name only once
+    matches = [key for key in _GPU_PEAK_TFLOPS_BF16 if key.lower() in name_lower]
+    if not matches:
+        return None, name
+    # Prefer the most specific (longest) key so that a short key which is a
+    # substring of a longer one cannot shadow it.
+    return _GPU_PEAK_TFLOPS_BF16[max(matches, key=len)], name
+
+
+def _compute_per_token_flops(model_config_dict: dict, seq_len: int) -> int:
+    """Return training FLOPs per token for a transformer (forward + backward = 3x forward).
+
+    First-principles matmul count per layer: Q/K/V/O projections (GQA-aware),
+    attention logits and values (the S^2 cost expressed per token as 4*S*q_dim),
+    and a 2- or 3-projection MLP (gated MLP detected via ``model_type``). The LM
+    head is added once when ``vocab_size`` is positive.
+
+    The caller multiplies the result by the number of tokens processed in the
+    step, so the per-step total follows each rank's share of tokens. The
+    attention term charges every token for ``seq_len`` positions; it is not
+    reduced for causal masking or for shorter sequences inside a packed batch.
+
+    Args:
+        model_config_dict: HF-style config dict. Requires ``hidden_size``,
+            ``num_attention_heads``, ``intermediate_size`` and ``num_hidden_layers``.
+            Optional: ``num_key_value_heads`` (defaults to ``num_attention_heads``),
+            ``head_dim`` (defaults to ``hidden_size // num_attention_heads``),
+            ``vocab_size`` (defaults to 0, i.e. no LM head) and ``model_type``.
+            Optional numeric keys that are present but ``None`` are treated as absent.
+        seq_len: Sequence length used for the attention term.
+
+    Returns:
+        Forward-plus-backward FLOPs per token as an integer.
+
+    Raises:
+        KeyError: If a required config key is missing.
+    """
+    h = model_config_dict["hidden_size"]
+    n_heads = model_config_dict["num_attention_heads"]
+    # `or` also covers configs that store an explicit None for an optional field.
+    n_kv = model_config_dict.get("num_key_value_heads") or n_heads
+    # Honor an explicit head_dim: some configs decouple it from hidden_size // n_heads.
+    # Without one, the attention width is hidden_size itself.
+    head_dim = model_config_dict.get("head_dim")
+    if head_dim:
+        q_dim = n_heads * head_dim
+    else:
+        head_dim, q_dim = h // n_heads, h
+    kv_dim = n_kv * head_dim
+    ffn = model_config_dict["intermediate_size"]
+    vocab = model_config_dict.get("vocab_size") or 0
+    num_layers = model_config_dict["num_hidden_layers"]
+    model_type = model_config_dict.get("model_type", "")
+    num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
+
+    per_layer = (
+        2 * h * q_dim  # Q projection
+        + 4 * h * kv_dim  # K + V projections (GQA-aware)
+        + 2 * q_dim * h  # O projection
+        + 4 * seq_len * q_dim  # attention logits + values (S^2 -> S per token)
+        + 2 * num_mlp_proj * h * ffn  # MLP (2 or 3 projections)
+    )
+    lm_head = 2 * h * vocab if vocab > 0 else 0
+    per_token_fwd = num_layers * per_layer + lm_head
+    return 3 * per_token_fwd
+
+
 class PerfLogger:
     """Class to log performance metrics to stdout and wandb, and print final averaged metrics at the end of training.
 
     Args:
         dist_config: The distributed configuration.
         args: The arguments.
+        model_config_dict: Optional HF-style model config dict. When supplied together with
+            ``args.log_mfu`` set to True, the logger computes per-step Model FLOPs Utilization
+            (``train/mfu_pct``) and throughput (``train/tflops_per_gpu``) on each logging step.
 
     Attributes:
         min_loss: The minimum loss seen so far.
     """
 
-    def __init__(self, dist_config: DistributedConfig, args: DictConfig):
+    def __init__(self, dist_config: DistributedConfig, args: DictConfig, model_config_dict: dict | None = None):
         """Initialize the logger."""
         self._dist_config = dist_config
         self._run_config = OmegaConf.to_container(args, resolve=True, throw_on_missing=True)
@@ -53,6 +119,24 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
         self.logging_frequency = args.logger.frequency
         # Track whether to collect memory stats (disabled by default for max performance)
 
+        # MFU setup: compute per-token FLOPs and peak TFLOPS once at init. Actual FLOPs per
+        # step are derived at log time from the current batch's token count (padding included),
+        # which already reflects each rank's share under DP/CP and sequence packing.
+        self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
+        self._per_token_flops = 0
+        self._peak_tflops: float | None = None
+        if self._log_mfu:
+            self._per_token_flops = _compute_per_token_flops(model_config_dict, args.dataset.max_seq_length)
+            self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
+            if dist_config.local_rank == 0:
+                logger.info(
+                    "MFU tracking enabled: GPU=%s, peak=%s TFLOPS BF16, per-token FLOPs=%.3e, seq_len=%d",
+                    gpu_name,
+                    f"{self._peak_tflops:.1f}" if self._peak_tflops else "unknown",
+                    float(self._per_token_flops),
+                    args.dataset.max_seq_length,
+                )
+
         metrics_dict = {
             "train/loss": torchmetrics.MeanMetric(),
             "train/grad_norm": torchmetrics.MeanMetric(),
@@ -65,6 +149,10 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
             "train/gpu_memory_allocated_max_gb": torchmetrics.MaxMetric(),
             "train/gpu_memory_allocated_mean_gb": torchmetrics.MeanMetric(),
         }
+        if self._log_mfu:
+            metrics_dict["train/tflops_per_gpu"] = torchmetrics.MeanMetric()
+            if self._peak_tflops is not None:
+                metrics_dict["train/mfu_pct"] = torchmetrics.MeanMetric()
 
         self.metrics = torchmetrics.MetricCollection(metrics_dict)
         # We move metrics to a GPU device so we can use torch.distributed to aggregate them before logging.
@@ -124,6 +212,17 @@ def log_step(
                 self.metrics["train/unpadded_tokens_per_second_per_gpu"].update(num_unpadded_tokens / step_time)
                 self.metrics["train/total_unpadded_tokens_per_batch"].update(num_unpadded_tokens)
 
+                if self._log_mfu:
+                    # PaLM/Megatron/MosaicML convention: count the configured-shape token budget
+                    # (input_ids.numel() = B * S_padded for BSHD, or total packed tokens for THD),
+                    # not the attention-mask count. The hardware executes matmuls over every
+                    # position regardless of masking, and this matches published MFU numbers.
+                    flops_per_step = self._per_token_flops * num_tokens
+                    tflops_per_gpu = flops_per_step / step_time / 1e12
+                    self.metrics["train/tflops_per_gpu"].update(tflops_per_gpu)
+                    if self._peak_tflops is not None:
+                        self.metrics["train/mfu_pct"].update(tflops_per_gpu / self._peak_tflops * 100.0)
+
                 # Handle sequence packing for torchmetrics calculation.
                 if outputs.logits.dim() < 3:
                     outputs.logits = outputs.logits.unsqueeze(0)

@@ -156,7 +156,11 @@ def main(args: DictConfig) -> float | None:
         start_step = 0
         epoch = 0
 
-    perf_logger = PerfLogger(dist_config, args)
+    perf_logger = PerfLogger(
+        dist_config,
+        args,
+        model_config_dict=config.to_dict() if args.get("log_mfu", False) else None,
+    )
 
     # Training loop
     step = start_step

@@ -165,7 +165,11 @@ def main(args: DictConfig) -> float | None:
         start_step = 0
         epoch = 0
 
-    perf_logger = PerfLogger(dist_config, args)
+    perf_logger = PerfLogger(
+        dist_config,
+        args,
+        model_config_dict=config.to_dict() if args.get("log_mfu", False) else None,
+    )
 
     # Training loop
     step = start_step