Closed
Changes from all commits
131 commits
ae03494
[TorchComms] add testing badge at experiments readme (#2010)
mori360 Nov 10, 2025
f4514ef
[compiler toolkit] specify passes through config (#2006)
yiming0416 Nov 10, 2025
02990b0
[simplefsdp] fix region ac in zero2-style FSDP (#1970)
ruisizhang123 Nov 10, 2025
fddd9eb
[SimpleFSDP] Add typing to simple_fsdp.py (#2001)
fegin Nov 11, 2025
e37f83f
[Full DTensor][Reland] Add full_dtensor flag (#2013)
fegin Nov 11, 2025
20fcfd7
set pg names (#1986)
tushar00jain Nov 11, 2025
11d73a2
Fix the error message of maybe_enable_async_tp() (#2011)
fegin Nov 11, 2025
f5d2b18
Add dry run mode (#2012)
fegin Nov 11, 2025
edbf349
[easy] [compiler toolkit] Clean up unused function (#2014)
yiming0416 Nov 11, 2025
2f9b44d
Run Torchtitan ROCm workflow on cron schedule & push to Main branch o…
akashveramd Nov 12, 2025
55c63c1
Revert PR-2016 & Redo "Run Torchtitan ROCm workflow on cron schedule …
akashveramd Nov 12, 2025
cbfb8e1
[compiler toolkit] Add tests and scripts for numerics check (#2015)
yiming0416 Nov 12, 2025
4b2b31c
Add .claude to .gitignore (#2026)
fegin Nov 13, 2025
ce1c0fc
Fix dry run mode (#2027)
fegin Nov 13, 2025
e7ee95a
[Compiler Toolkit] Make compiler toolkit work with checkpoint (#2030)
fegin Nov 13, 2025
23c993c
[Flux] Update integration test badge in README.md (#2019)
rm-wu Nov 13, 2025
028a455
Print device and stride when print module (#2045)
BoyuanFeng Nov 14, 2025
d9bdfbb
[SimpleFSDP] add manual bucketing pass (#1881)
ruisizhang123 Nov 16, 2025
22e959a
Add export_dtype parameter to `convert_to_hf` function (#2041)
idoh Nov 17, 2025
3819737
[compiler toolkit] Port joint_ac_pass from simplefsdp (#2051)
yiming0416 Nov 18, 2025
bfdc974
[compiler toolkit] Port manual bucketing from SimpleFSDP experiment (…
yiming0416 Nov 18, 2025
4a5fa99
Re:Run Torchtitan ROCm workflow on cron schedule & push to Main branc…
akashveramd Nov 19, 2025
c8ebd7a
Add a loss comparison script (#2029)
fegin Nov 19, 2025
605a9a1
Fix integration test gpu_arch_type field (#2060)
yiming0416 Nov 19, 2025
f541d91
[compiler toolkit] Add Trainer subclass for compiler toolkit (#2064)
yiming0416 Nov 19, 2025
8bf2265
Let loss_compare.py check the repo cleaness (#2062)
fegin Nov 20, 2025
f5e3a84
CUDAGraph support for SimpleFSDP and TP (#2050)
BoyuanFeng Nov 20, 2025
d167a20
compiler_toolkit: fix args access (#2067)
crcrpar Nov 20, 2025
58fa181
3outeille/transformers backend (Dense model only) (#2048)
3outeille Nov 20, 2025
f8fa21e
adding variable length attention to llama3 8b (#2000)
liangel-02 Nov 21, 2025
e1f7f31
remove scatter_add in MoE implementation (#1974)
garrett361 Nov 22, 2025
ad9f188
Update transformers backend name (#2075)
3outeille Nov 23, 2025
c70310c
Enhance loss_compare.py: Add Import/Export Options and Enable CI Comp…
fegin Nov 24, 2025
7e1edb6
Print out the version number (#2083)
fegin Nov 24, 2025
7e10d60
Autoparallel as an experiment in main (#2054)
xmfan Nov 25, 2025
607c70d
skip varlen integration test on rocm (#2085)
liangel-02 Nov 25, 2025
d0393b3
[Local Tensor] Replace dry_run.py with fake mode implementation (#2057)
fegin Nov 25, 2025
1b9cfda
add varlen attention for qwen 3 (#2084)
liangel-02 Nov 25, 2025
cbdb311
[FLUX] Add FLUX inference test in CI (#1969)
wwwjn Nov 25, 2025
befb7ae
Improve logging by formatting the dict as JSON. (#2094)
rakkit Dec 1, 2025
b39377f
add all SDPA backends to op_sac_save_list (#2095)
rakkit Dec 1, 2025
53e949c
modify save list for varlen attn (#2082)
liangel-02 Dec 2, 2025
571ce7c
Make sure log after distributed initialized. (#2102)
CptGit Dec 3, 2025
b3da1a2
[mxfp8] [docs] [BE] add MXFP8 usage documentation and benchmarks (#2096)
danielvegamyhre Dec 3, 2025
8d020cc
Mark input tokens to routed experts as dynamic to avoid a recompile (…
xmfan Dec 3, 2025
341b155
fix mxfp8 loss image (#2104)
danielvegamyhre Dec 3, 2025
1168f9e
Update hf_assets_path for llama4 (#2110)
H-Huang Dec 4, 2025
e98ae99
Enables parsing of --compile.components through CLI (#2115)
syed-ahmed Dec 5, 2025
303f284
fix `ForgeEngine` compatibility issue with (#2121)
JenniferWang Dec 7, 2025
575674a
Remove the hack for SAC + FlexAttention (#2118)
fegin Dec 8, 2025
b41832a
Add warning to run_tests (#2123)
H-Huang Dec 9, 2025
d192411
[compiler toolkit] Disable CUDAGraph integration test (#2127)
yiming0416 Dec 9, 2025
1ebd914
Add CI for Autoparallel experiment llama3 on 4 GPUs (#2105)
xmfan Dec 9, 2025
f1d41a1
Support rope cache indexing using positions (#2112)
acisseJZhong Dec 9, 2025
f3f2e8f
[forge] allow torchforges to set checkpoint base folder (#2131)
rakkit Dec 9, 2025
fbafd44
Rename auto_parallel experiment to autoparallel (#2128)
xmfan Dec 9, 2025
a632855
PyTorch depends on psutil (#2132)
fegin Dec 11, 2025
4389efd
Remove caching for attention masks (#2117)
wwwjn Dec 11, 2025
669845f
Clarify contribution guidelines. (#2134)
dcci Dec 12, 2025
fcc5643
Enable PP and EP overlap for MoE (#1721)
H-Huang Dec 12, 2025
7a398ea
Fix apply_compile called multiple times in PP initialization (#2135)
xmfan Dec 12, 2025
64dc922
Enable static type checking with Pyrefly (#2136)
rchen152 Dec 12, 2025
995154f
[Autoparallel] Add local_map variant of DSv3 and 2D mesh AP (#2129)
xmfan Dec 13, 2025
9bc50ea
Implement ciflow/rocm on Torchtitan (#2114)
akashveramd Dec 13, 2025
2aac20a
[MoE] Add node limited routing support (#2111)
shuhuayu Dec 14, 2025
c1f4e94
Upgrade GitHub Actions to latest versions (#2152)
salmanmkc Dec 14, 2025
f3748d8
Upgrade GitHub Actions for Node 24 compatibility (#2151)
salmanmkc Dec 14, 2025
c283a84
Improve the loss_compare.sh logic (#2143)
fegin Dec 15, 2025
64997d2
[GPT-OSS] Add HF state dict adapter to support loading from HF checkp…
shuhuayu Dec 15, 2025
c08fa57
Add local built pytorch path for pyrefly (#2155)
fegin Dec 15, 2025
e36d027
Run vLLM inference using torchtitan model definition (single GPU) (#2…
wwwjn Dec 16, 2025
f64bbad
[RELAND] Let CUDA and ROCm read different loss result (#2157)
fegin Dec 16, 2025
183a0d2
Use new DeviceMesh unflatten to rewrite parallel_dims (#1660)
fegin Dec 17, 2025
36a4b69
Integrate DeepEP to torchtitan (#2107)
elfiegg Dec 18, 2025
4438764
Fix pypa/gh-action-pypi-publish version to use SHA pinning (#2161)
salmanmkc Dec 19, 2025
fd49b4b
Upgrade GitHub Actions for Node 24 compatibility (#2164)
salmanmkc Dec 19, 2025
658f94c
Expose common dataloader args (#2097)
divyanshk Dec 19, 2025
b786a3d
Replace `logger.warn()` to `logger.warning()` , allow `log_validation…
EquationWalker Dec 19, 2025
b21555f
Add Dependabot for GitHub Actions updates (#2163)
salmanmkc Dec 19, 2025
1bd2548
Bump tj-actions/changed-files from d6e91a2266cdb9d62096cebf1e8546899c…
dependabot[bot] Dec 19, 2025
4b3d25a
Multiprocess simple RL loop (#2158)
acisseJZhong Dec 22, 2025
29aafb9
Fix qwen3 attention scaling calculation (#2173)
wwwjn Dec 23, 2025
a452121
Add rocm support for models, flux & torchft integration tests. (#2172)
akashveramd Dec 24, 2025
30ab580
[RL] Support Trainer and Generator Unified Model (#2174)
acisseJZhong Dec 24, 2025
a95d203
Support TP when using vLLM engine to run inference w/ torchtitan mode…
wwwjn Dec 26, 2025
5077be6
add safety checks for varlen (#2179)
liangel-02 Dec 26, 2025
64b5e15
Bump torchtitan version to v0.2.1 (#2180)
wwwjn Dec 26, 2025
81af883
Remove psutil as part of requirements (#2181)
wwwjn Dec 26, 2025
5dd9f4c
add attention scaling to varlen for qwen3 (#2178)
liangel-02 Dec 29, 2025
62f5806
make get tp mesh optional in llama4 parallelize (#2185)
danielvegamyhre Dec 29, 2025
7e4ab85
Add docs to explain COMM_MODE (#2162)
fegin Dec 30, 2025
e16af85
[docs] Fix missing --model.flavor flags in compiler_toolkit README (#…
BryanBradfo Jan 6, 2026
795a7a0
[GPT-OSS] Graduate from experiments to main (#2203)
shuhuayu Jan 6, 2026
9f211ec
[Compiler Toolkit] Add option for full inductor. (#2150)
aditvenk Jan 7, 2026
ec246c9
[autoparallel] Update local_map_deepseek_v3 device mesh usage (#2231)
xmfan Jan 14, 2026
c26ea60
Disable dynamo LRU cache when AC is enabled (#2204)
soulitzer Jan 14, 2026
6408426
Enable memory snapshot for generic devices (#2228)
frost-intel Jan 15, 2026
9240172
Add test for dsv3 with flexattn + fsdp + ep + pp + sac op (#2234)
shuhuayu Jan 15, 2026
5ef90fa
[lint] ignore all existing pyrefly errors (#2240)
xmfan Jan 15, 2026
1556971
[Experimental][rl][vllm compat] Update simple_rl example to work with…
Lucaskabela Jan 16, 2026
a085b0e
[Experimental][rl][unified] Update infer.py example to work with vLLM…
Lucaskabela Jan 19, 2026
09c6d74
fix sdpa-varlen attention mismatch in qwen3 (#2229)
francesco-bertolotti Jan 19, 2026
2a642d0
Update README with libnvshmem_host.so troubleshooting
dmahan93 Jan 20, 2026
a25dd8f
[ROCm] Support mxfp8 on gfx950. (#2222)
RuibinCheung Jan 20, 2026
8e5f859
Merge pull request #44 from NousResearch/deepep-install-readme-updates
jquesnelle Jan 20, 2026
7fde8b6
[Typing] Fix CI Typing Issues (#2245)
fegin Jan 20, 2026
42fd903
[Typing] Improve ModelProtocol typing (#2246)
fegin Jan 20, 2026
69cf207
[Typing] Remove deprecated enable_symm_mem_for_group (#2260)
fegin Jan 20, 2026
1e8f9ac
[CP] Refactor Context Parallel to use new PyTorch CP APIs (#2144)
fegin Jan 21, 2026
0a2107f
[CP] Enable FlexCP for llama3 (#2145)
fegin Jan 21, 2026
8ff9e42
[MoE] Fix experts DTensor metadata bug for dcp (#2227)
shuhuayu Jan 22, 2026
3263b15
Update GRPO.md
dmahan93 Jan 23, 2026
5621112
[varlen_attn] change is_causal to window_size (#2267)
liangel-02 Jan 23, 2026
81f5a5a
Add ROCm CI support for simple fsdp experiments test (#2220)
akashveramd Jan 24, 2026
a8ac852
Merge branch 'dev-updated-again' into upstream-2026-24-01
jquesnelle Jan 26, 2026
6d35673
Merge branch 'dev-updated-again' into upstream-2026-24-01
jquesnelle Jan 27, 2026
865ebb8
context parallel support in dsv3 and qwen3
jquesnelle Jan 28, 2026
2ad47cb
fast path for initing bfloat16 params on cpu
jquesnelle Jan 21, 2026
81e54a4
add reference for init scheme
jquesnelle Jan 22, 2026
f04236d
overlapped cpu offload muon
jquesnelle Jan 23, 2026
e7ccfdc
merge fixups
jquesnelle Jan 29, 2026
98f53ee
merge fixups
jquesnelle Jan 30, 2026
668f23e
Add memory tracking and BF16 optimizer state features with Kimi K2 co…
xrsrke Jan 31, 2026
4071454
Add NaN tracker config, FSDP prefetch control, and nvidia-smi memory …
xrsrke Jan 31, 2026
375762b
Add partial resharding support (fsdp_reshard_after_forward accepts int)
xrsrke Jan 31, 2026
0a06429
Add device mesh visualizer for distributed training debugging
xrsrke Jan 31, 2026
fe8d1f0
add option to filter data when preprocessing by a specific string
jquesnelle Feb 1, 2026
f50b804
add kimi_k2_sft
jquesnelle Feb 1, 2026
ed6b753
fix wrong arg used for --push-to-hub
jquesnelle Feb 1, 2026
7f6f3a3
fix attention args, add kimi_k2_ep64_cp1_seq24k_lbs1 160 tps config
xrsrke Feb 4, 2026
c3a14a1
Merge branch 'upstream-2026-24-01' of https://github.com/NousResearch…
xrsrke Feb 4, 2026
1 change: 1 addition & 0 deletions .ci/docker/common/install_conda.sh
@@ -43,6 +43,7 @@ install_pip_dependencies() {
pip_install -r /opt/conda/requirements.txt
pip_install -r /opt/conda/requirements-flux.txt
pip_install -r /opt/conda/requirements-vlm.txt
pip_install -r /opt/conda/requirements-transformers-modeling-backend.txt
popd
}

1 change: 1 addition & 0 deletions .ci/docker/requirements-dev.txt
@@ -2,5 +2,6 @@ expecttest==0.1.6
pytest==7.3.2
pytest-cov
pre-commit
pyrefly==0.45.1
tomli-w >= 1.1.0
transformers
2 changes: 0 additions & 2 deletions .ci/docker/requirements-flux.txt
@@ -1,4 +1,2 @@
transformers>=4.51.1
einops
sentencepiece
pillow
1 change: 1 addition & 0 deletions .ci/docker/requirements-transformers-modeling-backend.txt
@@ -0,0 +1 @@
transformers==4.57.1
2 changes: 2 additions & 0 deletions .ci/docker/requirements.txt
@@ -8,3 +8,5 @@ fsspec
tyro
tokenizers >= 0.15.0
safetensors
einops
pillow
1 change: 1 addition & 0 deletions .ci/docker/ubuntu/Dockerfile
@@ -33,6 +33,7 @@ COPY requirements-dev.txt /opt/conda/
COPY requirements.txt /opt/conda/
COPY requirements-flux.txt /opt/conda/
COPY requirements-vlm.txt /opt/conda/
COPY requirements-transformers-modeling-backend.txt /opt/conda/
COPY conda-env-ci.txt /opt/conda/
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/utils.sh utils.sh
4 changes: 4 additions & 0 deletions .gitignore
@@ -44,3 +44,7 @@ slurm-*

# env files
.env
.venv/

# Vibe coding
.claude
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -53,3 +53,11 @@ repos:
args: ["--ignore-words-list=MIS"]
additional_dependencies:
- tomli

- repo: https://github.com/facebook/pyrefly-pre-commit
rev: 0.45.1
hooks:
- id: pyrefly-check
name: Pyrefly (type checking)
pass_filenames: false
language: system
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -4,7 +4,7 @@ possible. Contributions should follow the [Contributing Guidelines](#contributin

### Setup
```
pip install -r requirements-dev.txt
pip install -r requirements.txt -r requirements-dev.txt
```

### Pull Requests
17 changes: 17 additions & 0 deletions GRPO.md
@@ -4,6 +4,8 @@ GRPO instructions

## Installation instructions
```shell
mkdir logs
chmod g+rw ./logs
pip install uv
uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129
uv pip install -r requirements.txt
@@ -12,8 +14,23 @@ export VLLM_COMMIT=2918c1b49c88c29783c86f78d2c4221cb9622379
uv pip install vllm torch==2.9.0 --torch-backend=cu129 --prerelease=allow --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} --extra-index-url https://download.pytorch.org/whl/cu129
pip install flashinfer-python==0.4.1 flashinfer-cubin==0.4.1
pip install flashinfer-jit-cache==0.4.1 --index-url https://flashinfer.ai/whl/cu129
pip install transformers==4.57.1
```

## Configuration instructions

See `torchtitan/grpo/configs/qwen25-7b-math.toml` for good initial values.

## sbatch script

`online_multinode_vllm.slurm` contains some paths to edit:
- TRAIN_PATH - where this repo is installed on the cluster
- TRAIN_ENV - if your venv is not initialized at `.venv`, change this to point to that venv
- VLLM_ENV - same as TRAIN_ENV unless you're doing something different
- API_ENV - the atropos venv

Once that's done, you can do something like
```bash
sbatch --export=ALL,CONFIG_FILE=/home/dakota/github/torchtitan/torchtitan/grpo/configs/qwen25-7b-math.toml,MODEL_NAME=Qwen/Qwen2.5-7B,PYTHON_SCRIPT=/home/dakota/github/atropos/environments/math_server_zero.py,WANDB_PROJECT=qwen7b_debug online_multinode_vllm.slurm
```
to launch a run.
10 changes: 8 additions & 2 deletions README.md
@@ -40,9 +40,14 @@ The Guiding Principles when building `torchtitan`
* Minimal changes to the model code when applying multi-dimensional parallelism.
* Bias towards a clean, minimal codebase while providing basic reusable / swappable components.

`torchtitan` has been showcasing PyTorch's latest distributed training features, via pretraining Llama 3.1 LLMs of various sizes.
To accelerate contributions to and innovations around torchtitan, we host an [`experiments`](torchtitan/experiments) folder. We look forward to your contributions!
`torchtitan` has been showcasing PyTorch's latest distributed training features, via support for pretraining Llama 3.1 LLMs of various sizes.

## Contributing

We look forward to your contributions!

* To accelerate contributions to and innovations around torchtitan, we host an [`experiments`](torchtitan/experiments) folder. New ideas should start there. To contribute, follow the [`experiments guidelines`](torchtitan/experiments/README.md).
* For fixes and contributions to core, follow these [`guidelines`](CONTRIBUTING.md).

## Llama 3.1 training

@@ -59,6 +64,7 @@ To accelerate contributions to and innovations around torchtitan, we host an [`e
- [Interoperable checkpoints](docs/checkpoint.md) which can be loaded directly into [`torchtune`](https://github.com/pytorch/torchtune) for fine-tuning
5. `torch.compile` support
6. [Float8](https://discuss.pytorch.org/t/distributed-w-torchtitan-enabling-float8-all-gather-in-fsdp2/209323) support ([how-to](docs/float8.md))
7. [MXFP8 training for dense and MoE models](docs/mxfp8.md) on Blackwell GPUs.
7. DDP and HSDP
8. [TorchFT](https://github.com/pytorch/torchft) integration
9. Checkpointable data-loading, with the C4 dataset pre-configured (144M entries) and support for [custom datasets](docs/datasets.md)
Expand Down
Binary file added assets/images/mxfp8_with_loss.png
2 changes: 1 addition & 1 deletion assets/version.txt
@@ -1 +1 @@
0.2.0
0.2.1
2 changes: 1 addition & 1 deletion docs/checkpoint.md
@@ -68,7 +68,7 @@ NGPU=1 CONFIG_FILE=<path_to_model_config> ./run_train.sh --checkpoint.enable --c
### HuggingFace
`torchtitan` offers two ways to work with Hugging Face models: either by directly saving and loading a Hugging Face checkpoint during training, or by using an example conversion script to directly reformat the model weights on cpu.

1. You can directly save huggingface model weights during training by using the `--checkpoint.last_save_in_safetensors_format` and `--checkpoint.last_save_model_only` options together. To directly load a `torchtitan` training session from a huggingface safetensors file, enable `--checkpoint.initial_load_in_hf`, and set either `--model.hf_assets_path` or `--checkpoint.initial_load_path` to the directory containing the huggingface checkpoint. `--checkpoint.initial_load_path` overrides `--model.hf_assets_path` if both are set.
1. You can directly save huggingface model weights during training by using the `--checkpoint.last_save_in_hf` and `--checkpoint.last_save_model_only` options together. To directly load a `torchtitan` training session from a huggingface safetensors file, enable `--checkpoint.initial_load_in_hf`, and set either `--model.hf_assets_path` or `--checkpoint.initial_load_path` to the directory containing the huggingface checkpoint. `--checkpoint.initial_load_path` overrides `--model.hf_assets_path` if both are set.

2. To directly reformat the weights without the need to run a training loop, run the corresponding conversion script. The naming scheme is `torchtitan`-centric, e.g. convert_from_hf means convert hf->tt.

63 changes: 63 additions & 0 deletions docs/debugging.md
@@ -54,6 +54,69 @@ python -m torchtitan.config.manager --help

This will print a structured configuration to `stdout`, allowing you to verify that overrides are being applied correctly.

## Communication Mode (COMM_MODE) for Debugging

The `COMM_MODE` environment variable provides specialized debugging modes that allow you to test and validate your training setup without requiring full multi-GPU distributed execution. This is particularly useful for rapid iteration during development and debugging.

### Available Modes

#### 1. `fake_backend` - Configuration Validation Mode

This mode enables dry-run validation of your configuration, model setup, and rank-0 program logic without actual distributed communication:

```bash
NGPU=32 COMM_MODE="fake_backend" ./run_train.sh
```

**What it does:**
- Uses fake process groups that simulate distributed communication without actual data transfer
- Runs on a single GPU without `torchrun` or NCCL initialization
- Validates configuration parsing, model initialization, and overall training workflow
- Executes only one training step by default

**When to use it:**
- Quick validation of configuration files before launching expensive multi-GPU jobs
- Debugging training and parallelism logic that doesn't require actual communication. Note that no data-dependent logic should be validated with `fake_backend`.

**Example use case:**
```bash
# Validate a 128-GPU configuration on a single GPU
NGPU=128 COMM_MODE="fake_backend" CONFIG_FILE="./train_configs/llama3_70b.toml" ./run_train.sh
```

#### 2. `local_tensor` - Single-GPU Distributed Simulation

This mode simulates the full distributed training workflow on a single GPU by executing all communication and computation locally:

```bash
NGPU=32 COMM_MODE="local_tensor" ./run_train.sh
```

**What it does:**
- Simulates multi-GPU behavior on a single shared GPU
- Executes all collectives (all-reduce, all-gather, etc.) locally without network communication
- Maintains the same code paths as distributed training for accurate debugging
- Runs only one training step by default

**When to use it:**
- Debugging distributed training logic (FSDP, TP, PP, CP, EP) with data dependencies without multi-GPU setup. Note that local tensor doesn't support FSDP2 but should support SimpleFSDP.
- Verifying correctness of parallelism strategies locally
- Testing gradient synchronization and communication patterns
- Reproducing distributed training bugs in a simplified environment

**Example use case:**
```bash
# Debug 8-way TP + 2-way FSDP on a single GPU
NGPU=16 COMM_MODE="local_tensor" ./run_train.sh \
--parallelism.tensor_parallel_degree 8 \
--parallelism.data_parallel_shard_degree 2
```
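The contrast between the two modes can be illustrated with a toy, stdlib-only sketch (hypothetical helper names, not torchtitan or PyTorch APIs): a `fake_backend`-style all-reduce skips communication entirely and leaves each rank's data untouched, while a `local_tensor`-style all-reduce actually performs the reduction across all simulated ranks in one process:

```python
# Toy model of COMM_MODE semantics; `per_rank` holds each simulated rank's
# gradient shard as a list of floats. These helpers are illustrative only.

def all_reduce_fake(per_rank):
    # fake_backend: the collective is a no-op; no data moves between ranks
    return per_rank

def all_reduce_local(per_rank):
    # local_tensor: the collective really runs, locally, across all ranks
    total = [sum(vals) for vals in zip(*per_rank)]
    return [list(total) for _ in per_rank]

grads = [[1.0, 2.0], [3.0, 4.0]]  # world_size=2, two grad elements per rank
assert all_reduce_fake(grads) == [[1.0, 2.0], [3.0, 4.0]]
assert all_reduce_local(grads) == [[4.0, 6.0], [4.0, 6.0]]
```

This is why only `local_tensor` is suitable for validating data-dependent logic: with `fake_backend`, a tensor that should have been reduced still carries rank-local values.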

### Limitations

- **Performance testing**: Neither mode provides accurate performance metrics; use actual distributed runs for benchmarking
- **Memory requirement**: Local tensor runs require more memory on a single GPU than the actual distributed runs

## Troubleshooting jobs that timeout

If you encounter jobs that timeout, you'll need to debug them to identify the root cause. To help with this process, we've enabled Flight Recorder, a tool that continuously collects diagnostic information about your jobs.
190 changes: 190 additions & 0 deletions docs/mxfp8.md
@@ -0,0 +1,190 @@
## MXFP8 Training on B200 GPUs

MXFP8 training can provide substantial training speedups for models where the majority of GEMMs are sufficiently large. MXFP8 is a microscaling format from the [MX OCP spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) that uses block-based scaling to maintain numerical accuracy while leveraging low-precision tensor cores. On NVIDIA B200 GPUs, MXFP8 training achieves up to **28% speedup** over bfloat16 baseline with minimal accuracy degradation.

> **📖 For a comprehensive case study of using TorchTitan MXFP8 to train dense models at scale**, see our blog post: [Accelerating 2K+ Scale Pre-training up to 1.28x with TorchAO MXFP8 and TorchTitan on Crusoe B200 Cluster](https://pytorch.org/blog/accelerating-2k-scale-pre-training-up-to-1-28x-with-torchao-mxfp8-and-torchtitan-on-crusoe-b200-cluster/)

### Table of Contents

- [Requirements](#requirements)
- [How MXFP8 Works](#how-mxfp8-works)
- [MXFP8 for Linear Modules](#mxfp8-for-linear-modules)
- [Usage](#usage)
- [MXFP8 for Grouped GEMMs (MoE)](#mxfp8-for-grouped-gemms-moe)
- [Usage](#usage-1)
- [Example TOML Configuration](#example-toml-configuration)
- [Performance](#performance)
- [Dense Models](#dense-models)
- [MoE models](#moe-models)
- [Composability](#composability)
- [Known Limitations](#known-limitations)
- [Additional Resources](#additional-resources)

### Requirements

- NVIDIA B200 (SM100 or SM100a)
- PyTorch nightly
- TorchAO v0.14.0 or newer ([TorchAO Installation Guide](https://github.com/pytorch/ao#installation))

Note: GB200 is also supported but requires building torchao from source (see installation guide above).

### How MXFP8 Works

MXFP8 differs from standard Float8 training in its scaling approach:

- **Granular scaling factor**: Instead of using a single scale factor per tensor (tensorwise) or per row/column (rowwise), MXFP8 uses a more granular, block-based scaling with a default block size of 1x32 elements. Each block of 32 elements shares a common scale factor. The data dtype is `torch.float8_e4m3fn`, and the scale factor dtype is `torch.float8_e8m0fnu`.
- **Native hardware support**: On NVIDIA B200 (Blackwell) GPUs, MXFP8 GEMMs and Grouped GEMMs are accelerated using cuBLAS and CUTLASS kernels exposed via `torch._scaled_mm` and `torch._scaled_grouped_mm`, achieving up to 2x speedup over bfloat16 on common shapes.
- **Dynamic quantization**: For every MXFP8 Linear or Grouped GEMM, activations and weights are dynamically quantized to MXFP8, then an MXFP8 GEMM/Grouped GEMM is performed, resulting in a net speedup.
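As a rough illustration of the block scaling described above, here is a stdlib-only sketch that picks a power-of-two scale per 32-element block so each block fits the e4m3 range (simplified: real kernels also round the scaled values to e4m3 and support an rceil rounding mode for the scale):

```python
import math

E4M3_MAX = 448.0  # largest representable magnitude in torch.float8_e4m3fn
BLOCK = 32        # MXFP8 block size: 32 elements share one scale

def mx_block_scale(block):
    """Pick a power-of-two scale (as an e8m0-style exponent would encode)
    so the block's max magnitude fits the e4m3 range after scaling."""
    amax = max(abs(v) for v in block)
    if amax == 0.0:
        return 1.0
    return 2.0 ** math.ceil(math.log2(amax / E4M3_MAX))

def quantize_mx(values):
    scaled, scales = [], []
    for i in range(0, len(values), BLOCK):
        blk = values[i:i + BLOCK]
        s = mx_block_scale(blk)
        scales.append(s)
        scaled.extend(v / s for v in blk)  # real kernels then round to e4m3
    return scaled, scales

scaled, scales = quantize_mx([float(i - 16) * 37.5 for i in range(64)])
assert all(abs(v) <= E4M3_MAX for v in scaled)  # every block fits e4m3
```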

### MXFP8 for Linear Modules

#### Usage

To enable MXFP8 training for linear layers, launch your training job with the following command (or alternatively set configs in toml files):

```bash
CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh \
--model.converters="quantize.linear.mx" \
--quantize.linear.mx.recipe_name="mxfp8_cublas" \
--compile.enable
```

**Configuration Options:**

* `--model.converters="quantize.linear.mx"`: Swap `nn.Linear` with `MXLinear` to perform MXFP8 matmul.
* `--quantize.linear.mx.recipe_name="mxfp8_cublas"`: Use the cuBLAS-based MXFP8 recipe for best performance on B200 GPUs. Alternative: `"mxfp8_cublas_rceil"` uses round-ceiling mode for scale calculation.
* `--quantize.linear.mx.mxfp8_dim1_cast_kernel_choice="triton"`: Choose the kernel for dimension-1 quantization. Options: `"triton"` (default), `"cuda"`, or `"torch"`.
* `--quantize.linear.mx.filter_fqns="..."` (optional): Comma-separated list of fully qualified names of modules not to convert to MXFP8 training.
* Example: `--quantize.linear.mx.filter_fqns="attention.wq,attention.wk,attention.wv,output"`
* This allows you to selectively apply MXFP8 only to layers that will benefit from it.
* `--compile.enable` (required for competitive performance): Use `torch.compile` to fuse the MXFP8 scaling/casting kernels.

**Hardware Requirements:**

MXFP8 training requires NVIDIA B200 (SM100) or newer GPUs.

### MXFP8 for Grouped GEMMs (MoE)

For Mixture-of-Experts (MoE) models, MXFP8 can accelerate the expert computation through dynamically quantized grouped GEMMs.

#### Usage

To enable MXFP8 for MoE expert layers:

```bash
CONFIG_FILE="./torchtitan/models/llama4/train_configs/llama4_17bx16e.toml" ./run_train.sh \
--model.converters="quantize.grouped_mm.mx" \
--quantize.grouped_mm.mx.fqns="experts" \
--quantize.grouped_mm.mx.recipe_name="mxfp8" \
--compile.enable \
--model.print_after_conversion
```

**Combined usage**: You can use MXFP8 for both linear modules and grouped GEMMs simultaneously by specifying both converters:
```bash
--model.converters="quantize.linear.mx,quantize.grouped_mm.mx"
```

**Configuration Options:**

* `--model.converters="quantize.grouped_mm.mx"`: Enable MXFP8 grouped GEMM conversion for MoE layers.
* `--quantize.grouped_mm.mx.fqns="experts"`: Comma-separated list of fully qualified names of MoE modules to apply MXFP8 dynamic quantization on grouped GEMM operations. Any module that matches an FQN will be converted, provided it has (1) experts represented as 3d nn.Parameter instances (which is the case for TorchTitan MoEs), and (2) a `torch._grouped_mm` op that performs the actual routed expert computation using those 3d expert weights.
* You can specify multiple FQNs to target different MoE layers in your model.
* `--quantize.grouped_mm.mx.recipe_name="mxfp8"`: Quantization recipe for grouped GEMMs (currently only `"mxfp8"` is supported).
* `--compile.enable`: Use `torch.compile` for best performance.

**Important Notes:**

* **Token group alignment**: For MoE training with MXFP8, token group sizes must be multiples of 32 (the MXFP8 block size). This is automatically configured [here](https://github.com/pytorch/torchtitan/blob/b39377f9fe33865fefb9bf64a33f6d74a598be87/torchtitan/components/quantization/mx.py#L131) when you enable MXFP8 grouped GEMMs in TorchTitan.

* **torch.compile recommendation**: All benchmarks in this document were run with `torch.compile` enabled. We recommend using `torch.compile` for best performance.
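The token-group alignment requirement amounts to rounding each expert's token count up to a multiple of the 32-element block size; a minimal sketch of the padding logic (illustrative only, not the torchtitan implementation):

```python
BLOCK = 32  # MXFP8 scaling block size

def pad_group_sizes(sizes, align=BLOCK):
    """Round each expert's token-group size up to a multiple of `align`."""
    return [((n + align - 1) // align) * align for n in sizes]

# e.g. experts that received 0, 5, 32, and 100 tokens respectively:
assert pad_group_sizes([0, 5, 32, 100]) == [0, 32, 32, 128]
```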

### Example TOML Configuration

Here's an example configuration for MXFP8 training in a TOML file:

```toml
[model]
converters = ["quantize.linear.mx", "quantize.grouped_mm.mx"]

[quantize.linear.mx]
recipe_name = "mxfp8_cublas"
mxfp8_dim1_cast_kernel_choice = "cuda"
filter_fqns = ["output", "router.gate"]

[quantize.grouped_mm.mx]
recipe_name = "mxfp8"
fqns = ["experts"]

[compile]
enable = true
components = ["model"]
```

### Performance

#### Dense Models

Single-node training on 8x power limited B200 GPUs, batch size 1, sequence length 8192, steps 100, torch.compile, FSDP2, per-op SAC:

| Scaling Method | Peak Memory (GB) | Median tokens/s | Speedup over BF16 |
|------------------------|------------------|-----------------|-------------------|
| None (bfloat16) | 33.71 | 8307.5 | - |
| mxfp8_cublas | 33.88 | 9969.0 | +20.0% |
| mxfp8_cublas_rceil | 33.88 | 9642.0 | +16.1% |
| float8 tensorwise | 33.38 | 10417.0 | +25.4% |

- pytorch version: `2.9.0.dev20250815+cu128`
- torchao version: `0.13.0+gite4e681be`
- torchtitan commit: `6fc499f6f5b32151a799188be2208cfb09faed30`

*Source: [TorchAO MX Formats Benchmarks](https://github.com/pytorch/ao/tree/main/torchao/prototype/mx_formats#training-e2e-benchmarks-on-nvidia-b200)*
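The speedup column follows directly from the median tokens/s figures; a quick arithmetic sanity check:

```python
# Recompute the "Speedup over BF16" column from the median tokens/s values.
bf16_tps = 8307.5
rows = {
    "mxfp8_cublas": 9969.0,
    "mxfp8_cublas_rceil": 9642.0,
    "float8 tensorwise": 10417.0,
}
speedups = {k: round((v / bf16_tps - 1.0) * 100.0, 1) for k, v in rows.items()}
assert speedups == {
    "mxfp8_cublas": 20.0,
    "mxfp8_cublas_rceil": 16.1,
    "float8 tensorwise": 25.4,
}
```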

#### MoE models

512 GPU training on 64 node GB200 cluster:

| Scaling Method | Median tokens/s | Speedup over BF16 |
|------------------------|-----------------|-------------------|
| None (bfloat16) | 6169 | - |
| mxfp8 | 7401 | +20.3% |

Training runs on 64 node GB200 cluster with TorchTitan Llama4 Scout show that MXFP8 MoE training has equivalent convergence to bfloat16 training baseline. In fact, after 3,000 steps it finishes with slightly *lower* loss than bfloat16! This is consistent with our scaling experiments with [MXFP8 training for dense models](https://pytorch.org/blog/accelerating-2k-scale-pre-training-up-to-1-28x-with-torchao-mxfp8-and-torchtitan-on-crusoe-b200-cluster/).

![MXFP8 vs BF16 Training Loss Curves](../assets/images/mxfp8_with_loss.png)

*Training loss curves over 3,000 steps showing MXFP8 achieves equivalent convergence to bfloat16 baseline.*

Training and model configurations for this run:
- Model: Llama4 Scout
- Dataset: C4
- Sequence length: 8192
- Local batch size: 10
- Learning rate: 1e-4
- LR scheduler warmup steps: 2000
- Parallelisms (64 nodes of 4 devices each = 256 chips):
- FSDP=256 (on attention layers, shared experts, dense layer FFNs) and 256/4=64 (on routed experts)
- EP=16 (on routed experts)
- Activation checkpointing mode: `none` (ideally this should use selective per op AC but there was a bug at the time preventing us from using it).
- `torch.compile` enabled
- `mxfp8` applied to routed experts computation (grouped GEMMs)
- `mxfp8` applied to all linear layers except: `output`, `router.gate`, `attention.wk`, `attention.wv` (Wk and Wv too small to benefit from mxfp8)

### Composability
For distributed training, MXFP8 is compatible with:
- `torch.compile`
- FSDP2/TP/EP/PP
- Full activation checkpointing

All distributed communication for MXFP8 training is currently done in high precision.

### Known Limitations
- Currently in prototype stage - no BC guarantees.
- Requires torch nightly - important bug fixes have landed since 2.9.1
- For GB200s, requires building torchao from source

### Additional Resources

- [Accelerating 2K+ Scale Pre-training up to 1.28x with TorchAO MXFP8 and TorchTitan on Crusoe B200 Cluster](https://pytorch.org/blog/accelerating-2k-scale-pre-training-up-to-1-28x-with-torchao-mxfp8-and-torchtitan-on-crusoe-b200-cluster/) - Blog post on accelerating dense model training with MXFP8
- [TorchAO MX Formats Documentation](https://github.com/pytorch/ao/tree/main/torchao/prototype/mx_formats)
- [TorchAO MoE Training Documentation](https://github.com/pytorch/ao/tree/main/torchao/prototype/moe_training)