@@ -3,13 +3,18 @@
 
 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
 
+from contextlib import nullcontext
 from typing import List, Optional
 
 import torch
+import transformer_engine.pytorch as te
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 from torch.optim.lr_scheduler import _LRScheduler
+from transformer_engine.common.recipe import DelayedScaling, Format
 
+from internlm.core.context import ParallelMode
+from internlm.core.context import global_context as gpc
 from internlm.core.gradient_handler import BaseGradientHandler
 from internlm.solver.optimizer import BaseOptimizer
 from internlm.solver.schedulers import Beta2Scheduler
|
@@ -78,6 +83,28 @@ def __init__(
         # build gradient handler
         self._gradient_handlers = gradient_handlers if gradient_handlers else []
 
+        # FP8 GEMM: build a TransformerEngine DelayedScaling recipe from the optional "fp8" config section
+        fp8_cfg = gpc.config.get("fp8", None)
+        self.use_fp8 = fp8_cfg is not None
+        self.fp8_recipe = None
+        self.fp8_group = None
+        if self.use_fp8:
+            self.fp8_group = gpc.get_group(ParallelMode.GLOBAL)
+            if fp8_cfg.format == "e4m3":
+                fp8_format = Format.E4M3
+            elif fp8_cfg.format == "hybrid":
+                fp8_format = Format.HYBRID
+            else:
+                raise ValueError(f"Unsupported fp8 format {fp8_cfg.format!r}: the DelayedScaling recipe only supports 'e4m3' and 'hybrid'.")
+            self.fp8_recipe = DelayedScaling(
+                margin=fp8_cfg.margin,
+                interval=fp8_cfg.interval,
+                fp8_format=fp8_format,
+                amax_history_len=fp8_cfg.amax_history_len,
+                amax_compute_algo=fp8_cfg.amax_compute_algo,
+                override_linear_precision=(False, False, not fp8_cfg.fp8_wgrad),  # (fprop, dgrad, wgrad)
+            )
+
     @property
     def model(self):
         """Returns the model attached to the engine."""
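
For reference, the constructor above reads six keys from the optional `fp8` section of `gpc.config`. A minimal sketch of what that section could look like in an InternLM-style Python config file; the key names come from the reads above, but the values shown are purely illustrative and not taken from this commit:

```python
# Hypothetical `fp8` config section; its mere presence flips `use_fp8` to True.
fp8 = dict(
    format="hybrid",          # "e4m3", or "hybrid" (E4M3 forward, E5M2 backward)
    margin=0,                 # margin for the scaling-factor computation
    interval=1,               # steps between scaling-factor recomputations
    amax_history_len=1024,    # length of the amax history window for delayed scaling
    amax_compute_algo="max",  # "max" or "most_recent" over the history window
    fp8_wgrad=True,           # False forces the wgrad GEMM to run in higher precision
)
```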
@@ -166,7 +193,14 @@ def __call__(self, *args, **kwargs):
         Returns:
             torch.Tensor: The output of the model.
         """
-        return self.model(*args, **kwargs)
+        # Enter fp8_autocast only when FP8 is configured; otherwise fall back to a no-op context.
+        fp8_context = (
+            te.fp8_autocast(enabled=True, fp8_recipe=self.fp8_recipe, fp8_group=self.fp8_group)
+            if self.use_fp8
+            else nullcontext()
+        )
+        with fp8_context:
+            return self.model(*args, **kwargs)
 
     def load_batch(self, data_iter, to_gpu=True):
         """
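
With this change, every forward pass through the engine runs under `te.fp8_autocast` whenever FP8 is configured, while the backward pass stays outside the context (TransformerEngine expects the autocast to wrap only the forward pass). A self-contained sketch of the same pattern, closely following the TransformerEngine quickstart; the `model` here is a placeholder `te.Linear`, since `fp8_autocast` only switches TE modules to FP8 GEMMs:

```python
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

# Placeholder model: fp8_autocast only affects TransformerEngine modules
# (te.Linear, te.LayerNormMLP, ...), so the model must be built from them.
model = te.Linear(768, 768, bias=True)
inp = torch.randn(32, 768, device="cuda")

fp8_recipe = DelayedScaling(
    margin=0, fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max"
)

# Wrap only the forward pass, exactly as the engine's __call__ does above.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = model(inp)

# The backward pass stays outside the autocast.
out.sum().backward()
```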
|
|