 from flash_attn.flash_attn_interface import _flash_attn_backward, _flash_attn_forward
 
 from internlm.core.context import global_context as gpc
+from internlm.core.parallel.comm import get_offload_manager
 
 from .utils import RingComm, update_out_and_lse
 
-fa_output_mapping = {}
-
 
 def create_buffer(tensor):
     buffer_shape = list(tensor.shape)
@@ -443,9 +442,10 @@ def forward(
         k = k.contiguous()
         v = v.contiguous()
 
-        if gpc.is_forward is False and gpc.config.selective_checkpoint:
-            assert layer_idx in fa_output_mapping
-            out, softmax_lse = fa_output_mapping.pop(layer_idx)
+        _ckpt_block_num = int(gpc.config.model.checkpoint * gpc.config.isp_num_layers)
+
+        if gpc.is_forward is False and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num:
+            out, softmax_lse = get_offload_manager().get_fa_output_with_layer(layer_idx)
         else:
             out, softmax_lse = zigzag_double_ring_flash_attn_forward(
                 context_group,
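
As a worked example of the new gate (illustrative config values, not taken from this PR): with `gpc.config.model.checkpoint = 0.5` and `gpc.config.isp_num_layers = 32`, `_ckpt_block_num = int(0.5 * 32) = 16`, so only layers 0-15 read their cached `(out, softmax_lse)` back from the offload manager during recomputation, while layers 16-31 (and all layers when `selective_checkpoint` is off) fall through to `zigzag_double_ring_flash_attn_forward` as before.
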
@@ -460,8 +460,8 @@ def forward(
             )
 
         # store attn forward output to avoid re-computation of attn when activation checkpoint is enabled
-        if gpc.is_forward and gpc.config.selective_checkpoint:
-            fa_output_mapping[layer_idx] = (out, softmax_lse)
+        if gpc.is_forward and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num:
+            get_offload_manager().insert_fa_output_with_layer(layer_idx=layer_idx, output=(out, softmax_lse))
 
         # this should be out_padded
         ctx.save_for_backward(q, k, v, out, softmax_lse)
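
Taken together, the change replaces the module-level `fa_output_mapping` dict with a shared manager from `internlm.core.parallel.comm`, keyed by `layer_idx` and used only for the first `_ckpt_block_num` layers. The snippet below is a minimal sketch of what such a per-layer cache could look like, assuming only the two method names that appear in the diff (`insert_fa_output_with_layer`, `get_fa_output_with_layer`); it is not the actual `get_offload_manager` implementation, which may additionally offload tensors to CPU or overlap transfers with compute.

```python
# Hypothetical sketch of a per-layer flash-attention output cache exposing the two
# entry points the diff relies on. The real manager returned by get_offload_manager()
# in internlm.core.parallel.comm is assumed to do more (e.g. CPU offload, prefetch);
# this only illustrates the store-then-consume flow.
from typing import Dict, Tuple

import torch


class _FAOutputCache:
    def __init__(self) -> None:
        # layer_idx -> (out, softmax_lse) saved during the checkpointed forward pass
        self._outputs: Dict[int, Tuple[torch.Tensor, torch.Tensor]] = {}

    def insert_fa_output_with_layer(
        self, layer_idx: int, output: Tuple[torch.Tensor, torch.Tensor]
    ) -> None:
        # called on the normal forward pass (gpc.is_forward is True)
        self._outputs[layer_idx] = output

    def get_fa_output_with_layer(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # called during recomputation (gpc.is_forward is False); pop so the cached
        # tensors are released as soon as the checkpointed block has consumed them
        return self._outputs.pop(layer_idx)


_CACHE = _FAOutputCache()


def get_offload_manager() -> _FAOutputCache:
    return _CACHE
```

Compared with the old module-level dict, routing everything through a single manager keeps the selective-checkpoint state in one place, and the `layer_idx < _ckpt_block_num` guard on both the insert and the lookup removes the `assert layer_idx in fa_output_mapping` failure mode: layers beyond the checkpointed range never touch the cache in either direction.
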