
Commit d90c1da

enable fsdp training and support huggingface models with ckpt in or out

1 parent 24180aa

58 files changed: +1170 -369 lines

configs/1.8B_MoE16_sft.py

+1 -2
@@ -170,7 +170,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -197,7 +196,7 @@
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
+    zero1=dict(size=-1),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),
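Note: across these config files the only change is that the fsdp flag is dropped from zero1; the three zero1 regimes described in the unchanged docstring above still apply. A minimal sketch of what each regime looks like in this config syntax (the values are illustrative, not recommendations):

    # zero1 <= 0: the zero1 group spans the whole data-parallel group,
    # so optimizer states are sharded across all dp ranks
    parallel = dict(zero1=dict(size=-1), tensor=dict(size=1, mode="mtp"))

    # zero1 == 1: ZeRO-1 is disabled; every dp rank keeps the full optimizer states
    parallel = dict(zero1=dict(size=1), tensor=dict(size=1, mode="mtp"))

    # 1 < zero1 <= dp world size: shard within a subset of dp ranks,
    # e.g. size=8 to keep sharding within a node for smaller models
    parallel = dict(zero1=dict(size=8), tensor=dict(size=1, mode="mtp"))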

configs/57B_qwen2_MoE.py

+1 -2
@@ -175,7 +175,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -202,7 +201,7 @@
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
+    zero1=dict(size=-1),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),

configs/7B_MoE4_sft.py

+1 -2
@@ -182,7 +182,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -217,7 +216,7 @@
     4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
+    zero1=dict(size=-1),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),

configs/7B_baichuan2.py

-1
@@ -165,7 +165,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_gemma.py

-1
@@ -172,7 +172,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_internlm2.py

-1
@@ -174,7 +174,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_isp_sft.py

-1
@@ -187,7 +187,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_llama2.py

-1
@@ -164,7 +164,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_qwen2.py

-1
@@ -172,7 +172,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/7B_sft.py

-1
@@ -174,7 +174,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/8x22B_mixtral.py

+1 -2
@@ -176,7 +176,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
+    zero1=dict(size=-1),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),

configs/8x7B_mixtral.py

+1 -2
@@ -176,7 +176,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
+    zero1=dict(size=-1),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
     weight=dict(size=1, overlap=True),

configs/_base_/models/internlm2_1B.py

-1
@@ -51,7 +51,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/_base_/models/internlm2_20B.py

-1
@@ -48,7 +48,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/_base_/models/internlm2_7B.py

-1
@@ -48,7 +48,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/_base_/models/internlm_20B.py

-1
@@ -43,7 +43,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

configs/_base_/models/internlm_7B.py

-1
@@ -43,7 +43,6 @@
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],

doc/code-docs/source/initialize.rst

+1 -1
@@ -43,7 +43,7 @@ InternEvo uses `argparse <https://docs.python.org/3/library/argparse.html>`_
 Model initialization
 -------------------------
 
-.. autofunction:: internlm.train.initialize_model
+.. autofunction:: internlm.train.initialize_model_and_parallel_communicator
 
 InternEvo uses the ``model_type`` and ``model`` fields in the configuration file to control model initialization. An example model initialization configuration is defined as follows:

doc/code-docs/source/training.rst

+1 -1
@@ -27,7 +27,7 @@
 - Initialize the model
 .. code-block:: python
 
-    model = initialize_model()
+    model = initialize_model_and_parallel_communicator()
 
 For a detailed introduction, see: `Model initialization <https://internevo.readthedocs.io/zh-cn/latest/initialize.html#internlm-model-init>`_

doc/en/train_performance.md

+1 -1
@@ -121,7 +121,7 @@ model = dict(
 )
 
 parallel = dict(
-    zero1=dict(size=8, fsdp=False),
+    zero1=dict(size=8),
     tensor=1,
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=False,

doc/train_performance.md

+1 -1
@@ -117,7 +117,7 @@ model = dict(
 )
 
 parallel = dict(
-    zero1=dict(size=8, fsdp=False),
+    zero1=dict(size=8),
     tensor=1,
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=False,

doc/usage.md

-2
@@ -268,7 +268,6 @@ zero1 parallel (dict):
         * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
         * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 tensor parallel (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -432,7 +431,6 @@ parallel = dict(
     - `zero1 <= 0`: the zero1 process group is the same size as the data-parallel process group, so optimizer state parameters are sharded across the data-parallel ranks
     - `zero1 == 1`: zero1 is not used, and every data-parallel group keeps the full optimizer state parameters
     - `zero1 > 1` and `zero1 <= data_parallel_world_size`: the zero1 process group is a subset of the data-parallel process group
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
 - tensor (dict):
     1. size: int, the size of tensor parallel.
     2. mode: str, the tensor parallel mode, should be one of ['mtp', 'msp', 'fsp', 'isp'],

generate.py

+2 -3
@@ -21,7 +21,7 @@
 from internlm.initialize import initialize_distributed_env
 from internlm.monitor import initialize_monitor_manager
 from internlm.monitor.monitor import monitor_manager as mm
-from internlm.train import initialize_model, initialize_parallel_communicator
+from internlm.train import initialize_model_and_parallel_communicator
 from internlm.utils.common import (
     enable_pytorch_expandable_segments,
     launch_time,
@@ -106,8 +106,7 @@ def main():
         raise e
 
     # initialize model
-    model = initialize_model()
-    _ = initialize_parallel_communicator(model)
+    model, _ = initialize_model_and_parallel_communicator()
     model = model.model
 
     state_dict = merge_pp_within_tp(generation_config.ckpt_folder, del_model_prefix=True)
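The generate.py hunks above show the API consolidation that accompanies this commit: the separate initialize_model and initialize_parallel_communicator calls are replaced by a single initialize_model_and_parallel_communicator call returning both objects. A minimal usage sketch based only on what this diff shows (the distributed setup around it is assumed to have run already):

    from internlm.train import initialize_model_and_parallel_communicator

    # build the model and its parallel communicator in one call;
    # the second return value (the communicator) is unused in generate.py
    model, _ = initialize_model_and_parallel_communicator()
    model = model.model  # unwrap to the underlying module, as generate.py does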

internlm/checkpoint/checkpoint_manager.py

+4 -1
@@ -23,6 +23,7 @@
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
+from internlm.utils.parallel import is_using_fsdp, is_using_hf
 from internlm.utils.storage_manager import (
     get_storage_manager,
     init_storage_manager,
@@ -271,7 +272,7 @@ def __init__(
         self.storage_manager = get_storage_manager()
         self.snapshot_counter = -1
 
-        if hasattr(model, "model"):
+        if hasattr(model, "model") and not is_using_fsdp():
            model = model.model
 
         self.model = model
@@ -575,6 +576,8 @@ def try_resume_training(self, train_state: TrainState, current_time=""):
                 f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
                 f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
             )
+        elif is_using_fsdp() and is_using_hf() and not self.auto_resume:
+            pass
         else:
             load_path = self.load_ckpt_info["path"]
             load_content = self.load_ckpt_info["content"]
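Taken together, the checkpoint_manager.py hunks wire the new FSDP/HuggingFace support into checkpointing: the manager no longer unwraps model.model when FSDP is active, and explicit checkpoint loading is skipped for FSDP-wrapped HuggingFace models when auto-resume is off, so training starts from the HF weights. A small sketch of that resume decision, using only the helpers imported above (their implementations are not part of this diff) and a hypothetical helper name for illustration:

    from internlm.utils.parallel import is_using_fsdp, is_using_hf

    def skip_explicit_ckpt_load(auto_resume: bool) -> bool:
        # Hypothetical helper mirroring the new elif branch: for an FSDP-wrapped
        # HuggingFace model without auto-resume, no checkpoint path is loaded.
        return is_using_fsdp() and is_using_hf() and not auto_resume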
