
Commit a70aaf6

Initial refactor: (1) reorganize the src structure to avoid cyclic imports; (2) remove legacy and historical code; (3) refine the initializer interface.
1 parent 0bc7552 commit a70aaf6

162 files changed: +1974 / -2187 lines changed


.github/workflows/demo_in_readme.yaml (+2)

@@ -63,6 +63,7 @@ jobs:
           export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
           export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
+          export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           EOF
@@ -97,6 +98,7 @@ jobs:
           export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
           export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
+          export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
           EOF

.github/workflows/lint_check.yaml (+2/-6)

@@ -18,25 +18,21 @@ jobs:
       run: |
         pip install flake8==v3.8.4
         FLAKE_DISABLE_LIST="F403,F405,W504,W503,E203"
-        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST ./train.py
+        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*

     - name: lint-isort
       run: |
         pip install isort==5.12.0
         isort --check --profile=black ./internlm/*
-        isort --check --profile=black ./train.py

     - name: lint-black
       run: |
         pip install black==22.8.0
         BLACK_EXCLUDE_SETTINGS='\.venv/|\.local/|\.cache/|\.git/'
         black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./internlm/*
-        black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./train.py

     - name: lint-pylint
       run: |
         pip install pylint==v2.17.2
         PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203"
-        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py
+        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*

README-ja-JP.md (+1/-1)

@@ -99,7 +99,7 @@ data = dict(

 Slurm環境で2ノード16カードを使用する場合、コマンドは以下の通りです:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 torchを使用し、1ノード8カードで実行する場合、コマンドは以下の通りです:

README-zh-Hans.md (+1/-1)

@@ -99,7 +99,7 @@ data = dict(

 slurm环境,双机16卡,启动训练命令如下:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 torch环境,单机8卡,启动训练命令如下:

README.md (+1/-1)

@@ -99,7 +99,7 @@ Training can be started on slurm or torch distributed environment.

 On slurm, using 2 nodes and 16 cards, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 On torch, using 1 node and 8 cards, the command is as follows:

ci_scripts/train/generate_config.py (+1/-1)

@@ -5,7 +5,7 @@
 import os

 from ci_scripts.common import com_func
-from internlm.core.context import Config
+from internlm.utils.config import Config


 def generate_new_config(config_py_file, test_config_json, case_name):

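The `Config` helper now lives under `internlm.utils.config` rather than `internlm.core.context`. A minimal, hypothetical usage sketch follows; only the new import path comes from this commit, and the dict-style construction is an assumption about the helper, not something shown in the diff.

```python
# Hypothetical sketch: the import path is what this commit changes; the dict-style
# construction below is an assumption about the Config helper, not part of the diff.
from internlm.utils.config import Config  # was: from internlm.core.context import Config

cfg = Config(dict(model_type="INTERNLM", data=dict(seq_len=2048)))
print(cfg["model_type"])  # dict-like access, assuming Config behaves like a mapping
```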

ci_scripts/train/load_ckpt.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

ci_scripts/train/slurm_train.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS20_OUTPUT}")

ci_scripts/train/torchrun.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS_OUTPUT}")

doc/code-docs/locales/en/LC_MESSAGES/training.po (+2/-2)

@@ -68,10 +68,10 @@ msgstr "Initialize Distributed Training Environment"

 #: ../../source/training.rst:23
 msgid ""
-"调用 ``initialize_distributed_env`` 函数,支持通过 slurm 或 torch "
+"调用 ``init_distributed`` 函数,支持通过 slurm 或 torch "
 "方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:"
 msgstr ""
-"Call the initialize_distributed_env function, which supports launching "
+"Call the init_distributed function, which supports launching "
 "the training script through Slurm or Torch, and pass in information such "
 "as the configuration file, port number, and process random seed. Detailed"
 " description of the function is as follows:"

doc/code-docs/source/example/20B_demo.rst (+1/-1)

@@ -167,7 +167,7 @@

 .. code-block:: bash

-    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/20B_sft.py
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/20B_sft.py

 训练结果
 ----------------

doc/code-docs/source/example/7B_demo.rst (+1/-1)

@@ -165,7 +165,7 @@

 .. code-block:: bash

-    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py

 训练结果
 ----------------

doc/code-docs/source/initialize.rst (+1/-1)

@@ -43,7 +43,7 @@ InternEvo 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_
 模型初始化
 -------------------------

-.. autofunction:: internlm.train.initialize_model_and_parallel_communicator
+.. autofunction:: internlm.initialize.initialize_model.initialize_model_and_parallel_communicator

 InternEvo 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:

doc/code-docs/source/training.rst (+3/-3)

@@ -18,11 +18,11 @@
 - 初始化分布式训练环境
   .. code-block:: python

-      initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)
+      init_distributed(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)

-  调用 ``initialize_distributed_env`` 函数,支持通过 slurm 或 torch 方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:
+  调用 ``init_distributed`` 函数,支持通过 slurm 或 torch 方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:

-  .. autofunction:: internlm.initialize.initialize_distributed_env
+  .. autofunction:: internlm.initialize.init_distributed

 - 初始化模型
   .. code-block:: python

doc/en/usage.md (+1/-1)

@@ -407,7 +407,7 @@ After completing the data preparation and relevant training configurations menti
 If you want to start distributed training on slurm with 16 GPUs across multiple nodes, use the following command:

 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 If you want to start distributed training on torch with 8 GPUs on a single node, use the following command:

doc/usage.md (+1/-1)

@@ -453,7 +453,7 @@ parallel = dict(

 若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:

generate.py (+6/-4)

@@ -18,10 +18,12 @@
 from internlm.apis.inference import SequenceGenerator
 from internlm.core.context import global_context as gpc
 from internlm.data import build_generation_loader_with_data_type
-from internlm.initialize import initialize_distributed_env
+from internlm.initialize import initialize_launcher
+from internlm.initialize.initialize_model import (
+    initialize_model_and_parallel_communicator,
+)
 from internlm.monitor import initialize_monitor_manager
-from internlm.monitor.monitor import monitor_manager as mm
-from internlm.train import initialize_model_and_parallel_communicator
+from internlm.monitor import monitor_manager as mm
 from internlm.utils.common import (
     enable_pytorch_expandable_segments,
     launch_time,
@@ -219,7 +221,7 @@ def main():
     hostname = socket.gethostname()

     # initialize distributed environment
-    initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)
+    initialize_launcher(config=args.config, launcher=args.launcher, distributed_port=args.port, seed=args.seed)
     assert hasattr(gpc, "config") and gpc.config is not None
     assert "generation" in gpc.config, f"Please set `generation` config in `{args.config}` file"
     assert (

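Taken together, the initializer interface change means downstream entry scripts swap `initialize_distributed_env` for `initialize_launcher` (note `master_port` becomes `distributed_port`) and import `initialize_model_and_parallel_communicator` from `internlm.initialize.initialize_model` instead of `internlm.train`. Here is a hedged sketch of the adapted setup path, based only on the names and keyword arguments visible in this diff; the return value of the model initializer is an assumption:

```python
# Hedged sketch of the refactored initializer interface, based on the imports and the
# call shown in generate.py above; the return values are assumptions, not shown here.
from internlm.initialize import initialize_launcher
from internlm.initialize.initialize_model import (
    initialize_model_and_parallel_communicator,
)


def setup(args):
    # Replaces initialize_distributed_env(...); master_port is now distributed_port.
    initialize_launcher(
        config=args.config,
        launcher=args.launcher,
        distributed_port=args.port,
        seed=args.seed,
    )
    # Previously imported from internlm.train; assumed to return the wrapped model
    # together with its parallel communicator, as the function name suggests.
    model, communicator = initialize_model_and_parallel_communicator()
    return model, communicator
```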

internlm/__init__.py (-9)

@@ -1,9 +0,0 @@
-from .initialize.initialize_trainer import initialize_trainer
-from .initialize.launch import get_default_parser, launch_from_slurm, launch_from_torch
-
-__all__ = [
-    "get_default_parser",
-    "initialize_trainer",
-    "launch_from_slurm",
-    "launch_from_torch",
-]

internlm/accelerator/abstract_accelerator.py (+21/-4)

@@ -1,8 +1,10 @@
 """
 Universal accelerator interface implementation, inspired by DeepSpeed.
 """
+import abc
 import enum
 import os
+from abc import ABC


 class AcceleratorType(enum.Enum):
@@ -17,57 +19,72 @@ class AcceleratorType(enum.Enum):
 internlm_accelerator = None


-class Accelerator:
+class Accelerator(ABC):
     """
     Abstract base class for accelerator
     """

     def __init__(self) -> None:
-        pass
+        self._name_str = None
+        self._communication_backend_name = None

+    @abc.abstractmethod
     def get_backend_name(self):
         """
         Return the name of the accelerator.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def get_accelerator_backend(self):
         """
-        Return the name of the backend.
+        Return the name of the accelerator backend.
         """
         raise NotImplementedError

-    # Device APIs
+    @abc.abstractmethod
+    def communication_backend_name(self):
+        """
+        Return the name of the communication backend.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
     def device_name(self, device_index=None):
         """
         Return the name of the device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def set_device(self, device_index):
         """
         Bind the current process to a device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def get_device_id(self):
         """
         Return the current device index.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def current_device_name(self):
         """
         Return the name of the current device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def device_count(self):
         """
         Return the number of devices on the machine.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def synchronize(self, device_index=None):
         """
         Synchronize the current process.

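Because `Accelerator` is now an `abc.ABC` whose `__init__` sets shared fields and whose device hooks are `@abc.abstractmethod`, concrete backends must call `super().__init__()` (as the CUDA/DIPU/ditorch/NPU subclasses below now do) and override every abstract hook, or they fail at instantiation rather than at call time. The following self-contained sketch mirrors that pattern with a toy base class; it is illustrative only and not the repository's actual `Accelerator`:

```python
# Illustrative, self-contained sketch of the ABC pattern this commit applies to
# Accelerator; the names here are toy stand-ins, not the repository's real classes.
import abc
from abc import ABC


class _AcceleratorBase(ABC):
    def __init__(self) -> None:
        # Shared state that concrete backends overwrite, as in the refactored base.
        self._name_str = None
        self._communication_backend_name = None

    @abc.abstractmethod
    def get_backend_name(self):
        """Return the name of the accelerator."""
        raise NotImplementedError

    @abc.abstractmethod
    def communication_backend_name(self):
        """Return the name of the communication backend."""
        raise NotImplementedError


class _ToyCudaAccelerator(_AcceleratorBase):
    def __init__(self) -> None:
        super().__init__()  # required now that the base __init__ does real work
        self._name_str = "cuda"
        self._communication_backend_name = "nccl"

    def get_backend_name(self):
        return self._name_str

    def communication_backend_name(self):
        return self._communication_backend_name


acc = _ToyCudaAccelerator()
assert acc.get_backend_name() == "cuda"
# A subclass that omits one of the @abc.abstractmethod overrides now raises
# TypeError at construction time instead of NotImplementedError at the call site.
```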

internlm/accelerator/cuda_accelerator.py (+1)

@@ -14,6 +14,7 @@ class CUDA_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/dipu_accelerator.py (+1)

@@ -14,6 +14,7 @@ class DIPU_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/ditorch_accelerator.py (+1)

@@ -14,6 +14,7 @@ class DITORCH_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/npu_accelerator.py (+1)

@@ -14,6 +14,7 @@ class ASCEND_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "npu"
         self._communication_backend_name = "hccl"
         self.amp = self.get_amp()

internlm/apis/inference_utils.py (+2/-2)

@@ -2,7 +2,7 @@

 from internlm.core.context import ParallelMode  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
-from internlm.core.parallel.comm.utils import _gather as gather
+from internlm.core.parallel.comm.utils import _gather


 class InferenceParams:
@@ -64,6 +64,6 @@ def process_parallel_output(model_output):

     # gather tp parallel output
     if gpc.config.model.parallel_output and gpc.is_initialized(ParallelMode.TENSOR):
-        return gather(model_output, ParallelMode.TENSOR, -1)
+        return _gather(model_output, ParallelMode.TENSOR, -1)
     else:
         return model_output
