
Commit a70aaf6

Initial refactor: (1) reorganize the src structure to avoid cyclic imports; (2) remove legacy and historical code; (3) refine the initializer interface.
1 parent 0bc7552 commit a70aaf6

162 files changed: +1974 / -2187 lines changed


.github/workflows/demo_in_readme.yaml (+2)

@@ -63,6 +63,7 @@ jobs:
           export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
           export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
+          export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           EOF
@@ -97,6 +98,7 @@ jobs:
           export GITHUB_WORKSPACE=$GITHUB_WORKSPACE
           export SLURM_PARTITION=$SLURM_PARTITION
           source activate ${evo_env_torch21_flash2}
+          export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
           EOF

.github/workflows/lint_check.yaml (+2/-6)

@@ -18,25 +18,21 @@ jobs:
       run: |
         pip install flake8==v3.8.4
         FLAKE_DISABLE_LIST="F403,F405,W504,W503,E203"
-        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST ./train.py
+        flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*

     - name: lint-isort
       run: |
         pip install isort==5.12.0
         isort --check --profile=black ./internlm/*
-        isort --check --profile=black ./train.py

     - name: lint-black
       run: |
         pip install black==22.8.0
         BLACK_EXCLUDE_SETTINGS='\.venv/|\.local/|\.cache/|\.git/'
         black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./internlm/*
-        black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./train.py

     - name: lint-pylint
       run: |
         pip install pylint==v2.17.2
         PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203"
-        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py
+        pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/model_ops/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*

README-ja-JP.md (+1/-1)

@@ -99,7 +99,7 @@ data = dict(

 Slurm環境で2ノード16カードを使用する場合、コマンドは以下の通りです:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 torchを使用し、1ノード8カードで実行する場合、コマンドは以下の通りです:

README-zh-Hans.md (+1/-1)

@@ -99,7 +99,7 @@ data = dict(

 slurm环境,双机16卡,启动训练命令如下:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 torch环境,单机8卡,启动训练命令如下:

README.md (+1/-1)

@@ -99,7 +99,7 @@ Training can be started on slurm or torch distributed environment.

 On slurm, using 2 nodes and 16 cards, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 On torch, using 1 node and 8 cards, the command is as follows:

ci_scripts/train/generate_config.py (+1/-1)

@@ -5,7 +5,7 @@
 import os

 from ci_scripts.common import com_func
-from internlm.core.context import Config
+from internlm.utils.config import Config


 def generate_new_config(config_py_file, test_config_json, case_name):

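The `Config` helper now lives under `internlm.utils.config` rather than `internlm.core.context`. A minimal, hypothetical usage sketch follows; only the new import path comes from this commit, and the dict-style construction is an assumption about the helper, not something shown in the diff.

```python
# Hypothetical sketch: the import path is what this commit changes; the dict-style
# construction below is an assumption about the Config helper, not part of the diff.
from internlm.utils.config import Config  # was: from internlm.core.context import Config

cfg = Config(dict(model_type="INTERNLM", data=dict(seq_len=2048)))
print(cfg["model_type"])  # dict-like access, assuming Config behaves like a mapping
```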

ci_scripts/train/load_ckpt.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

ci_scripts/train/slurm_train.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS20_OUTPUT}")

ci_scripts/train/torchrun.sh (+1/-1)

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS_OUTPUT}")

doc/code-docs/locales/en/LC_MESSAGES/training.po (+2/-2)

@@ -68,10 +68,10 @@ msgstr "Initialize Distributed Training Environment"

 #: ../../source/training.rst:23
 msgid ""
-"调用 ``initialize_distributed_env`` 函数,支持通过 slurm 或 torch "
+"调用 ``init_distributed`` 函数,支持通过 slurm 或 torch "
 "方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:"
 msgstr ""
-"Call the initialize_distributed_env function, which supports launching "
+"Call the init_distributed function, which supports launching "
 "the training script through Slurm or Torch, and pass in information such "
 "as the configuration file, port number, and process random seed. Detailed"
 " description of the function is as follows:"

doc/code-docs/source/example/20B_demo.rst (+1/-1)

@@ -167,7 +167,7 @@

 .. code-block:: bash

-    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/20B_sft.py
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/20B_sft.py

 训练结果
 ----------------

doc/code-docs/source/example/7B_demo.rst (+1/-1)

@@ -165,7 +165,7 @@

 .. code-block:: bash

-    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py

 训练结果
 ----------------

doc/code-docs/source/initialize.rst (+1/-1)

@@ -43,7 +43,7 @@ InternEvo 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_
 模型初始化
 -------------------------

-.. autofunction:: internlm.train.initialize_model_and_parallel_communicator
+.. autofunction:: internlm.initialize.initialize_model.initialize_model_and_parallel_communicator

 InternEvo 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:

doc/code-docs/source/training.rst (+3/-3)

@@ -18,11 +18,11 @@
 - 初始化分布式训练环境
   .. code-block:: python

-      initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)
+      init_distributed(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)

-  调用 ``initialize_distributed_env`` 函数,支持通过 slurm 或 torch 方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:
+  调用 ``init_distributed`` 函数,支持通过 slurm 或 torch 方式启动训练脚本,并传入配置文件、端口号、进程随机种子等信息。函数详细说明如下:

-  .. autofunction:: internlm.initialize.initialize_distributed_env
+  .. autofunction:: internlm.initialize.init_distributed

 - 初始化模型
   .. code-block:: python

doc/en/usage.md (+1/-1)

@@ -407,7 +407,7 @@ After completing the data preparation and relevant training configurations menti
 If you want to start distributed training on slurm with 16 GPUs across multiple nodes, use the following command:

 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 If you want to start distributed training on torch with 8 GPUs on a single node, use the following command:

doc/usage.md (+1/-1)

@@ -453,7 +453,7 @@ parallel = dict(

 若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
 ```

 若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:

generate.py (+6/-4)

@@ -18,10 +18,12 @@
 from internlm.apis.inference import SequenceGenerator
 from internlm.core.context import global_context as gpc
 from internlm.data import build_generation_loader_with_data_type
-from internlm.initialize import initialize_distributed_env
+from internlm.initialize import initialize_launcher
+from internlm.initialize.initialize_model import (
+    initialize_model_and_parallel_communicator,
+)
 from internlm.monitor import initialize_monitor_manager
-from internlm.monitor.monitor import monitor_manager as mm
-from internlm.train import initialize_model_and_parallel_communicator
+from internlm.monitor import monitor_manager as mm
 from internlm.utils.common import (
     enable_pytorch_expandable_segments,
     launch_time,
@@ -219,7 +221,7 @@ def main():
     hostname = socket.gethostname()

     # initialize distributed environment
-    initialize_distributed_env(config=args.config, launcher=args.launcher, master_port=args.port, seed=args.seed)
+    initialize_launcher(config=args.config, launcher=args.launcher, distributed_port=args.port, seed=args.seed)
     assert hasattr(gpc, "config") and gpc.config is not None
     assert "generation" in gpc.config, f"Please set `generation` config in `{args.config}` file"
     assert (

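Taken together, the initializer interface change means downstream entry scripts swap `initialize_distributed_env` for `initialize_launcher` (note `master_port` becomes `distributed_port`) and import `initialize_model_and_parallel_communicator` from `internlm.initialize.initialize_model` instead of `internlm.train`. Here is a hedged sketch of the adapted setup path, based only on the names and keyword arguments visible in this diff; the return value of the model initializer is an assumption:

```python
# Hedged sketch of the refactored initializer interface, based on the imports and the
# call shown in generate.py above; the return values are assumptions, not shown here.
from internlm.initialize import initialize_launcher
from internlm.initialize.initialize_model import (
    initialize_model_and_parallel_communicator,
)


def setup(args):
    # Replaces initialize_distributed_env(...); master_port is now distributed_port.
    initialize_launcher(
        config=args.config,
        launcher=args.launcher,
        distributed_port=args.port,
        seed=args.seed,
    )
    # Previously imported from internlm.train; assumed to return the wrapped model
    # together with its parallel communicator, as the function name suggests.
    model, communicator = initialize_model_and_parallel_communicator()
    return model, communicator
```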

internlm/__init__.py (-9)

@@ -1,9 +0,0 @@
-from .initialize.initialize_trainer import initialize_trainer
-from .initialize.launch import get_default_parser, launch_from_slurm, launch_from_torch
-
-__all__ = [
-    "get_default_parser",
-    "initialize_trainer",
-    "launch_from_slurm",
-    "launch_from_torch",
-]

internlm/accelerator/abstract_accelerator.py (+21/-4)

@@ -1,8 +1,10 @@
 """
 Universal accelerator interface implementation, inspired by DeepSpeed.
 """
+import abc
 import enum
 import os
+from abc import ABC


 class AcceleratorType(enum.Enum):
@@ -17,57 +19,72 @@ class AcceleratorType(enum.Enum):
 internlm_accelerator = None


-class Accelerator:
+class Accelerator(ABC):
     """
     Abstract base class for accelerator
     """

     def __init__(self) -> None:
-        pass
+        self._name_str = None
+        self._communication_backend_name = None

+    @abc.abstractmethod
     def get_backend_name(self):
         """
         Return the name of the accelerator.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def get_accelerator_backend(self):
         """
-        Return the name of the backend.
+        Return the name of the accelerator backend.
         """
         raise NotImplementedError

-    # Device APIs
+    @abc.abstractmethod
+    def communication_backend_name(self):
+        """
+        Return the name of the communication backend.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
     def device_name(self, device_index=None):
         """
         Return the name of the device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def set_device(self, device_index):
         """
         Bind the current process to a device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def get_device_id(self):
         """
         Return the current device index.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def current_device_name(self):
         """
         Return the name of the current device.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def device_count(self):
         """
         Return the number of devices on the machine.
         """
         raise NotImplementedError

+    @abc.abstractmethod
     def synchronize(self, device_index=None):
         """
         Synchronize the current process.

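Because `Accelerator` is now an `abc.ABC` whose `__init__` sets shared fields and whose device hooks are `@abc.abstractmethod`, concrete backends must call `super().__init__()` (as the CUDA/DIPU/ditorch/NPU subclasses below now do) and override every abstract hook, or they fail at instantiation rather than at call time. The following self-contained sketch mirrors that pattern with a toy base class; it is illustrative only and not the repository's actual `Accelerator`:

```python
# Illustrative, self-contained sketch of the ABC pattern this commit applies to
# Accelerator; the names here are toy stand-ins, not the repository's real classes.
import abc
from abc import ABC


class _AcceleratorBase(ABC):
    def __init__(self) -> None:
        # Shared state that concrete backends overwrite, as in the refactored base.
        self._name_str = None
        self._communication_backend_name = None

    @abc.abstractmethod
    def get_backend_name(self):
        """Return the name of the accelerator."""
        raise NotImplementedError

    @abc.abstractmethod
    def communication_backend_name(self):
        """Return the name of the communication backend."""
        raise NotImplementedError


class _ToyCudaAccelerator(_AcceleratorBase):
    def __init__(self) -> None:
        super().__init__()  # required now that the base __init__ does real work
        self._name_str = "cuda"
        self._communication_backend_name = "nccl"

    def get_backend_name(self):
        return self._name_str

    def communication_backend_name(self):
        return self._communication_backend_name


acc = _ToyCudaAccelerator()
assert acc.get_backend_name() == "cuda"
# A subclass that omits one of the @abc.abstractmethod overrides now raises
# TypeError at construction time instead of NotImplementedError at the call site.
```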

internlm/accelerator/cuda_accelerator.py (+1)

@@ -14,6 +14,7 @@ class CUDA_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/dipu_accelerator.py (+1)

@@ -14,6 +14,7 @@ class DIPU_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/ditorch_accelerator.py (+1)

@@ -14,6 +14,7 @@ class DITORCH_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "cuda"
         self._communication_backend_name = "nccl"
         self.amp = self.get_amp()

internlm/accelerator/npu_accelerator.py (+1)

@@ -14,6 +14,7 @@ class ASCEND_Accelerator(Accelerator):
     """

     def __init__(self) -> None:
+        super().__init__()
         self._name_str = "npu"
         self._communication_backend_name = "hccl"
         self.amp = self.get_amp()

internlm/apis/inference_utils.py (+2/-2)

@@ -2,7 +2,7 @@

 from internlm.core.context import ParallelMode  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
-from internlm.core.parallel.comm.utils import _gather as gather
+from internlm.core.parallel.comm.utils import _gather


 class InferenceParams:
@@ -64,6 +64,6 @@ def process_parallel_output(model_output):

     # gather tp parallel output
     if gpc.config.model.parallel_output and gpc.is_initialized(ParallelMode.TENSOR):
-        return gather(model_output, ParallelMode.TENSOR, -1)
+        return _gather(model_output, ParallelMode.TENSOR, -1)
     else:
         return model_output
