Commit 841125e

update src reorg
1 parent b8dc579 commit 841125e

40 files changed, +331 -390 lines changed
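
The commit is a source-tree reorganization. As a quick orientation before the per-file diffs, the sketch below collects the import-path moves visible in the hunks of this commit; it summarizes only what is shown here and is not an exhaustive migration guide.

```python
# Module moves visible in this commit's hunks (old path -> new path):
#   internlm.launcher.launch            -> internlm.launch.launcher      (training entrypoint)
#   internlm.apis.inference             -> internlm.inference            (SequenceGenerator)
#   internlm.apis (InferenceParams)     -> internlm.inference
#   internlm.core.trainer (Trainer)     -> internlm.train.trainer
#   internlm.core.trainer (TrainState)  -> internlm.train.train_state
#   internlm.data.train_state           -> deleted (get_train_state replaced by TrainState(...))

# Downstream code would update its imports along these lines:
from internlm.inference import InferenceParams, SequenceGenerator
from internlm.train.train_state import TrainState
from internlm.train.trainer import Trainer
```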

README-ja-JP.md
+1 -1

@@ -99,7 +99,7 @@ data = dict(
 
 When using 2 nodes and 16 cards in a Slurm environment, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 When running on 1 node with 8 cards using torch, the command is as follows:

README-zh-Hans.md
+1 -1

@@ -99,7 +99,7 @@ data = dict(
 
 In a slurm environment, with 2 nodes and 16 cards, the command to start training is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 In a torch environment, with a single node and 8 cards, the command to start training is as follows:

README.md
+1 -1

@@ -99,7 +99,7 @@ Training can be started on slurm or torch distributed environment.
 
 On slurm, using 2 nodes and 16 cards, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 On torch, using 1 node and 8 cards, the command is as follows:

ci_scripts/train/load_ckpt.sh
+1 -1

@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi
 
-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launch/launcher.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

ci_scripts/train/slurm_train.sh
+1 -1

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python internlm/launch/launcher.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS20_OUTPUT}")

ci_scripts/train/torchrun.sh
+1 -1

@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 internlm/launcher/launch.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 internlm/launch/launcher.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS_OUTPUT}")

doc/code-docs/source/example/20B_demo.rst
+1 -1

@@ -167,7 +167,7 @@
 
 .. code-block:: bash
 
-    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/20B_sft.py
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/20B_sft.py
 
 Training Results
 ----------------

doc/code-docs/source/example/7B_demo.rst
+1 -1

@@ -165,7 +165,7 @@
 
 .. code-block:: bash
 
-    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 
 Training Results
 ----------------

doc/en/usage.md
+1 -1

@@ -407,7 +407,7 @@ After completing the data preparation and relevant training configurations menti
 If you want to start distributed training on slurm with 16 GPUs across multiple nodes, use the following command:
 
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 If you want to start distributed training on torch with 8 GPUs on a single node, use the following command:

doc/usage.md
+1 -1

@@ -453,7 +453,7 @@ parallel = dict(
 
 To launch the distributed environment on slurm with 16 cards across multiple nodes, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 To launch the distributed environment on torch with 8 cards on a single node, the command is as follows:

generate.py
+1 -1

@@ -15,7 +15,7 @@
 from tqdm import tqdm
 
 from internlm.accelerator import get_accelerator
-from internlm.apis.inference import SequenceGenerator
+from internlm.inference import SequenceGenerator
 from internlm.core.context import global_context as gpc
 from internlm.data import build_generation_loader_with_data_type
 from internlm.initialize import initialize_launcher

internlm/checkpoint/checkpoint_manager.py
+1 -1

@@ -10,7 +10,7 @@
 from internlm.accelerator import get_accelerator
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.core.trainer import TrainState
+from internlm.train.train_state import TrainState
 from internlm.model.model_implementations.registry import model_initializer
 from internlm.model.model_implementations.transformers.base_model import (
     BaseTransformerModel,

internlm/checkpoint/components.py
+1 -1

@@ -8,7 +8,7 @@
 from internlm.accelerator import get_accelerator
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.core.trainer import TrainState
+from internlm.train.train_state import TrainState
 from internlm.model.model_ops.moe import MoE
 from internlm.solver.optimizer import HybridZeroOptimizer, HybridZeroOptimizer_v2
 from internlm.utils.common import get_current_device
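
Both checkpoint modules now import TrainState from its new home. For context, the launcher hunk later in this commit constructs the relocated class directly; the snippet below is a minimal hedged sketch of that pattern (the constructor arguments are taken from that hunk, not from separate documentation).

```python
from torch.utils.data import DataLoader

from internlm.core.context import global_context as gpc
from internlm.train.train_state import TrainState


def build_train_state(train_dl: DataLoader) -> TrainState:
    # Replaces the deleted internlm.data.train_state.get_train_state(train_dl) helper:
    # the relocated TrainState is constructed from the global config and the
    # loader's batch sampler, as shown in the internlm/launch/launcher.py hunk.
    return TrainState(gpc.config, train_dl.batch_sampler)
```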

internlm/core/__init__.py
+1 -1

@@ -1,6 +1,6 @@
 from .engine import Engine
 from .naive_amp import NaiveAMPModel
-from .trainer import Trainer
+from ..train.trainer import Trainer
 
 __all__ = [
     "NaiveAMPModel",

internlm/core/scheduler/base_scheduler.py
+1 -1

@@ -8,8 +8,8 @@
 
 import torch
 
-from internlm.apis import InferenceParams
 from internlm.core.engine import Engine
+from internlm.inference import InferenceParams
 
 
 class BaseScheduler(ABC):

internlm/data/train_state.py

-19
This file was deleted.

@@ -1,6 +1,9 @@
+from .inference import SequenceGenerator, batch_tokenize
 from .inference_utils import InferenceParams, process_parallel_output
 
 __all__ = [
     "InferenceParams",
     "process_parallel_output",
+    "SequenceGenerator",
+    "batch_tokenize",
 ]
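
This hunk promotes the generation helpers to package-level re-exports. Assuming it is the `__init__.py` of the new `internlm.inference` package (the imports added elsewhere in this commit resolve through it), consumers can pull everything from one place:

```python
# These names are re-exported at package level by the hunk above (assumption:
# the hunk belongs to the new internlm.inference package's __init__.py).
from internlm.inference import (
    InferenceParams,
    SequenceGenerator,
    batch_tokenize,
    process_parallel_output,
)
```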

internlm/apis/inference.py → internlm/inference/inference.py
+2 -23

@@ -3,32 +3,11 @@
 
 import torch
 import torch.nn.functional as F
-from torch import nn
 
-from internlm.apis import InferenceParams, process_parallel_output
 from internlm.core.context import ParallelMode # noqa: E402
 from internlm.core.context import global_context as gpc # noqa: E402
-from internlm.core.trainer import Trainer
-
-__all__ = ["SequenceGenerator"]
-
-
-def _get_model_device(model):
-    """
-    obtain the device of an nn.Module.model
-
-    Args:
-        model: nn.Module
-
-    Return: torch.device. if None, the parameters of this model is None.
-    """
-    assert isinstance(model, nn.Module)
-
-    parameters = list(model.parameters())
-    if len(parameters) == 0:
-        return None
-    else:
-        return parameters[0].device
+from internlm.train.trainer import Trainer
+from internlm.inference import InferenceParams, process_parallel_output
 
 
 class SequenceGenerator:
File renamed without changes.

internlm/initialize/initialize_trainer.py
+1 -1

@@ -24,7 +24,7 @@
     ZeroBubblePipelineVShapeScheduler,
 )
 from internlm.core.scheduler.pipeline_scheduler_1f1b import get_tensor_shape
-from internlm.core.trainer import Trainer
+from internlm.train.trainer import Trainer
 from internlm.data.utils import packed_data_normalizer, unpack_data
 from internlm.solver.optimizer import BaseOptimizer
 from internlm.solver.schedulers import Beta2Scheduler
File renamed without changes.

internlm/core/trainer_builder.py → internlm/launch/launcher.py
+51 -33

@@ -1,43 +1,33 @@
-import gc
-import logging
-import time
-from functools import partial
-from typing import Dict, List, Optional, Union
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
 
-import torch
+from internlm.checkpoint.checkpoint_manager import CheckpointManager
 import torch.distributed as dist
 from torch.utils.data import DataLoader
-
-from internlm.checkpoint.checkpoint_manager import CheckpointManager
-from internlm.core.context import ParallelMode
-from internlm.core.context import global_context as gpc
+from functools import partial
+from typing import Dict, List
+from internlm.core.context import ParallelMode, global_context as gpc
 from internlm.core.parallel.comm import initialize_offload_manager
-from internlm.core.trainer import (
-    Trainer,
-    get_scheduler_hooks,
-    load_new_batch,
-    record_current_batch_training_metrics,
+from internlm.train.utils import get_scheduler_hooks, load_new_batch, record_current_batch_training_metrics
+from internlm.data import (
+    build_train_loader_with_data_type,
+    build_valid_loader_with_data_type,
 )
 from internlm.data.streaming.utils import streaming_simple_resume
-from internlm.data.train_state import get_train_state
 from internlm.eval import evaluate_on_val_dls
-from internlm.initialize import initialize_trainer
-from internlm.initialize.initialize_model import (
-    initialize_model_and_parallel_communicator,
-)
+from internlm.initialize import initialize_launcher, initialize_trainer
+from internlm.initialize.initialize_model import initialize_model_and_parallel_communicator
 from internlm.initialize.initialize_optimizer import initialize_optimizer
 from internlm.initialize.initialize_profiler import initialize_llm_profile
+from internlm.launch.trainer_builder import logger
+from internlm.model.model_implementations.builder import create_model
+from internlm.model.model_implementations.registry import register_model_initializer
 from internlm.model.model_ops.losses.ce_loss import InternLoss
 from internlm.model.model_ops.metrics import AccPerplex
-from internlm.monitor import send_alert_message
-from internlm.utils.common import (
-    BatchSkipper,
-    check_cuda_env,
-    enable_pytorch_expandable_segments,
-    get_current_device,
-    get_megatron_flops,
-    launch_time,
-)
+from internlm.monitor import internevo_monitor, send_alert_message
+from internlm.train.train_state import TrainState
+from internlm.train.trainer import Trainer
+from internlm.utils.common import BatchSkipper, check_cuda_env, enable_pytorch_expandable_segments, get_current_device, get_megatron_flops, launch_time, parse_args
 from internlm.utils.gputest import empty_cache_and_diag
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer

@@ -46,9 +36,6 @@
 from internlm.utils.utils import DataType
 from internlm.utils.writer import Writer
 
-# global llm logger
-logger = logging.getLogger(__file__)
-
 
 class TrainerBuilder(Trainer):
     """

@@ -117,7 +104,7 @@ def __init__(
         initialize_offload_manager(gpc.config.get("selective_checkpoint_offload", False))
 
         # initialize train state
-        train_state = get_train_state(train_dl)
+        train_state = TrainState(gpc.config, train_dl.batch_sampler)
 
         # initialize optimizer
         optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator)

@@ -385,3 +372,34 @@ def _update_profilers(self, batch_count: int, prof):
             self.memory_profiler.step()
         if batch_count % 2 == 0:
             prof.step()
+
+
+@internevo_monitor(feishu_alert=True, clean_run=True)
+def main(args):
+    # initialize model
+    register_model_initializer()
+    model = create_model()
+
+    # initialize train dataloader
+    train_dl, dataset_types = build_train_loader_with_data_type()
+
+    # initialize validation dataloader
+    val_dls = build_valid_loader_with_data_type()
+
+    # build trainer
+    merged_args = {**vars(args), "dataset_types": dataset_types}
+    trainer = TrainerBuilder(model, train_dl, val_dls, **merged_args)
+
+    # training
+    trainer.fit()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # Initialize distributed environment
+    initialize_launcher(config=args.config, launcher=args.launcher, distributed_port=args.port, seed=args.seed)
+    assert hasattr(gpc, "config") and gpc.config is not None
+
+    # Run the main function with parsed arguments
+    main(args)
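
With the entrypoint relocated, the README and CI hunks in this commit invoke it as `python -m internlm.launch.launcher` under slurm, or via `torchrun ... internlm/launch/launcher.py --launcher torch`. The sketch below is a hypothetical programmatic driver assembled only from names visible in the hunk above; it mirrors the new `main()` flow rather than documenting a separate public API.

```python
# Hypothetical driver mirroring the new __main__ block of internlm/launch/launcher.py;
# parse_args and initialize_launcher's keyword names are taken from the hunk above.
from internlm.core.context import global_context as gpc
from internlm.initialize import initialize_launcher
from internlm.launch.launcher import main
from internlm.utils.common import parse_args

if __name__ == "__main__":
    args = parse_args()

    # Set up the distributed environment exactly as the relocated launcher does.
    initialize_launcher(config=args.config, launcher=args.launcher,
                        distributed_port=args.port, seed=args.seed)
    assert hasattr(gpc, "config") and gpc.config is not None

    # Build the model, dataloaders, and TrainerBuilder, then run training.
    main(args)
```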

internlm/launcher/launch.py

-45
This file was deleted.
