Update import paths and documented launch commands after the src reorganization

zigzagcai · zigzagcai · commit 3407aee1eaf2 · 2025-02-19T15:06:30.000+08:00
diff --git a/README-ja-JP.md b/README-ja-JP.md
@@ -99,7 +99,7 @@ data = dict(
 
 Slurm環境で2ノード16カードを使用する場合、コマンドは以下の通りです：
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 torchを使用し、1ノード8カードで実行する場合、コマンドは以下の通りです：
diff --git a/README-zh-Hans.md b/README-zh-Hans.md
@@ -99,7 +99,7 @@ data = dict(
 
 slurm环境，双机16卡，启动训练命令如下：
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 torch环境，单机8卡，启动训练命令如下：
diff --git a/README.md b/README.md
@@ -99,7 +99,7 @@ Training can be started on slurm or torch distributed environment.
 
 On slurm, using 2 nodes and 16 cards, the command is as follows:
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 On torch, using 1 node and 8 cards, the command is as follows:
diff --git a/doc/code-docs/source/example/20B_demo.rst b/doc/code-docs/source/example/20B_demo.rst
@@ -167,7 +167,7 @@
 
 .. code-block:: bash
 
-    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/20B_sft.py
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/20B_sft.py
 
 训练结果
 ----------------
diff --git a/doc/code-docs/source/example/7B_demo.rst b/doc/code-docs/source/example/7B_demo.rst
@@ -165,7 +165,7 @@
 
 .. code-block:: bash
 
-    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 
 训练结果
 ----------------
diff --git a/doc/en/usage.md b/doc/en/usage.md
@@ -407,7 +407,7 @@ After completing the data preparation and relevant training configurations menti
 If you want to start distributed training on slurm with 16 GPUs across multiple nodes, use the following command:
 
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 If you want to start distributed training on torch with 8 GPUs on a single node, use the following command:
diff --git a/doc/usage.md b/doc/usage.md
@@ -453,7 +453,7 @@ parallel = dict(
 
 若在 slurm 上启动分布式运行环境，多节点 16 卡的运行命令如下所示：
 ```bash
-$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launcher.launch --config ./configs/7B_sft.py
+$ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python -m internlm.launch.launcher --config ./configs/7B_sft.py
 ```
 
 若在 torch 上启动分布式运行环境，单节点 8 卡的运行命令如下所示：
diff --git a/generate.py b/generate.py
@@ -15,7 +15,7 @@
 from tqdm import tqdm
 
 from internlm.accelerator import get_accelerator
-from internlm.apis.inference import SequenceGenerator
+from internlm.inference import SequenceGenerator
 from internlm.core.context import global_context as gpc
 from internlm.data import build_generation_loader_with_data_type
 from internlm.initialize import initialize_launcher
diff --git a/internlm/core/scheduler/base_scheduler.py b/internlm/core/scheduler/base_scheduler.py
@@ -8,7 +8,7 @@
 
 import torch
 
-from internlm.apis import InferenceParams
+from internlm.inference import InferenceParams
 from internlm.core.engine import Engine
 
 
diff --git a/internlm/inference/__init__.py b/internlm/inference/__init__.py
@@ -1,6 +1,9 @@
 from .inference_utils import InferenceParams, process_parallel_output
+from .inference import SequenceGenerator, batch_tokenize
 
 __all__ = [
     "InferenceParams",
     "process_parallel_output",
+    "SequenceGenerator",
+    "batch_tokenize",
 ]
diff --git a/internlm/inference/inference.py b/internlm/inference/inference.py
@@ -3,33 +3,12 @@
 
 import torch
 import torch.nn.functional as F
-from torch import nn
 
-from internlm.apis import InferenceParams, process_parallel_output
+from internlm.inference import InferenceParams, process_parallel_output
 from internlm.core.context import ParallelMode  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
 from internlm.core.trainer import Trainer
 
-__all__ = ["SequenceGenerator"]
-
-
-def _get_model_device(model):
-    """
-    obtain the device of an nn.Module.model
-
-    Args:
-        model: nn.Module
-
-    Return: torch.device. if None, the parameters of this model is None.
-    """
-    assert isinstance(model, nn.Module)
-
-    parameters = list(model.parameters())
-    if len(parameters) == 0:
-        return None
-    else:
-        return parameters[0].device
-
 
 class SequenceGenerator:
     """
diff --git a/internlm/inference/inference_utils.py b/internlm/inference/inference_utils.py
diff --git a/internlm/launch/__init__.py b/internlm/launch/__init__.py
diff --git a/internlm/launch/launcher.py b/internlm/launch/launcher.py
@@ -2,7 +2,7 @@
 # -*- encoding: utf-8 -*-
 
 from internlm.core.context import global_context as gpc
-from internlm.core.trainer_builder import TrainerBuilder
+from internlm.launch.trainer_builder import TrainerBuilder
 from internlm.data import (
     build_train_loader_with_data_type,
     build_valid_loader_with_data_type,
diff --git a/internlm/launch/trainer_builder.py b/internlm/launch/trainer_builder.py
diff --git a/internlm/monitor/monitor.py b/internlm/monitor/monitor.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import fcntl
 import logging
 import os
@@ -15,7 +16,6 @@
 from internlm.monitor import send_feishu_msg_with_webhook
 from internlm.utils.common import SingletonMeta, set_env_var
 
-from .utils import get_job_key
 
 logger = logging.getLogger(__file__)
 internlm_accelerator = get_accelerator()
@@ -55,6 +55,34 @@ def execute_with_exception_handling(func, *args, **kwargs):
     return decorator
 
 
+def now_time():
+    return datetime.now().strftime("%b%d_%H-%M-%S")
+
+
+def get_job_id():
+    job_id = "none"
+    if os.getenv("SLURM_JOB_ID") is not None:
+        job_id = os.getenv("SLURM_JOB_ID")
+    elif os.getenv("KUBERNETES_POD_NAME") is not None:
+        job_id = os.getenv("KUBERNETES_POD_NAME").split("-")[0]
+    elif os.getenv("MLP_TASK_INSTANCE_ID") is not None:
+        job_id = os.getenv("MLP_TASK_ID")
+
+    return job_id
+
+
+def get_job_name():
+    job_name = f"unknown-{now_time()}"
+    if os.getenv("JOB_NAME") is not None:
+        job_name = os.getenv("JOB_NAME")
+
+    return job_name
+
+
+def get_job_key():
+    return f"{get_job_id()}_{get_job_name()}"
+
+
 def send_alert_message(address: str = None, title: str = None, message: str = None):
     """
     Send alert messages to the given Feishu webhook address in log rank.
diff --git a/internlm/monitor/utils.py b/internlm/monitor/utils.py
diff --git a/tests/test_infer/test_generate.py b/tests/test_infer/test_generate.py
@@ -4,7 +4,7 @@
 import torch
 from sentencepiece import SentencePieceProcessor
 
-from internlm.apis.inference import SequenceGenerator, batch_tokenize
+from internlm.inference import SequenceGenerator, batch_tokenize
 from internlm.initialize import initialize_launcher  # noqa: E402
 from internlm.initialize.initialize_model import (
     initialize_model_and_parallel_communicator,
diff --git a/tests/test_infer/test_trainer_generate.py b/tests/test_infer/test_trainer_generate.py
@@ -3,7 +3,7 @@
 import pytest
 from sentencepiece import SentencePieceProcessor
 
-from internlm.apis.inference import SequenceGenerator, batch_tokenize
+from internlm.inference import SequenceGenerator, batch_tokenize
 from internlm.checkpoint import CheckpointManager  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
 from internlm.core.trainer import Trainer, TrainState  # noqa: E402
diff --git a/tools/load_internlm2_model.py b/tools/load_internlm2_model.py
@@ -7,7 +7,7 @@
 
 import torch
 
-from internlm.apis.inference import SequenceGenerator
+from internlm.inference import SequenceGenerator
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.initialize import initialize_launcher

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,9 @@`
`1`	`1`	`from .inference_utils import InferenceParams, process_parallel_output`
	`2`	`+from .inference import SequenceGenerator, batch_tokenize`
`2`	`3`
`3`	`4`	`__all__ = [`
`4`	`5`	`    "InferenceParams",`
`5`	`6`	`    "process_parallel_output",`
	`7`	`+    "SequenceGenerator",`
	`8`	`+    "batch_tokenize",`
`6`	`9`	`]`