diff --git a/elian/train/qwen3/__init__.py b/elian/train/qwen3/__init__.py new file mode 100644 index 000000000..e2f3b0d76 --- /dev/null +++ b/elian/train/qwen3/__init__.py @@ -0,0 +1 @@ +from .train_config import TrainConfig \ No newline at end of file diff --git a/elian/train/qwen3/main.py b/elian/train/qwen3/main.py new file mode 100644 index 000000000..1c0636e58 --- /dev/null +++ b/elian/train/qwen3/main.py @@ -0,0 +1,32 @@ +import torch.distributed as dist +from cyclopts import App + +from xtuner.v1.train.trainer import Trainer +from train_config import TrainConfig +from xtuner.v1.model import Qwen3Dense4BConfig +from xtuner.v1.config import LRConfig, AdamWConfig + +app = App( + name="entrypoint of sft & pretrain", + help="Elian-XTuner's entry point for fine-tuning and training, launched using configuration files or arguments.", +) + + +@app.default() +def main(): + cfg = TrainConfig() + print(cfg) + model_cfg = Qwen3Dense4BConfig(max_position_embeddings = cfg.max_position_embeddings) + optim_cfg = AdamWConfig(lr=cfg.lr) + lr_cfg = LRConfig(lr_type=cfg.lr_type, lr_min=cfg.lr_min) + trainer = Trainer( + **cfg.to_trainer_kwargs(model_cfg, optim_cfg, lr_cfg) + ) + trainer.fit() + + if dist.is_initialized(): + dist.destroy_process_group() + + +if __name__ == "__main__": + app(exit_on_error=False) diff --git a/elian/train/qwen3/run.sh b/elian/train/qwen3/run.sh new file mode 100644 index 000000000..0b11d0ce1 --- /dev/null +++ b/elian/train/qwen3/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# ========================================== +# Training Script for Multi-GPU Training +# ========================================== +# step1: cd /data1/nuist_llm/TrainLLM/SFT-elian/xtuner +# step2: YOUR_ENV_PATH='/home/202312150002/anaconda3/envs/llm/lib/python3.10/site-packages' +# step3: cp -r ./xtuner $YOUR_ENV_PATH +# step4: bash ./elian/train/qwen3/run.sh + +# conda +export PATH="/home/202312150002/anaconda3/bin:$PATH" +source /home/202312150002/anaconda3/etc/profile.d/conda.sh +conda activate xtuner +TRAIN_PATH=/data1/nuist_llm/TrainLLM/SFT-elian/xtuner/elian/train/qwen3 +cd $TRAIN_PATH || exit 1 + +# cuda +export PATH="/usr/local/cuda-12.4/bin:$PATH" +export LD_LIBRARY_PATH="/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH" +export CUDA_HOME="/usr/local/cuda-12.4" +echo "CUDA version used: $($CUDA_HOME/bin/nvcc --version | grep 'release' | awk '{print $6}')" + +# node +NUM_NODES=1 +GPU_LIST="0,1,2,3" +NUM_GPUS_PER_NODE=$(echo $GPU_LIST | awk -F',' '{print NF}') +export NPROC_PER_NODE=4 +export NNODES=1 +export NODE_RANK=0 +export PORT=10171 +export NODE_0_ADDR=172.16.107.15 +export NCCL_DEBUG=INFO +export NCCL_IB_DISABLE=1 +export OMP_NUM_THREADS=1 +export NCCL_SOCKET_IFNAME=lo,eth0 + +# train params +TRAIN_SCRIPT="$TRAIN_PATH/main.py" +export CUDA_VISIBLE_DEVICES=$GPU_LIST +echo "Elian-Xtuner-V0.2.0 (used GPUs: ${CUDA_VISIBLE_DEVICES})" +export XTUNER_DETERMINISTIC=true # torch.use_deterministic_algorithms + +torchrun --nproc_per_node=$NUM_GPUS_PER_NODE \ + --nnodes=$NUM_NODES \ + --node_rank=$NODE_RANK \ + --master_addr=$NODE_0_ADDR \ + --master_port=$PORT \ + $TRAIN_SCRIPT \ No newline at end of file diff --git a/elian/train/qwen3/test.py b/elian/train/qwen3/test.py new file mode 100644 index 000000000..fa49549b6 --- /dev/null +++ b/elian/train/qwen3/test.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from xtuner.v1.loss.ce_loss import CELossConfig, CELossContextInputItem, CELossContext +import time + + +hidden_states = torch.randn(32768, 4096, device="cuda", 
dtype=torch.bfloat16, requires_grad=True) +lm_head = nn.Linear(4096, 151936, bias=False).to(device="cuda", dtype=torch.bfloat16) +torch.cuda.reset_peak_memory_stats() +t1 = time.time() +logits = lm_head(hidden_states) +shifted_labels = torch.randint(0, 151936, (32768, ), device="cuda") +loss = F.cross_entropy(logits, shifted_labels) +loss.backward() +max_memory = torch.cuda.max_memory_allocated() +reserved_memory = torch.cuda.max_memory_reserved() +print(f"Eager mode Loss: {loss.item()}") +print(f"Eager mode hidden_states grad norm: {hidden_states.grad.norm().item()}") +print(f"Eager mode lm_head weight grad norm: {lm_head.weight.grad.norm().item()}") +print(f"Eager mode Max memory allocated: {max_memory / 1024**3:.2f} GB") +print(f"Eager mode Max memory reserved: {reserved_memory / 1024**3:.2f} GB") +print(f"Eager mode Time taken: {time.time() - t1:.2f} seconds") + +del logits +torch.cuda.empty_cache() +torch.cuda.reset_peak_memory_stats() + +shifted_labels = shifted_labels.unsqueeze(0) +hidden_states = hidden_states.unsqueeze(0) +hidden_states = hidden_states.clone().detach().requires_grad_(True) +lm_head.weight.grad = None +t1 = time.time() +loss_ctx_input_list = [CELossContextInputItem(shifted_labels=shifted_labels)] +loss_cfg = CELossConfig(mode='chunk', chunk_size=1024, loss_reduction="token") +batches_loss_kwargs = CELossContext.build_batches_loss_kwargs(loss_ctx_input_list, loss_cfg) +loss_ctx = CELossContext(loss_cfg, batches_loss_kwargs[0]) +loss, _ = loss_ctx.forward(hidden_states, lm_head.weight) +loss.backward() +max_memory = torch.cuda.max_memory_allocated() +reserved_memory = torch.cuda.max_memory_reserved() +print(f"Chunk mode Loss: {loss.item()}") +print(f"Chunk mode hidden_states grad norm: {hidden_states.grad.norm().item()}") +print(f"Chunk mode lm_head weight grad norm: {lm_head.weight.grad.norm().item()}") +print(f"Chunk mode Max memory allocated: {max_memory / 1024**3:.2f} GB") +print(f"Chunk mode Max memory reserved: {reserved_memory / 1024**3:.2f} GB") +print(f"Chunk mode Time taken: {time.time() - t1:.2f} seconds") \ No newline at end of file diff --git a/elian/train/qwen3/train_config.py b/elian/train/qwen3/train_config.py new file mode 100644 index 000000000..e15f6986b --- /dev/null +++ b/elian/train/qwen3/train_config.py @@ -0,0 +1,198 @@ +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Optional + +from xtuner.v1.train.trainer import ResumeConfig +from xtuner.v1.config import FSDPConfig +from xtuner.v1.loss.ce_loss import CELossConfig +from xtuner.v1.datasets.config import DatasetCombine, DatasetConfig, DataloaderConfig +from xtuner.v1.datasets.sft_tokenize_fn.openai import OpenaiTokenizeFunctionConfig + +@dataclass +class TrainConfig: + # base path + model_path: str = "/data1/nuist_llm/TrainLLM/ModelCkpt/qwen3-4b/instruct-base" + work_dir: str = "/data1/nuist_llm/TrainLLM/SFT-elian/xtuner/elian/save/model-01" + log_dir: str = "/data1/nuist_llm/TrainLLM/SFT-elian/xtuner/elian/save/model-01" + + # data params + dataset_cfg: list = field(default_factory=lambda: [ + "/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/code/Nemotron-Post-Training-V2-code-coldStart.jsonl", + "/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/code/Nemotron-Post-Training-V2-code.jsonl", + "/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/math/Nemotron-Post-Training-V2-math.jsonl", + "/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/other/Nemotron-Post-Training-V2-math-coldStart.jsonl" + ] + 
[f"/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/other/chat-0000{i}-of-00012.jsonl" for i in range(10)] + [ + "/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/other/chat-00010-of-00012.jsonl","/data1/nuist_llm/TrainLLM/datasets/SFT/math/category/other/chat-00011-of-00012.jsonl" + ]) + cache_dir: str = "/data1/nuist_llm/cacheTem/elianXtuner" + class_name: str = "JsonlDataset" # TODO @elian: new parquest + sample_ratio: float = 1.0 + cache_tag: str = "elian-xtuner" + message_template: str = "qwen3" + max_token_size: int = 4096 + max_position_embeddings: int = 4096 + collator: str = "sft_llm_collator" # ["sft_llm_collator", "sft_vllm_collator", "fake_collator"] + pack_level: str = "soft" # ["soft", "none"] # soft is True, none is False for Pack + pack_max_length: int = 8024 # max_position_size + pack_workers: int = 8 + num_workers: int = 8 + + + # train params + global_batch_size: int = 1 + total_epoch: int = 1 + + # fsdp params + sp_size: int = 2 + tp_size: int = 2 + ep_size: int = 1 + recompute_ratio:float = 1.0 + cpu_offload: bool = False + + # loss params + mode: str = "chunk" + chunk_size: int = 1024 + loss_reduction: str = "token" # ["token", "sample", "square"] + + # resume params + resume_from: Optional[str] = None + auto_resume: bool = False + load_optimizer: bool = True + load_dataset: bool = True + load_scheduler: bool = True + strict_load: bool = False + + # save checkpoint step + hf_interval: Optional[int] = 2000 + hf_max_keep: Optional[int] = 1 + checkpoint_interval: Optional[int] = 1000 + checkpoint_maxkeep: Optional[int] = 2 + + # profiling + profile_step: Optional[int] = 1 + profile_time: bool = True + profile_memory: bool = True + intra_layer_micro_batch: int = 1 + + # other + seed: int = 42 + debug: bool = False + backend: str = "nccl" + exp_tracker: str = "tensorboard" + + # optim + lr: float = 6e-5 + weight_decay: float = 0.001 + betas: tuple = (0.9, 0.95) + max_grad_norm: float = 1.0 + lr_type: str = "cosine" # ["cosine", "linear", "constant"] + warmup_ratio: float = 0.03 + lr_min: float = 1e-6 + + def build_resume_cfg(self) -> Optional[ResumeConfig]: + if self.resume_from or self.auto_resume: + return ResumeConfig( + resume_from=self.resume_from, + auto_resume=self.auto_resume, + load_optimizer=self.load_optimizer, + load_dataset=self.load_dataset, + load_scheduler=self.load_scheduler, + ) + return None + + def build_fsdp_cfg(self) -> Optional[FSDPConfig]: + if self.tp_size > 1 or self.ep_size > 1: + return FSDPConfig( + tp_size = self.tp_size, + sp_size = self.sp_size, + ep_size = self.ep_size, + cpu_offload = self.cpu_offload + ) + else: + return None + + def build_loss_cfg(self) -> Optional[CELossConfig]: + if self.mode!="eager" or self.loss_reduction!="token": + return CELossConfig( + mode = self.mode, + chunk_size = self.chunk_size, + loss_reduction = self.loss_reduction + ) + else: + return None + + def build_datasets_cfg(self) -> list[DatasetCombine]: + all_datasets = [] + for data_file in self.dataset_cfg: + data_path = Path(data_file) + name = data_path.stem + tokenize_fn_cfg = OpenaiTokenizeFunctionConfig( + chat_template=self.message_template, + max_length=self.max_token_size + ) + all_datasets.append( + { + "dataset":DatasetConfig( + anno_path=data_file, + cache_dir=self.cache_dir, + name=name, + cache_tag=self.cache_tag, + class_name=self.class_name, + sample_ratio=self.sample_ratio + ), + "tokenize_fn":tokenize_fn_cfg + } + ) + return all_datasets + + def build_dataloader(self) -> DataloaderConfig: + return DataloaderConfig( + collator = 
self.collator, + pack_level = self.pack_level, + pack_max_length = self.pack_max_length, + pack_workers = self.pack_workers, + num_workers = self.num_workers + ) + + def to_trainer_kwargs(self, model_cfg, optim_cfg, lr_cfg): + return dict( + model_cfg=model_cfg, + tokenizer_path=self.model_path, + load_from=self.model_path, + optim_cfg=optim_cfg, + lr_cfg=lr_cfg, + global_batch_size = self.global_batch_size, + work_dir = self.work_dir, + log_dir = self.log_dir, + sp_size = self.sp_size, + total_epoch = self.total_epoch, + checkpoint_interval = self.checkpoint_interval, + checkpoint_maxkeep = self.checkpoint_maxkeep, + hf_interval = self.hf_interval, + hf_max_keep = self.hf_max_keep, + exp_tracker = self.exp_tracker, + profile_step = self.profile_step, + profile_time = self.profile_time, + profile_memory = self.profile_memory, + intra_layer_micro_batch = self.intra_layer_micro_batch, + seed = self.seed, + debug = self.debug, + backend = self.backend, + resume_cfg=self.build_resume_cfg(), + fsdp_cfg=self.build_fsdp_cfg(), + loss_cfg=self.build_loss_cfg(), + dataset_cfg=self.build_datasets_cfg(), + dataloader_cfg=self.build_dataloader() + ) + + def __str__(self): + cfg_dict = asdict(self) + max_key_len = max(len(k) for k in cfg_dict.keys()) + lines = [] + for k, v in cfg_dict.items(): + lines.append(f"{k:<{max_key_len}} : {v}") + return "\n".join(lines) + +if __name__ == "__main__": + cfg = TrainConfig() + print(cfg) \ No newline at end of file diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ce60f33cf..778304b80 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -2,15 +2,12 @@ bitsandbytes==0.45.0 datasets<4.0.0 einops loguru -mmengine@git+https://github.com/open-mmlab/mmengine.git@c1724c6 openpyxl peft>=0.14.0 scikit-image scipy SentencePiece tiktoken -torch>=2.6.0 -torchvision transformers==4.56.0 cyclopts transformers_stream_generator diff --git a/test.py b/test.py new file mode 100644 index 000000000..fa49549b6 --- /dev/null +++ b/test.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from xtuner.v1.loss.ce_loss import CELossConfig, CELossContextInputItem, CELossContext +import time + + +hidden_states = torch.randn(32768, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True) +lm_head = nn.Linear(4096, 151936, bias=False).to(device="cuda", dtype=torch.bfloat16) +torch.cuda.reset_peak_memory_stats() +t1 = time.time() +logits = lm_head(hidden_states) +shifted_labels = torch.randint(0, 151936, (32768, ), device="cuda") +loss = F.cross_entropy(logits, shifted_labels) +loss.backward() +max_memory = torch.cuda.max_memory_allocated() +reserved_memory = torch.cuda.max_memory_reserved() +print(f"Eager mode Loss: {loss.item()}") +print(f"Eager mode hidden_states grad norm: {hidden_states.grad.norm().item()}") +print(f"Eager mode lm_head weight grad norm: {lm_head.weight.grad.norm().item()}") +print(f"Eager mode Max memory allocated: {max_memory / 1024**3:.2f} GB") +print(f"Eager mode Max memory reserved: {reserved_memory / 1024**3:.2f} GB") +print(f"Eager mode Time taken: {time.time() - t1:.2f} seconds") + +del logits +torch.cuda.empty_cache() +torch.cuda.reset_peak_memory_stats() + +shifted_labels = shifted_labels.unsqueeze(0) +hidden_states = hidden_states.unsqueeze(0) +hidden_states = hidden_states.clone().detach().requires_grad_(True) +lm_head.weight.grad = None +t1 = time.time() +loss_ctx_input_list = [CELossContextInputItem(shifted_labels=shifted_labels)] +loss_cfg = CELossConfig(mode='chunk', 
chunk_size=1024, loss_reduction="token") +batches_loss_kwargs = CELossContext.build_batches_loss_kwargs(loss_ctx_input_list, loss_cfg) +loss_ctx = CELossContext(loss_cfg, batches_loss_kwargs[0]) +loss, _ = loss_ctx.forward(hidden_states, lm_head.weight) +loss.backward() +max_memory = torch.cuda.max_memory_allocated() +reserved_memory = torch.cuda.max_memory_reserved() +print(f"Chunk mode Loss: {loss.item()}") +print(f"Chunk mode hidden_states grad norm: {hidden_states.grad.norm().item()}") +print(f"Chunk mode lm_head weight grad norm: {lm_head.weight.grad.norm().item()}") +print(f"Chunk mode Max memory allocated: {max_memory / 1024**3:.2f} GB") +print(f"Chunk mode Max memory reserved: {reserved_memory / 1024**3:.2f} GB") +print(f"Chunk mode Time taken: {time.time() - t1:.2f} seconds") \ No newline at end of file diff --git a/tests/script.py b/tests/script.py new file mode 100644 index 000000000..ce135c54c --- /dev/null +++ b/tests/script.py @@ -0,0 +1,4 @@ +import torch +from flash_attn.flash_attn_interface import flash_attn_varlen_func + +print(flash_attn_varlen_func) # should not be None diff --git a/xtuner/v1/_writer/swanlab_writer.py b/xtuner/v1/_writer/swanlab_writer.py new file mode 100644 index 000000000..def65b420 --- /dev/null +++ b/xtuner/v1/_writer/swanlab_writer.py @@ -0,0 +1 @@ +# TODO @elian \ No newline at end of file diff --git a/xtuner/v1/datasets/config.py b/xtuner/v1/datasets/config.py index 5a0c2a592..f07a9302b 100644 --- a/xtuner/v1/datasets/config.py +++ b/xtuner/v1/datasets/config.py @@ -22,7 +22,7 @@ class DatasetConfig(BaseModel): name: Annotated[str, Parameter(group="dataset")] = "default" class_name: Annotated[str, Parameter(group="dataset")] = "JsonlDataset" sample_ratio: Annotated[float, Parameter(group="dataset")] = 1.0 - media_root: Annotated[str, Parameter(group="dataset")] = "" + media_root: Annotated[str, Parameter(group="dataset")] = "" # VLM param def build( self, diff --git a/xtuner/v1/datasets/sft_tokenize_fn/openai.py b/xtuner/v1/datasets/sft_tokenize_fn/openai.py index aa3097e3d..742676082 100644 --- a/xtuner/v1/datasets/sft_tokenize_fn/openai.py +++ b/xtuner/v1/datasets/sft_tokenize_fn/openai.py @@ -2,7 +2,7 @@ import hashlib import inspect from typing import TYPE_CHECKING, Annotated - +import json from cyclopts import Parameter from pydantic import BaseModel, ConfigDict @@ -22,6 +22,7 @@ class OpenaiTokenizeFunction(CachableTokenizeFunction[DataItem]): + """Custom TokenizeFunction implementation, following the dataset tutorial: https://xtuner.readthedocs.io/zh-cn/latest/pretrain_sft/tutorial/dataset.html""" def __init__( self, tokenizer: "PreTrainedTokenizer", @@ -40,7 +41,9 @@ def __init__( self.max_length = max_length def __call__(self, item: dict | list, **kwargs) -> DataItem: - messages = ChatMessages(messages=item) + # @elian: each dataset item here is a dict of the form {"messages": list} + data = item if isinstance(item, dict) else json.loads(item) + messages = ChatMessages(messages=data["messages"]) tokenized = messages.tokenize(self.tokenizer, self.chat_template) return DataItem(
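A minimal sketch (not part of the patch) of the record shape that the modified OpenaiTokenizeFunction.__call__ above assumes: each dataset item is a dict (or a JSON string) carrying a "messages" list in the OpenAI chat format. The values below are hypothetical.

# Hypothetical JSONL record satisfying data["messages"] after this change.
example_item = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 1 + 1?"},
        {"role": "assistant", "content": "2"},
    ]
}
# The patched __call__ then builds ChatMessages(messages=example_item["messages"])
# and tokenizes it with the configured chat template.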
diff --git a/xtuner/v1/engine/train_engine.py b/xtuner/v1/engine/train_engine.py index 19837695f..69c46bdb8 100644 --- a/xtuner/v1/engine/train_engine.py +++ b/xtuner/v1/engine/train_engine.py @@ -187,7 +187,7 @@ def build_optimizer(self, optim_cfg: OptimConfig) -> torch.optim.Optimizer: if dist.get_rank() == 0: logger.info( - f"Total trainable parameters: {num_total_requires_grad // 1e6}M, total parameters: {num_total // 1e6}M" + f"Total trainable parameters: {num_total_requires_grad / 1e9:.2f}B, total parameters: {num_total / 1e9:.2f}B" ) logger.info(f"Untrainable parameters names: {untrainable_names}") return optim_cfg.build(params) diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py index 1422b163f..5c93233ad 100644 --- a/xtuner/v1/model/base.py +++ b/xtuner/v1/model/base.py @@ -68,7 +68,7 @@ class TransformerConfig(PydanticBaseModel): max_window_layers: Annotated[int | None, Parameter(group="model")] = None rope_scaling_cfg: RopeScalingConfig | None = None - @computed_field + @computed_field # a field computed dynamically from existing fields after the model config is initialized def num_attention_heads(self) -> int: return self.attention.num_attention_heads diff --git a/xtuner/v1/model/dense/qwen2.py b/xtuner/v1/model/dense/qwen2.py new file mode 100644 index 000000000..ec0f15ad1 --- /dev/null +++ b/xtuner/v1/model/dense/qwen2.py @@ -0,0 +1,50 @@ +import re + +from xtuner.v1.model.base import TransformerConfig +from xtuner.v1.module.attention import MHAConfig + +from .dense import Dense + + +class Qwen2Dense(Dense): + def to_hf_key_list(self, key: str) -> list[str]: + # Tied embeddings: the output embedding (lm_head) must share its weight with the input embedding. + if self.config.tie_word_embeddings and "lm_head" in key: + key = key.replace("lm_head", "embed_tokens") + + if "layers" in key or "embed_tokens" in key: + key = "model." + key + + if "layers" in key: + key = re.sub(r"layers\.(\d+)\.(experts|gate)", r"layers.\1.mlp.\2", key) + + if key.startswith("norm."): + return [key.replace("norm.", "model.norm.")] + else: + return [key] + + +class Qwen2DenseConfig(TransformerConfig): + use_sliding_window: bool = False + + def build(self) -> Qwen2Dense: + return Qwen2Dense(self) + + +# TODO: Unify the config name style +class Qwen2Dense1d5BConfig(Qwen2DenseConfig): + vocab_size: int = 151936 + max_position_embeddings: int = 131072 + pad_token_id: int = 151645 # eos_id (assumed: <|im_end|>) + num_hidden_layers: int = 36 + max_window_layers: int = 36 + hidden_size: int = 4096 + intermediate_size: int = 12288 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + hidden_act: str = "silu" + + attention: MHAConfig = MHAConfig( + num_attention_heads=32, num_key_value_heads=8, head_dim=128, qk_norm=True, sliding_window=1024 + ) + tie_word_embeddings: bool = False diff --git a/xtuner/v1/model/dense/qwen3.py b/xtuner/v1/model/dense/qwen3.py index 9be1eeab1..2cc4613ad 100644 --- a/xtuner/v1/model/dense/qwen3.py +++ b/xtuner/v1/model/dense/qwen3.py @@ -64,4 +64,4 @@ class Qwen3Dense4BConfig(Qwen3DenseConfig): attention: MHAConfig = MHAConfig( num_attention_heads=32, num_key_value_heads=8, head_dim=128, qk_norm=True, sliding_window=1024 ) - tie_word_embeddings: bool = True + tie_word_embeddings: bool = True \ No newline at end of file
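A usage sketch (not part of the patch) for the new Qwen2 dense config, mirroring how main.py wires Qwen3Dense4BConfig into the Trainer. The import path is inferred from the file location above; whether Qwen2Dense1d5BConfig is re-exported from xtuner.v1.model is not verified here, and TrainConfig is assumed importable from elian/train/qwen3.

from xtuner.v1.config import AdamWConfig, LRConfig
from xtuner.v1.model.dense.qwen2 import Qwen2Dense1d5BConfig  # assumed import path
from xtuner.v1.train.trainer import Trainer

from train_config import TrainConfig

cfg = TrainConfig()
# Same pattern as main.py; only the model config class changes.
model_cfg = Qwen2Dense1d5BConfig(max_position_embeddings=cfg.max_position_embeddings)
optim_cfg = AdamWConfig(lr=cfg.lr)
lr_cfg = LRConfig(lr_type=cfg.lr_type, lr_min=cfg.lr_min)
trainer = Trainer(**cfg.to_trainer_kwargs(model_cfg, optim_cfg, lr_cfg))
trainer.fit()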
diff --git a/xtuner/v1/train/trainer.py b/xtuner/v1/train/trainer.py index 706542414..d6e6974ee 100644 --- a/xtuner/v1/train/trainer.py +++ b/xtuner/v1/train/trainer.py @@ -148,8 +148,8 @@ class TrainerConfig(BaseModel): dist_backend: str = "cpu:gloo,cuda:nccl" debug: bool = False - @model_validator(mode="after") - def _convert_work_dir(self): + @model_validator(mode="after") # model validator, runs after field validation completes + def _convert_work_dir(self): # normalize the work_dir field if isinstance(self.work_dir, str): self.work_dir = Path(self.work_dir) elif self.work_dir is None: @@ -321,7 +321,7 @@ def __init__( global_batch_size = self.data_mesh["dp"].size() self._global_batch_size = global_batch_size - self._resolve_config_conflicts(self.tokenizer, model_cfg, dataloader_cfg) + self._resolve_config_conflicts(self.tokenizer, model_cfg, dataloader_cfg) # resolve the pad_id conflict between tokenizer, model and dataloader configs self._dataloader = self.build_dataloader( dataset_config=dataset_cfg, @@ -586,7 +586,7 @@ def _init_data_mesh( "`tp_size * sp_size` size must be a divisor of world size." ) - dp_size = self.world_size // (tp_size * sp_size) + dp_size = self.world_size // (tp_size * sp_size) # e.g. 16 // (2 * 2) = 4 # TODO: fsdp_config could be None device = str(DEVICE) if self._fsdp_config.cpu_offload else "cpu" @@ -858,6 +858,7 @@ def _get_checkpoint_path(self, epoch: int, step: int) -> Path: return self.checkpoint_dir / f"epoch-{epoch}-step-{step}" def _set_deterministic(self): + # determinism: avoid non-deterministic operators if XTUNER_DETERMINISTIC: torch.use_deterministic_algorithms(True, warn_only=True) @@ -882,12 +883,12 @@ def _init_xtuner_meta(self, work_dir: Path, auto_resume: bool) -> XTunerMeta: if self.rank == 0: work_dir.mkdir(parents=True, exist_ok=True) - meta_path = work_dir / self._META_PATH + meta_path = work_dir / self._META_PATH # example: /home/xtuner/train/save/.xtuner if not meta_path.exists() and self.rank == 0: meta = XTunerMeta(exps=[]) with open(meta_path, "w") as f: f.write(meta.model_dump_json(indent=2)) - dist.barrier() + dist.barrier() # wait for all distributed processes to arrive here, so that ranks != 0 do not try to read the meta file before it has been created meta = cast(XTunerMeta, XTunerMeta.model_validate(load(meta_path, file_format="json"))) @@ -999,7 +1000,7 @@ def _log_step( max_memory = DEVICE_MODULE.max_memory_allocated() # type: ignore[attr-defined] reserved_memory = DEVICE_MODULE.max_memory_reserved() # type: ignore[attr-defined] - + # TODO: make the logging interval a configurable parameter self.logger.info( f"Step {self.cur_step}/{self.total_step} data_time: {data_time:.4f} lr: {lr:.6f} time: {step_time:.4f} " f"text_tokens: {step_consumed_tokens} " @@ -1147,7 +1148,8 @@ def _resolve_config_conflicts( def _resolve_resume_cfg(self, resume_cfg: ResumeConfig): latest_checkpoint = self.meta.latest_exp.latest_checkpoint - if latest_checkpoint is not None and resume_cfg.auto_resume: + # @elian: bug fix -> an explicitly passed resume_from was previously overridden by auto_resume. + if latest_checkpoint is not None and resume_cfg.auto_resume and resume_cfg.resume_from is None: resume_cfg.resume_from = Path(latest_checkpoint) return resume_cfg @@ -1213,4 +1215,4 @@ def _setup_env(self): for k, v in env.items(): log_str += f"{k}: {v}\n" log_str += "==================================================" - logger.info(log_str) + logger.info(log_str) \ No newline at end of file
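A standalone sketch (not part of the patch) of the behavior targeted by the _resolve_resume_cfg change above: with the added resume_cfg.resume_from is None guard, the latest auto-resume checkpoint only fills in resume_from when no explicit checkpoint was requested. Paths below are hypothetical.

from pathlib import Path

def resolve_resume(resume_from, auto_resume, latest_checkpoint):
    # Mirrors the patched condition: fall back to the latest checkpoint only when
    # auto_resume is enabled and no explicit resume_from was given.
    if latest_checkpoint is not None and auto_resume and resume_from is None:
        resume_from = Path(latest_checkpoint)
    return resume_from

print(resolve_resume(None, True, "work_dirs/epoch-1-step-1000"))                          # falls back to the latest checkpoint
print(resolve_resume("work_dirs/epoch-1-step-500", True, "work_dirs/epoch-1-step-1000"))  # explicit checkpoint is kept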