
Commit 155d8b4

Authored Jul 29, 2024
Merge branch 'develop' into tp_recompute
2 parents 537d470 + 57b7cd5 · commit 155d8b4

File tree: 81 files changed (+4932 −859 lines)

 

README-zh-Hans.md (+3 −3)

@@ -17,9 +17,9 @@
 [![使用文档](https://readthedocs.org/projects/internevo/badge/?version=latest)](https://internevo.readthedocs.io/zh_CN/latest/?badge=latest)
 [![license](./doc/imgs/license.svg)](./LICENSE)
 
-[📘使用教程](./doc/en/usage.md) |
-[🛠️安装指引](./doc/en/install.md) |
-[📊框架性能](./doc/en/train_performance.md) |
+[📘使用教程](./doc/usage.md) |
+[🛠️安装指引](./doc/install.md) |
+[📊框架性能](./doc/train_performance.md) |
 [🤔问题报告](https://github.com/InternLM/InternEvo/issues/new)
 
 [English](./README.md) |

configs/1.8B_MoE16_sft.py (new file, +237 lines)

JOB_NAME = "1.8b_moe_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 1024
NUM_ATTENTION_HEAD = 16
MLP_RATIO = 1.5
NUM_LAYER = 24
VOCAB_SIZE = 92544
MULTIPLE_OF = 128

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internevo", "llama", "hf_llama".
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
    # with an automatic restart mechanism upon training reboot.
    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
    # path specified in `load_ckpt_info` by default.
    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
    auto_resume=True,
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)
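
As a side note on the ckpt block: the auto_resume comments above describe a resume-versus-initialize decision. Below is a minimal sketch that only restates those rules; choose_startup_ckpt and latest_saved_ckpt are hypothetical names for illustration, not part of InternEvo.

# Sketch of the resume logic described in the ckpt comments (illustrative only).
def choose_startup_ckpt(auto_resume, load_ckpt_info, latest_saved_ckpt=None):
    # `latest_saved_ckpt` stands in for whatever the trainer finds in save_ckpt_folder.
    if auto_resume and latest_saved_ckpt is not None:
        return latest_saved_ckpt          # restart after a crash: resume from the newest saved ckpt
    if not auto_resume and load_ckpt_info is not None:
        return load_ckpt_info["path"]     # initialize weights from another model
    return None                           # auto_resume=False and load_ckpt_info=None: train from scratch

print(choose_startup_ckpt(False, dict(path="local:llm_ckpts/xxxx", content=("model",), ckpt_type="internevo")))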

TRAIN_FOLDER = None  # "/path/to/dataset"
VALID_FOLDER = None  # "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro-batches contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, which disables evaluation
    valid_every=5000,
    pack_sample_into_one=False,
    total_steps=5000,
    skip_batches="",
    # rampup_batch_size (str): A string with three space-separated integers representing the
    #     starting batch size, the increment, and the number of steps between
    #     each increment. For example, "192 24 8" means that the batch size (micro_num)
    #     starts at 192 and increases by 24 every 8 steps. Defaults to None.
    #     (IMPORTANT): The interval step size is 'micro_bsz'.
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    train_folder=TRAIN_FOLDER,
    valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=200,
    diag_outlier_ratio=1.1,
)
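
For orientation, the data block above implies the per-step token counts sketched below. The sketch only uses relationships stated in the comments (packed_length = micro_bsz * SEQ_LEN, micro_num micro-batches per gradient update); the linear data-parallel scaling and dp_world_size = 8 at the end are illustrative assumptions.

# Rough batch-size arithmetic implied by the data dict above (illustrative only).
seq_len = 2048
micro_num, micro_bsz = 4, 2

packed_length = micro_bsz * seq_len                    # tokens in one packed micro-batch: 4096
tokens_per_step_per_rank = micro_num * packed_length   # one gradient update on one dp rank: 16384

# Assumption: with data parallelism the global batch scales linearly with the dp world size.
dp_world_size = 8  # hypothetical value
global_tokens_per_step = tokens_per_step_per_rank * dp_world_size  # 131072 tokens/step
print(packed_length, tokens_per_step_per_rank, global_tokens_per_step)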

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)
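
The fields above parameterize a standard dynamic loss-scaling loop: grow the scale after growth_interval overflow-free steps, back it off after hysteresis overflows. The sketch below only restates the semantics described in the comments; it is a generic illustration, not the project's scaler implementation.

# Generic dynamic loss-scaling update rule (illustrative sketch, not the project's code).
class LossScaleSketch:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self.good_steps = 0   # consecutive steps without overflow
        self.overflows = 0    # overflows seen since the last backoff

    def update(self, found_overflow: bool):
        if found_overflow:
            self.good_steps = 0
            self.overflows += 1
            if self.overflows >= self.hysteresis:   # back off only after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self.overflows = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.growth_interval == 0:  # grow after a long overflow-free run
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
        return self.scale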

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=False,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
    moe_loss_coeff=0.1,
)
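
moe_loss_coeff weights the MoE auxiliary (load-balancing) loss against the language-modeling loss. A one-line illustration with placeholder loss values follows; how the trainer actually combines the terms may differ.

# How an MoE auxiliary-loss coefficient is typically applied (illustration, placeholder values).
moe_loss_coeff = 0.1
lm_loss, moe_aux_loss = 2.35, 0.04
total_loss = lm_loss + moe_loss_coeff * moe_aux_loss   # 2.354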

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)
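
With warmup_ratio=0.01 and total_steps=5000, warmup covers 50 steps. The sketch below shows a typical linear-warmup-then-cosine-decay curve consistent with these fields and eta_min; the trainer's exact scheduler may differ.

import math

# Sketch: linear warmup for warmup_ratio * total_steps steps, then cosine decay to eta_min.
# Mirrors the fields above for illustration only.
def lr_at(step, base_lr=1e-4, total_steps=5000, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = int(warmup_ratio * total_steps)     # 50 steps with this config
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps     # linear ramp up to base_lr
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(49), lr_at(2500), lr_at(4999))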

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

use_fp32_norm = False
model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=False,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    multiple_of=MULTIPLE_OF,
    # Whether the odd and even columns of the query and key projections are interleaved in the normal way.
    # If True, the odd and even columns keep their normal order; if False, the model has already
    # concatenated all odd columns and all even columns back to back, to improve RoPE's computational
    # efficiency.
    # Example:
    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
    qk_interleaved=False,
    num_chunks=1,  # if num_chunks > 1, the interleaved pipeline scheduler is used.
    num_experts=16,
    moe_use_residual=False,
    moe_type="GShard",  # Support: "GShard", "MegaBlock", "MegaBlock-D"
)
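
The qk_interleaved comment describes two column layouts for the query/key projections. The toy sketch below reproduces the reordering from that example with a plain Python list (illustration only, not the model's tensor code).

# Illustration of the two layouts described in the qk_interleaved comment.
q_interleaved = ["q1", "q2", "q3", "q4", "q5", "q6"]          # qk_interleaved = True

# qk_interleaved = False: all odd-position columns first, then all even-position columns,
# so RoPE can rotate the two halves as contiguous blocks.
q_deinterleaved = q_interleaved[0::2] + q_interleaved[1::2]   # ['q1','q3','q5','q2','q4','q6']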

"""
zero1 parallel (dict):
    1. size: int
        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
          so parameters will be divided within the range of dp.
        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
          For smaller models, it is usually a better choice to split the parameters within nodes, with a setting <= 8.
    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
    1. size: int, the size of tensor parallel.
    2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
       defaults to 'mtp', which means pure megatron tensor parallel without sequence parallel.
       msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
       fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
       isp: custom intern sequence parallel without tensor parallel, can be used with weight parallel.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
       defaults to False.
weight parallel (dict):
    1. size: int, the size of weight parallel.
    2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
    3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, memory_pool=True),
)
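
For reference, a common megatron-style way such sizes partition the launch world size is sketched below; treat the factorization and world_size = 8 as assumptions for illustration, not InternEvo's exact group-construction rule. The zero1 line restates the "size <= 0" rule from the docstring above.

# Sketch: a common megatron-style factorization of the launch world size (assumption).
world_size = 8                       # hypothetical number of GPUs
tp, pp = 1, 1                        # tensor / pipeline sizes from `parallel` above
dp = world_size // (tp * pp)         # remaining ranks form the data-parallel group: 8

zero1_size = -1                      # size <= 0: zero group equals the dp group (per the docstring)
zero1 = dp if zero1_size <= 0 else zero1_size
print(dp, zero1)                     # 8 8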

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
    ),
    tensorboard=dict(
        queue_max_length=10,
    ),
)

# custom moe impl configs
# GShard MoE config
# moe = dict(
#     top_k=2,
#     capacity_factor=1.0,
#     eval_capacity_factor=1.0,
#     min_capacity=4,
#     noisy_gate_policy=None,
#     drop_tokens=True,
#     use_rts=True,
#     use_fused_gating=False,
# )

# MegaBlock MoE config
moe = dict(
    top_k=2,
    # capacity_factor=1.0,  # only used in MegaBlock(non-dmoe)
    # drop_tokens=True,  # only used in MegaBlock(non-dmoe)
    # parallel_mode="tensor",  # only used in MegaBlock-D(dmoe), parallel_mode can be tensor or weight
)
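
For the commented-out GShard variant, capacity_factor and min_capacity bound how many tokens each expert may receive when drop_tokens=True. A hedged sketch of a common GShard-style capacity calculation using the numbers in this file (num_experts=16, top_k=2, micro_bsz * SEQ_LEN tokens per packed micro-batch) follows; the exact formula used by the implementation may differ, and the active MegaBlock config above may not use a capacity at all.

import math

# Common GShard-style expert capacity (illustrative; the implementation's formula may differ).
num_experts, top_k = 16, 2
capacity_factor, min_capacity = 1.0, 4
tokens = 2 * 2048                    # micro_bsz * SEQ_LEN tokens in one packed micro-batch

capacity = max(min_capacity,
               math.ceil(top_k * tokens / num_experts * capacity_factor))
print(capacity)                      # 512: tokens routed beyond this per expert would be dropped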

model_type = "INTERNLM_MoE"

# metric_dtype can be "fp32" or another string
# only when set to "fp32" will metrics be calculated in fp32
# metric_dtype = "fp32"
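
Configs like this one are plain Python modules of assignments. A generic way to load one into a dict for inspection is sketched below; load_py_config is a hypothetical helper for illustration, not InternEvo's own config loader.

# Generic way to peek at a python-style config like the one above (not the project's loader).
import importlib.util

def load_py_config(path):
    spec = importlib.util.spec_from_file_location("train_config", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # keep plain config names only (skip dunders and other private attributes)
    return {k: v for k, v in vars(module).items() if not k.startswith("_")}

cfg = load_py_config("configs/1.8B_MoE16_sft.py")
print(cfg["model_type"], cfg["model"]["num_experts"], cfg["parallel"])

This kind of quick load is handy for sanity-checking derived values, e.g. cfg["data"]["micro_num"] * cfg["data"]["micro_bsz"] * cfg["data"]["seq_len"], before launching a run.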

0 commit comments
