JOB_NAME = "1.8b_moe_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 1024
NUM_ATTENTION_HEAD = 16
MLP_RATIO = 1.5
NUM_LAYER = 24
VOCAB_SIZE = 92544
MULTIPLE_OF = 128

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder=dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' means the type of checkpoint to be loaded; supported: "internevo", "llama", "hf_llama".
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
    # with an automatic restart mechanism upon training reboot.
    # Please be aware that when `auto_resume` is True (its default value), the checkpoint path specified in
    # `load_ckpt_info` will not be loaded.
    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
    auto_resume=True,
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)
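
# A hedged illustration (not part of this run's config): per the 'load_ckpt_info' guide above,
# initializing from a converted HF LLaMA checkpoint would look roughly like the following
# (the path is a placeholder):
# load_ckpt_info=dict(path="local:/path/to/hf_llama_ckpt", content=("model",), ckpt_type="hf_llama"),
# auto_resume=False,  # must be False so 'load_ckpt_info' is actually loaded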

TRAIN_FOLDER = None  # "/path/to/dataset"
VALID_FOLDER = None  # "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batches contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, which means evaluation is disabled
    valid_every=5000,
    pack_sample_into_one=False,
    total_steps=5000,
    skip_batches="",
    # rampup_batch_size (str): A string with three space-separated integers representing the
    # starting batch size, the increment, and the number of steps between
    # each increment. For example, "192 24 8" means that the batch size (micro_num)
    # starts at 192 and increases by 24 every 8 steps. Defaults to None.
    # (IMPORTANT): The interval step size is 'micro_bsz'.
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    train_folder=TRAIN_FOLDER,
    valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=200,
    diag_outlier_ratio=1.1,
)
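
# For a rough sense of scale (assuming packed samples and no rampup): each optimizer step
# consumes micro_num * micro_bsz * SEQ_LEN = 4 * 2 * 2048 = 16384 tokens per data-parallel
# rank; multiply by the data-parallel world size for the global tokens per step.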

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)
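
# Worked example of the dynamic loss scaling configured above: the scale starts at 2**16 = 65536,
# doubles (growth_factor=2) after every 1000 consecutive overflow-free steps up to max_scale=2**24,
# and is halved (backoff_factor=0.5) once hysteresis=2 overflows occur, never dropping below
# min_scale=1.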

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=False,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)
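
# Note (an interpretation, not stated in this file): a reduce_bucket_size of 512 * 1024 * 1024
# means gradients are presumably flushed to NCCL in buckets of roughly 512M elements, trading a
# larger communication buffer for fewer, bigger collectives.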

loss = dict(
    label_smoothing=0,
    moe_loss_coeff=0.1,
)
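
# moe_loss_coeff presumably weights the MoE auxiliary (load-balancing) loss, i.e. roughly
# total_loss = lm_loss + moe_loss_coeff * moe_aux_loss = lm_loss + 0.1 * moe_aux_loss here.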

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)
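
# With total_steps=5000 and warmup_ratio=0.01, the learning rate warms up over
# 0.01 * 5000 = 50 steps to lr=1e-4, then anneals (typically a cosine schedule) down to
# eta_min=1e-5 by the end of training.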

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)
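
# Note: since c = adam_beta2_c = 0, the scheduler appears to leave beta2 fixed at
# init_beta2 = 0.95 for the whole run; a non-zero c would vary beta2 across iterations.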

use_fp32_norm = False
model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=False,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    multiple_of=MULTIPLE_OF,
    # Whether the odd and even columns of the query and key are interleaved as usual.
    # If True, the odd and even columns keep their natural interleaved order; if False,
    # all odd columns have been concatenated in front of all even columns ahead of time,
    # in order to improve the RoPE's computational efficiency.
    # Example:
    # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
    # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
    qk_interleaved=False,
    num_chunks=1,  # if num_chunks > 1, the interleaved pipeline scheduler is used.
    num_experts=16,
    moe_use_residual=False,
    moe_type="GShard",  # Support: "GShard", "MegaBlock", "MegaBlock-D"
)
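
# For reference: the FFN width implied by the settings above is int(HIDDEN_SIZE * MLP_RATIO)
# = int(1024 * 1.5) = 1536, already a multiple of MULTIPLE_OF=128 (otherwise it would
# presumably be rounded up to the next multiple); each of the num_experts=16 experts
# presumably uses an FFN of this width.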
"""
zero1 parallel (dict):
    1. size: int
        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
          so parameters will be divided within the range of dp.
        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
    1. size: int, the size of tensor parallel.
    2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
       defaults to 'mtp', which means pure megatron tensor parallel without sequence parallel.
       msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
       fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
       isp: customized intern sequence parallel without tensor parallel, can be used with weight parallel.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
       defaults to False.
weight parallel (dict):
    1. size: int, the size of weight parallel.
    2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
    3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, memory_pool=True),
)
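
# Worked example of the rules documented above: on 8 GPUs with tensor=1 and pipeline=1, the
# data-parallel world size is 8; zero1 size=-1 then makes the zero process group equal to the
# full data-parallel group, so optimizer states are sharded across all 8 ranks.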

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
    ),
    tensorboard=dict(
        queue_max_length=10,
    ),
)

# custom moe impl configs
# GShard MoE config
# moe = dict(
#     top_k=2,
#     capacity_factor=1.0,
#     eval_capacity_factor=1.0,
#     min_capacity=4,
#     noisy_gate_policy=None,
#     drop_tokens=True,
#     use_rts=True,
#     use_fused_gating=False,
# )

# MegaBlock MoE config
moe = dict(
    top_k=2,
    # capacity_factor=1.0,  # only used in MegaBlock (non-dmoe)
    # drop_tokens=True,  # only used in MegaBlock (non-dmoe)
    # parallel_mode="tensor",  # only used in MegaBlock-D (dmoe); parallel_mode can be "tensor" or "weight"
)
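
# With top_k=2 and model["num_experts"]=16, the router dispatches each token to its two
# highest-scoring experts out of sixteen; in the non-dmoe variants, capacity_factor and
# drop_tokens above would additionally bound how many tokens each expert may accept.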

model_type = "INTERNLM_MoE"

# metric_dtype can be "fp32" or another string
# metrics are computed in fp32 only when it is set to "fp32"
# metric_dtype = "fp32"