# nanuGPT/run_config2.yaml at main · sytelus/nanuGPT · GitHub
# Hydra output locations. Both templates interpolate values from the `env` section.
hydra:
  run:
    # Single-run directory: experiments/<experiment_name>/<experiment_uuid>.
    dir: 'experiments/${env.experiment_name}/${env.experiment_uuid}'
  sweep:
    # Sweep directory: same layout, with `_sweep` appended to the experiment name.
    dir: 'experiments/${env.experiment_name}_sweep/${env.experiment_uuid}'
# Experiment identity; these values are interpolated into the hydra.run/hydra.sweep paths.
env:
  # Experiment name.
  experiment_name: default
  # Experiment UUID (quoted so it is always read as a string).
  experiment_uuid: '4b59c21b-e598-4998-89c6-bb44411867f0'
# Logging, metric sinks and profiling switches.
logging:
  # When true, compute and log the parameter norm.
  log_params_norm: true
  # When true, compute and log statistics for every individual parameter.
  log_per_param_stats: false
  # Step interval between parameter-norm log entries.
  params_norm_log_interval: 1
  # Synchronize inside Timers for more accurate timing reports.
  # May slow down highly optimized configurations.
  sync_timer: false
  # When true, Timers use CUDA events (accurate, with minimal performance impact).
  cuda_event_timer: true
  # When true, compute and log statistics per parameter/gradient group.
  log_stats_by_group: true
  # When true, compute and log how many gradient entries are zero.
  log_num_zeros_in_grad: false
  # When true, log FP8-related statistics such as scaling factors.
  log_fp8_stats: false
  # When true, log MoE-related statistics such as routing states.
  log_moe_stats: false
  # When true, log to Azure Table.
  table_logging: true
  # When true, publish metrics to Prometheus.
  prometheus_metric_logging: false
  # When true, log to Weights & Biases (wandb).
  wandb_logging: false
  # When true, only rank 0 logs to Weights & Biases.
  wandb_rank_0: false
  # When true, print detailed model weight information.
  verbose: false
  # Logging verbosity level; higher levels print more detail.
  verbose_level: 0
  # Name used when logging to Azure Table.
  table_log_name: null
  # Run name used when logging to Weights & Biases.
  wandb_name: null
  # Weights & Biases project name.
  wandb_project: Megatron-LM
  # File that tensors are logged to, in NPZ format.
  # Use '%(rank)s' to include `args.rank` in the filename.
  log_tensors_file: null
  # File tag used when logging job status.
  log_refinery_tag: null
  # Number of steps to profile with NVIDIA Nsight Systems (nsys).
  nsys_profile_steps: 0
  # Number of steps to profile with the PyTorch memory profiler.
  mem_profile_steps: 0
  # Number of steps to profile with the PyTorch profiler.
  torch_profile_steps: 0
  # When true, log the full (long) representation of the loss.
  log_full_loss: false
  # When true, enable distributed logging on every GPU.
  log_all_ranks: false
  # When true, enable distributed logging on GPUs with EP rank 0 and TP rank 0.
  log_ep0_tp0_ranks: false
  # When true, enable distributed logging on GPUs in the last TP (tensor parallel) and
  # DP (data parallel) ranks, but with different PP (pipeline parallel) ranks.
  log_pp_ranks: false
# Dataset locations, sequence lengths and data-loading behaviour.
data:
  # Path to the preprocessed training dataset.
  train_data_path: null
  # Evaluation tasks to run.
  eval_tasks: []
  # Scratch directory for preprocessed data.
  data_tmpdir: /precooked_scratch
  # Maximum input sequence length.
  seq_length: null
  # Fixed sequence length used for evaluation tasks.
  clamp_seq_len: null
  # Maximum encoder sequence length; specified separately from --seq-length.
  encoder_seq_length: null
  # Maximum decoder sequence length.
  decoder_seq_length: null
  # Maximum sequence length for the retriever model in the biencoder setup.
  retriever_seq_length: 256
  # Number of data-loading worker threads.
  num_workers: 2
  # Tokenizer name.
  tokenizer: gpt2
  # Restart position IDs after each end-of-document token.
  reset_position_ids: false
  # Restart the self-attention mask after each end-of-document token.
  reset_attention_mask: false
  # Mask the loss on end-of-document tokens.
  eod_mask_loss: false
  # Share preprocessed data shards across nodes via NCCL broadcast.
  nccl_replicate_data: false
  # Load the whole training dataset into memory before training starts.
  # Required when using --nccl-replicate-data.
  preload_training_data: true
  # Number of validation iterations to run during training.
  # When unset, the full validation set is used.
  train_validation_iters: null
# Parallelism layout, process-group setup and DDP behaviour.
distributed:
  # Tensor model parallel degree.
  tensor_model_parallel_size: 1
  # Pipeline model parallel degree.
  pipeline_model_parallel_size: 1
  # Expert parallel degree.
  expert_parallel_size: 1
  # Override the expert-parallelism requirement of the distributed Adam optimizer.
  # WARNING: this may produce incorrect results.
  danger_override_expert_parallel_optimizer: false
  # Pipeline rank at which the encoder and decoder are split.
  pipeline_model_parallel_split_rank: null
  # Layers per virtual pipeline stage. Accepted formats:
  #   (1) a single integer "A": uniform layers per stage;
  #   (2) two integers "A_B": different layers in different parts of the model;
  #   (3) "AxX_BxY_CxZ_...", where "AxX" means "X" stages each with "A" layers.
  num_layers_per_virtual_pipeline_stage: null
  # Layers per pipeline stage in the format "W_X_Y_Z", one number per stage.
  # Any number of stages is supported, e.g. "X_Y" for two stages.
  num_layers_per_pipeline_stage: null
  # Overlap point-to-point communication with computation using APEX.
  # Applies to interleaved pipeline parallelism.
  apex_overlap_p2p: true
  # Distributed backend used for training.
  distributed_backend: Nccl
  # Key-value store used for rendezvous in init_process_group.
  distributed_store: default
  # Process-group initialization timeout, in minutes.
  pg_timeout_minutes: 10.0
  # Initialize all process groups at once to speed up initialization.
  pg_init_parallel: true
  # DistributedDataParallel implementation to use.
  DDP_impl: local
  # Number of buckets the weight gradients are split into for asynchronous
  # all-reduce in local DDP.
  local_DDP_allreduce_bucket_count: 50
  # Use contiguous buffers in local DDP for efficiency.
  use_contiguous_buffers_in_local_ddp: true
  # Enable tensor-parallel communication overlap. Effective only with certain
  # transformer implementations (e.g. 'transformer_engine' or 'mixed_v2').
  tp_overlap: false
  # Use scatter/gather to optimize tensor communication in the pipeline.
  scatter_gather_tensors_in_pipeline: true
  # Use custom-built ring exchange for point-to-point communication.
  # Requires a custom-built image that supports ring-exchange p2p.
  use_ring_exchange_p2p: false
  # When true, initialize_megatron() defers DDP initialization and returns a function
  # that completes it later. Also enables --use-cpu-initialization.
  # Intended for use with an external DDP manager.
  lazy_mpu_init: false
  # Initialize affine parallel weights on the CPU.
  use_cpu_initialization: false
  # Memory cleanup performed after each training/evaluation iteration to reduce GPU
  # memory fragmentation: 0 = off, 1 = moderate cleanup, 2 = aggressive cleanup.
  empty_unused_memory_level: 0
  # When true, the input embedding layer gets its own pipeline stage, separate from the
  # transformer layers. For T5 models this currently affects only the encoder embedding.
  standalone_embedding_stage: false
# Numeric precision and loss-scaling settings.
mixed_precision:
  # Run training and inference in 16-bit floating point (fp16).
  fp16: false
  # Run training and inference in bfloat16 (bf16).
  bf16: true
  # Static loss scale, typically a power of 2; can help convergence in fp16 mode.
  # When null, dynamic loss scaling is used instead.
  loss_scale: null
  # Starting value for dynamic loss scaling.
  initial_loss_scale: 4294967296
  # Lower bound for the dynamic loss scale.
  min_loss_scale: 1.0
  # Window, in iterations, over which the dynamic loss scale is adjusted.
  loss_scale_window: 1000
  # Dynamic loss-scaling hysteresis: number of steps to wait before adjusting the scale.
  hysteresis: 2
  # Keep residual connections in fp32 for better numerical stability.
  fp32_residual_connection: false
  # Scale the query-key dot product by 1 / layer-number for improved stability.
  query_key_layer_scaling: false
  # Run attention masking and softmax in fp32.
  # Ignored when query-key layer scaling is enabled.
  attention_softmax_in_fp32: true
  # Accumulate gradients and all-reduce them in fp32.
  accumulate_allreduce_grads_in_fp32: false
  # Compute the unreduced cross-entropy loss of the language model head in fp16.
  fp16_lm_cross_entropy: false
# FP8 and transformer-implementation settings.
transformer_engine:
  # Use the E4M3 FP8 format for Transformer layers.
  fp8_e4m3: false
  # Use the hybrid FP8 format for Transformer layers (combines different FP8 formats).
  fp8_hybrid: false
  # Run weight-gradient computations in higher precision, even when using FP8.
  fp8_wgrad: true
  # Scaling margin for FP8 operations; trades range against precision.
  fp8_margin: 0
  # Interval, in iterations, between updates of the FP8 scaling factors.
  fp8_interval: 1
  # Which Transformer implementation to use.
  transformer_impl: local
  # Number of steps of amax (absolute max) history recorded per tensor for FP8 scaling.
  fp8_amax_history_len: 1
  # Algorithm used to compute amax from the history.
  fp8_amax_compute_algo: most_recent
  # Use a custom amax computation algorithm.
  # Effective only with the 'mixed_v2' implementation.
  custom_amax: false
  # Reduce amax values across data parallel groups so scaling factors stay consistent.
  reduce_amax_across_dp: true
# Model architecture dimensions and layer options.
network_size:
  # Number of transformer layers.
  num_layers: null
  # Transformer hidden size.
  hidden_size: null
  # Hidden size of the transformer Feed-Forward Network (FFN).
  # Defaults to 4 times the hidden size when not provided.
  ffn_hidden_size: null
  # Number of attention heads in multi-head attention.
  num_attention_heads: null
  # Projection weight dimension in multi-head attention.
  # Defaults to hidden_size divided by num_attention_heads when not specified.
  kv_channels: null
  # Maximum number of position embeddings, i.e. the range of positions
  # the model can attend to.
  max_position_embeddings: null
  # Type of position encoding.
  pos_encoding_type: relative_one_q
  # Backend implementation of the attention mechanism.
  attention_backend: megatron
  # Type of attention mechanism.
  attention_type: self
  # Number of groups for grouped query attention (GQA); zero disables GQA.
  num_query_groups: 0
  # Vocabulary size. Required for preprocessed datasets.
  vocab_size: null
  # Pad the vocabulary size to a multiple of this value for computational efficiency.
  make_vocab_size_divisible_by: 128
  # Epsilon used in layer normalization to prevent division by zero.
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) read a bare
  # `1e-05` as a string rather than a float.
  layernorm_epsilon: 1.0e-05
  # Normalization method.
  norm_type: rmsnorm
  # When true, insert an extra normalization layer at the end of each residual block.
  use_extra_norm: false
  # Pattern for applying normalization in attention. For example, 'QK' normalizes
  # queries and keys; 'QKP' normalizes queries, keys, and projections.
  attn_norm_pattern: ''
  # When true, use the upstream library's linear layer for the final projection.
  use_upstream_linear_layer: false
  # When true, apply the residual connection after layer normalization, mimicking the
  # original BERT architecture.
  apply_residual_connection_post_layernorm: false
  # When true, use OpenAI's GeLU implementation for backward compatibility.
  openai_gelu: false
  # When true, enable workarounds for known issues with the Torch ONNX exporter.
  onnx_safe: false
  # When true, include a bias term in the query, key, and value projections.
  qkv_bias: false
# Random seeding and weight-initialization settings.
initialization:
  # Random seed for Python, NumPy, PyTorch, and CUDA, for reproducibility.
  seed: 1234
  # Initialize parameters randomly across data parallel ranks instead of using
  # synchronized initialization.
  data_parallel_random_init: false
  # Standard deviation of the zero-mean normal distribution used for weight initialization.
  init_method_std: 0.02
  # Use Xavier uniform initialization for model parameters.
  init_method_xavier_uniform: false
  # Enable improved weight decay for better regularization during training.
  wd_improved: false
  # Use Maximal Update Parameterization (MuP) for better hyperparameter transferability.
  # This implementation follows Table 8 of the MuTransfer paper
  # (https://arxiv.org/pdf/2203.03466.pdf). MuP is designed so that hyperparameters such
  # as learning rate and beta2 found on smaller models transfer well to larger models.
  mup: false
  # Base hidden size used for transfer in MuP.
  mup_base_hidden_size: 1536
  # Ratio of the FFN hidden size to the base hidden size in the MuP base model.
  mup_base_ffn_multiple: 4
  # Base attention head dimension used for transfer in MuP.
  mup_base_head_size: 128
# Learning-rate schedule and scheduler-override settings.
learning_rate:
  # Learning rate at the start of training. It changes over time according to the
  # decay style and warmup schedule.
  lr: null
  # Learning-rate decay style.
  lr_decay_style: cosine
  # Number of iterations over which the learning rate decays.
  # Defaults to the total number of training iterations (`--train-iters`) when unset.
  lr_decay_iters: null
  # Number of samples over which the learning rate decays.
  # Defaults to the total number of training samples (`--train-samples`) when unset.
  lr_decay_samples: null
  # Percentage of total training at which learning-rate decay starts.
  # `--lr-decay-iters` determines how long the decay lasts.
  lr_decay_at_percent: 0
  # Fraction of total warmup iterations or samples used for learning-rate warmup.
  # Must be a float between 0 and 1.
  lr_warmup_fraction: null
  # Number of iterations of linear learning-rate warmup.
  lr_warmup_iters: 0
  # Number of samples of linear learning-rate warmup.
  lr_warmup_samples: 0
  # Lower bound on the learning rate; the scheduler clips values below it.
  min_lr: 0.0
  # Minimum weight decay. Defaults to the same value as `min_lr` when unset.
  min_wd: null
  # Maximum weight decay, scaled by the weight decay multiplier (`wd-mult`).
  max_wd: null
  # Replace the scheduler settings stored in a checkpoint (learning rate, warmup
  # iterations, minimum learning rate, maximum number of iterations, decay style)
  # with the values given in the input arguments.
  override_lr_scheduler: false
  # Use a maximum position embedding size different from the one in the checkpoint.
  override_max_pos_embeddings: false
  # Use the scheduler settings stored in a checkpoint (learning rate, warmup iterations,
  # minimum learning rate, maximum number of iterations, decay style) and ignore the
  # input arguments.
  use_checkpoint_lr_scheduler: false
  # Fill argument values not supplied on the command line from the checkpoint, if
  # available. Warning: this might lead to unintended consequences.
  override_unset_args_from_checkpoint: false
# Dropout, weight decay, gradient clipping and optimizer coefficients.
regularization:
  # Dropout probability applied after the attention mechanism.
  attention_dropout: 0.0
  # Dropout probability applied to hidden states within the transformer layers.
  hidden_dropout: 0.0
  # Weight decay (L2 penalty) coefficient used by the AdamW optimizer.
  weight_decay: 0.01
  # Weight decay multiplier applied specifically to attention layers.
  weight_decay_multiplier: 0.5
  # Weight decay multiplier applied specifically to embedding layers.
  weight_decay_multiplier_embedding: 0.0
  # Relative weight decay multiplier, used together with the learning rate
  # for the AdamW optimizer.
  wd_mult: 1.0
  # Exponent of the cosine weight decay schedule for the AdamW optimizer.
  wd_exp: 1.0
  # Exponent of the cosine learning-rate schedule.
  lr_exp: 1.0
  # Maximum allowed global L2 norm of the gradients (gradient clipping).
  clip_grad: 1.0
  # Adam coefficient for the running average of the gradient.
  adam_beta1: 0.9
  # Adam coefficient for the running average of the squared gradient.
  adam_beta2: 0.999
  # Small value added to the Adam denominator for numerical stability.
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) read a bare
  # `1e-08` as a string rather than a float.
  adam_eps: 1.0e-08
  # Momentum factor for the stochastic gradient descent (SGD) optimizer.
  sgd_momentum: 0.9
# Batch sizing, activation recomputation, run length, fusions and optimizer selection.
training:
  # Batch size per model instance (local batch size). Global batch size =
  # local batch size * data parallel size * number of microbatches.
  micro_batch_size: null
  # Number of microbatches. Global batch size / number of microbatches =
  # micro-batch size * data parallel size.
  # Should be None when micro_batch_size is set.
  num_microbatches: null
  # How the loss is averaged over samples and microbatches.
  #   'legacy': averages globally over microbatches, which can make the loss
  #     non-invariant under different micro-batch sizes.
  #   'samplewise': averages over each sample first, then over the batch dimension.
  # For full loss masks both methods should yield the same result.
  micro_batch_averaging: legacy
  # Total training batch size. If set, it should be a multiple of micro-batch size *
  # data parallel size. If unset, it defaults to micro-batch size * data parallel size,
  # i.e. one micro-batch.
  global_batch_size: null
  # Batch-size ramp-up: <start batch size> <batch size increment> <ramp-up samples>.
  # Example: --rampup-batch-size 16 8 300000 --global-batch-size 1024 starts with a
  # global batch size of 16 and raises it to 1024 over 126 intervals, using
  # approximately 2380 samples per interval.
  rampup_batch_size: null
  # Points within training at which the batch size doubles.
  # Example: --batch-size-scaling-fracs 0.125 0.25 0.5 --global-batch-size 256 gives a
  # global batch size of 256 for the last half of the training samples, 128 for the
  # quarter before, 64 for the eighth before that, and 32 for the first eighth.
  batch_size_scaling_fracs: null
  # Activation checkpointing granularity, for training larger models or using larger
  # sequences and batch sizes.
  #   'full': the whole transformer layer is recomputed.
  #   'selective': the core attention part of the transformer layer is recomputed.
  recompute_granularity: null
  # When true, distribute recomputed activations across model parallel groups.
  distribute_saved_activations: false
  # Activation recomputation method.
  #   'uniform': divide the transformer layers uniformly and recompute input activations
  #     at the specified granularity.
  #   'block': recompute input activations of only a set number of individual
  #     transformer layers per pipeline stage.
  # By default recomputation is not applied to any layer.
  recompute_method: null
  # Number of transformer layers to recompute.
  #   'uniform': number of layers in each uniformly divided recompute unit.
  #   'block': number of individual layers to recompute within each pipeline stage.
  recompute_num_layers: 1
  # Total number of training tokens across all training runs.
  # Either train_iters or train_samples should be provided.
  train_tokens: null
  # Total number of training iterations across all training runs.
  # Either train_iters or train_samples should be provided.
  train_iters: null
  # Total number of training samples across all training runs.
  # Either train_iters or train_samples should be provided.
  train_samples: null
  # Interval, in iterations, between loss and timing reports during training.
  log_interval: 10
  # Exit the program once the iteration number is divisible by this value.
  exit_interval: null
  # Exit the program when a condition on a metric is met.
  # Example: --exit-on-metric-cond='my_metric < 42', where 'my_metric' is a key in the
  # loss dictionary.
  exit_on_metric_cond: null
  # Exit the program after this many minutes.
  exit_duration_in_mins: null
  # On SIGTERM, save a checkpoint and shut down training.
  exit_signal_handler: false
  # Fuse query-key-value scaling, masking, and softmax.
  masked_softmax_fusion: true
  # Fuse bias and GELU activation.
  bias_gelu_fusion: true
  # Attention layer pattern. Format 'FXMY': the first 1/X layers have attention, then
  # every Yth layer. 'all' enables attention in all layers; 'none' disables it in all.
  attn_pattern: all
  # Use the GeGLU activation function instead of standard GELU.
  use_geglu: false
  # Enable Just-In-Time (JIT) compilation of the GeGLU activation function.
  jit_geglu: false
  # Hidden-size multiplier for the Feed-Forward Network (FFN).
  ffn_hidden_size_multiplier: 4
  # Fuse bias and dropout.
  bias_dropout_fusion: true
  # Optimizer used for training.
  optimizer: adam
  # Distributed Adam: overlap parameter synchronization with the forward pass.
  distributed_adam_overlap_param_sync: true
  # Distributed Adam: overlap gradient synchronization with the backward pass.
  distributed_adam_overlap_grad_sync: true
  # Run the tensor-model-parallel all-reduce asynchronously with the weight gradient
  # computation of a column-linear layer.
  async_tensor_model_parallel_allreduce: false
  # Use a persistent fused layer normalization kernel. The kernel supports only specific
  # hidden sizes; check 'persist_ln_hidden_sizes' to see whether yours is supported.
  persist_layer_norm: false
  # Enable the sequence parallel optimization.
  sequence_parallel: false
  # Fuse gradient accumulation with weight gradient computation in linear layers.
  gradient_accumulation_fusion: true
  # Operation mode.
  mode: train_and_eval
# Checkpoint saving, loading and remote transfer.
checkpointing:
  # Directory checkpoints are saved to.
  save: null
  # Directory for additional experiment information.
  experiment_dir: null
  # Number of iterations between checkpoint saves.
  save_interval: null
  # Interval between checkpoint saves, as a percentage of total consumed tokens.
  save_interval_percent: null
  # Number of additional interval checkpoints to keep before deletion.
  extra_save_interval_checkpoints_to_keep: 0
  # Percentage of training tokens at which checkpoints are kept. A value of k keeps
  # 100/k checkpoints between 0 and the total number of training tokens.
  # Zero disables checkpoint deletion.
  checkpoints_at_percent: 100
  # Save the current optimizer state.
  save_optim: true
  # Save optimizer state only in final checkpoints.
  save_optim_intermediate: false
  # Save the current random number generator (RNG) state.
  save_rng: true
  # Blob storage path checkpoints are uploaded to.
  # Must differ from --download when both are specified.
  upload: null
  # Blob storage path checkpoints are downloaded from (e.g. to start from a pre-trained
  # model). Must differ from --upload when both are specified.
  download: null
  # Command used to copy files during download/upload.
  cp_cmd: azcopy
  # Script called to upload checkpoints to remote storage.
  upload_script: tools/checkpoint_upload.py
  # Directory containing the model checkpoint to load.
  load: null
  # Checkpoint iteration to load; -1 selects the latest checkpoint.
  checkpoint_iter: -1
  # Strategy for saving checkpoints across data-parallel groups.
  dp_ckpt_save_strategy: round-robin
  # Load the checkpoint from the local directory without downloading.
  # Useful for debugging.
  load_local_checkpoint: false
  # Load the optimizer state together with the checkpoint.
  load_optim: true
  # Load the RNG state together with the checkpoint.
  load_rng: true
  # Load the model for fine-tuning: do not load optimizer or RNG state from the
  # checkpoint and set the iteration to 0. Assumed when loading a release checkpoint.
  finetune: false
  # Refuse to start training with random weights.
  prevent_random_weights: false
  # Run a single dummy forward and backward pass after restarts.
  dummy_step_after_restart: false
  # Run dummy steps every N steps and compare the results across data-parallel ranks
  # for consistency.
  dummy_steps_with_checksum: 0
  # Broadcast checkpoint files across the data-parallel group via NCCL.
  nccl_replicate_checkpoint: true
  # When restarting after a preemption, delete the local checkpoint file after loading.
  delete_local_checkpoint_after_load: true
  # Save and load checkpoints with tensorizer, which helps reduce checkpoint size and
  # speed up loading.
  use_tensorizer: false
# Autoresume settings for the ADLR cluster.
autoresume:
  # Enable autoresume on the ADLR cluster.
  adlr_autoresume: false
  # Interval, in iterations, between checks for an autoresume termination signal.
  adlr_autoresume_interval: 1000
# Validation schedule.
validation:
  # Number of iterations between evaluations on the validation set.
  eval_interval: 1000
# Mixture of Experts (MoE) settings.
moe:
  # Number of experts in the MoE layer; null (None) disables MoE.
  moe_num_experts: null
  # Apply an MoE layer every n transformer layers.
  moe_every_n_layers: 2
  # Number of experts selected for each forward pass.
  moe_top_k: 2
  # Expert capacity factor; determines the buffer size of each expert.
  moe_capacity_factor: 1.25
  # Reserve one expert specifically for overflow tokens.
  moe_reserved_expert: false
  # Algorithm that decides which tokens to skip on overflow
  # ('causal' or other supported methods).
  moe_overflow_algo: causal
  # Coefficient of the auxiliary MoE load balancing loss, which encourages an even
  # distribution of tokens across experts.
  moe_loss_coeff: 0.1
# Biencoder, ICT/REALM and retrieval-indexing settings.
biencoder:
  # Dimension of the block embeddings used in Inverse Cloze Task (ICT) and REALM.
  # The paper uses 128; unset (null) here.
  ict_head_size: null
  # Dimension of the biencoder projection head. The paper uses 128; set to 0 here.
  biencoder_projection_dim: 0
  # Whether the query and context models share parameters.
  biencoder_shared_query_context_model: false
  # Directory containing an ICTBertModel checkpoint.
  ict_load: null
  # Directory containing a BertModel checkpoint; required to initialize ICT and REALM.
  bert_load: null
  # Path to the titles dataset used for ICT.
  titles_data_path: null
  # Probability of including the query within the block for the ICT dataset.
  query_in_block_prob: 0.1
  # Whether ICT uses single-sentence documents instead of multi-sentence documents.
  use_one_sent_docs: false
  # Path to the Wikipedia Evidence dataset from the DPR paper.
  evidence_data_path: null
  # Top-k accuracies to report (e.g. [1, 5, 20]).
  retriever_report_topk_accuracies: []
  # Whether retriever scores are scaled by the inverse square root of the hidden size.
  retriever_score_scaling: false
  # Path where BlockData for the retrieval process is saved or loaded.
  block_data_path: null
  # Path where Open-Retrieval Embedding data is saved or loaded.
  embedding_path: null
  # Batch size for indexing jobs during the retrieval process.
  indexer_batch_size: 128
  # Number of batches between progress log entries during indexing jobs.
  indexer_log_interval: 1000
# Vision Transformer (ViT) classification settings.
vit:
  # Number of output classes for the vision classification task.
  num_classes: 1000
  # Input image height, in pixels.
  img_h: 224
  # Input image width, in pixels.
  img_w: 224
  # Number of color channels in the input images (e.g. 3 for RGB).
  num_channels: 3
  # Dimension of the patches the input images are divided into.
  patch_dim: 16
  # Fraction of the total classes used during training.
  # Values below 1.0 subsample classes.
  classes_fraction: 1.0
  # Fraction of the data per class used during training.
  # Values below 1.0 subsample data within each class.
  data_per_class_fraction: 1.0
  # Shard the dataset across multiple data parallel workers.
  data_sharding: true
# Inference settings.
inference:
  # Pipelining threshold for inference. When batch size * sequence length is below this
  # value, pipelining is not used; at or above it, pipelining is used to improve
  # efficiency.
  inference_batch_times_seqlen_threshold: 512