llama8b.yaml
# Training args
model_name_or_path: NousResearch/Hermes-3-Llama-3.1-8B
torch_dtype: bfloat16
use_lora: false
quantization: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json
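# DeepSpeed ZeRO-3 shards optimizer states, gradients, and parameters across GPUs,
# which keeps per-GPU memory manageable when fully fine-tuning an 8B model.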
output_dir: models/Hermes-3-Llama-3.1-8B
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1
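# Note: save_strategy "no" disables checkpointing during training, so
# save_only_model and save_total_limit only apply if checkpointing is re-enabled.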
# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"
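# Evaluation runs at the end of every epoch; since load_best_model_at_end is false,
# metric_for_best_model does not affect which weights are kept.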
per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 8
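# Effective batch size per optimizer step: 2 (per device) x 8 (accumulation) = 16
# sequences per GPU, multiplied by the number of GPUs in the run.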
# optimizer settings
optim: adamw_torch
learning_rate: 0.000005
weight_decay: 0.0
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12
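# Schedule: linear warmup over the first 10% of steps, then cosine decay from 5e-6.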
# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all
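# These LoRA settings presumably only take effect when use_lora is true;
# with use_lora: false above, this run is a full fine-tune.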
# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
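# Logging to Weights & Biases requires the wandb package; run `wandb login`
# or set WANDB_PROJECT / WANDB_API_KEY before launching.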
run_name: "Hermes-3-Llama-3.1-8B"
disable_tqdm: false
# hub settings
push_to_hub: false
resume_from_checkpoint: false
# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
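# Example launch (a sketch: the entrypoint name and GPU count are placeholders,
# and it assumes the training script accepts this YAML as its config argument):
#   deepspeed --num_gpus 8 run_train.py llama8b.yaml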