Commit 1fdeec5: update configs
ahmeda14960 committed Feb 5, 2025
1 parent: ecd7f48

Showing 4 changed files with 12 additions and 6 deletions.
2 changes: 1 addition & 1 deletion config/data/dolma_llama.yaml

@@ -199,4 +199,4 @@ train_weights:
  dolma/stackexchange: 19.6
  dolma/starcoder: 263.8
  dolma/wiki: 7.4
-vocab_size: null
+vocab_size: null

The removed and added lines are identical, so this is almost certainly a whitespace-only change, most likely adding a trailing newline at end of file.
2 changes: 1 addition & 1 deletion config/data/dolma_llama_euwest.yaml

@@ -197,4 +197,4 @@ train_weights:
  dolma/stackexchange: 19.6
  dolma/starcoder: 263.8
  dolma/wiki: 7.4
-vocab_size: null
+vocab_size: null

Same whitespace-only change as in dolma_llama.yaml above.
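Both files end with a train_weights mixture, of which only the tail is visible in these hunks. As a hedged illustration, here is how such relative weights are commonly turned into sampling probabilities; the normalization step is an assumption about how the trainer consumes them, not something this diff states:

# Tail of the train_weights mixture from the hunks above.
weights = {
    "dolma/stackexchange": 19.6,
    "dolma/starcoder": 263.8,
    "dolma/wiki": 7.4,
}
# Assumed convention: weights are relative and normalized to probabilities.
total = sum(weights.values())
probs = {name: w / total for name, w in weights.items()}
for name, p in sorted(probs.items(), key=lambda kv: -kv[1]):
    print(f"{name}: {p:.3f}")  # starcoder dominates this (partial) tail

The full mixture has many more entries (the hunks start near line 199 of each file), so these three probabilities are illustrative only.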
13 changes: 10 additions & 3 deletions config/llama3.1_tulu3_sft.yaml

@@ -30,24 +30,31 @@ trainer:
    type: wandb
    project: "marin"
    tags: ["dolma", "olmo", "llama"]
+  wandb:
+    project: "marin"
+    name: "llama3.1_tulu_sft_packed"

  mp: p=f32,c=bfloat16
  # same as 606 sft in marin
  train_batch_size: 128
  # number of steps until we hit stop iteration
-  num_train_steps: 1791 # 3,000,000,000,000 / 4,000,000 = 750,000
+  num_train_steps: 2574 # 3,000,000,000,000 / 4,000,000 = 750,000
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
  checkpointer:
    base_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/"
optimizer:
  learning_rate: 5e-6
  weight_decay: 0.0
  min_lr_ratio: 0.0
  lr_schedule: "linear"
  warmup: 0.03

-hf_save_steps: 1790
-hf_save_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/"
+hf_save_steps: 500
+hf_save_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/hf/"

initialize_from_hf: True
model_name_or_path: "meta-llama/Llama-3.1-8B"
epoch: 0
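Two things worth noting. First, hf_save_steps drops from 1790 (effectively once, at the end of the original 1791-step run) to 500, and the HF-format exports move into an hf/ subdirectory, keeping them apart from the Levanter checkpoints under the same base path. Second, the inline comment on num_train_steps is stale: it carries the 3T-tokens / 4M-tokens-per-step = 750,000-step arithmetic over from the pretraining configs, whereas this SFT run takes 2574 steps at batch size 128, i.e. roughly 2574 × 128 ≈ 330k packed sequences. The optimizer block pins down a simple schedule; below is a minimal sketch in plain Python of what those five settings describe, assuming warmup is a fraction of total steps and the linear decay ends at min_lr_ratio times the peak rate (a common convention, but treat the exact semantics as an assumption rather than something this diff confirms):

def lr_at_step(step: int,
               num_train_steps: int = 2574,
               learning_rate: float = 5e-6,
               warmup: float = 0.03,
               min_lr_ratio: float = 0.0) -> float:
    """Linear warmup, then linear decay, per the optimizer block above."""
    warmup_steps = int(warmup * num_train_steps)  # 0.03 * 2574 -> 77 steps
    min_lr = min_lr_ratio * learning_rate         # 0.0 -> decays all the way to zero
    if step < warmup_steps:
        # Ramp linearly from 0 up to the peak learning rate.
        return learning_rate * step / max(1, warmup_steps)
    # Decay linearly from the peak down to min_lr over the remaining steps.
    frac = (step - warmup_steps) / max(1, num_train_steps - warmup_steps)
    return learning_rate + frac * (min_lr - learning_rate)

With these values, warmup lasts about 77 steps and the rate reaches zero at step 2574.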
1 change: 0 additions & 1 deletion config/llama_7b_with_olmo_config_euwest4.yaml

@@ -24,7 +24,6 @@ trainer:
        until: 1000
      - every: 1000
        until: 40000
-python -m levanter.main.export_lm_to_hf --checkpoint_path "gs://marin-ckpt-eu-w4/checkpoints/olmo7b_seed0_datafix0/fgtbtvho/step-25" --output_dir "gs://marin-ckpt-eu-w4/checkpoints/olmo7b_seed0_datafix0/hf_100M_tokens"
  mp: p=f32,c=bfloat16
  train_batch_size: 2048
  num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000

The deleted line is a shell command (a levanter.main.export_lm_to_hf invocation) that had evidently been pasted into the YAML by accident; it belongs on a command line, and removing it keeps the config parseable.
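As a hedged aside on that removed command: the step-25 checkpoint is consistent with the hf_100M_tokens output name if each step consumes batch_size × seq_len tokens. The batch size of 2048 is in the config; the sequence length of 2048 is an assumption, since it does not appear in this diff.

# Rough token count behind the "hf_100M_tokens" directory name.
steps = 25         # checkpoint step-25 from the removed command
batch_size = 2048  # train_batch_size in this config
seq_len = 2048     # assumed; not stated anywhere in this diff
tokens_seen = steps * batch_size * seq_len
print(f"{tokens_seen:,} tokens")  # 104,857,600, i.e. about 100M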
