Commit 1fdeec5: update configs
ahmeda14960 committed Feb 5, 2025
1 parent: ecd7f48

Showing 4 changed files with 12 additions and 6 deletions.
2 changes: 1 addition & 1 deletion config/data/dolma_llama.yaml

@@ -199,4 +199,4 @@ train_weights:
  dolma/stackexchange: 19.6
  dolma/starcoder: 263.8
  dolma/wiki: 7.4
-vocab_size: null
+vocab_size: null

The removed and added lines are identical, so this is almost certainly a whitespace-only change, most likely adding a trailing newline at end of file.
2 changes: 1 addition & 1 deletion config/data/dolma_llama_euwest.yaml

@@ -197,4 +197,4 @@ train_weights:
  dolma/stackexchange: 19.6
  dolma/starcoder: 263.8
  dolma/wiki: 7.4
-vocab_size: null
+vocab_size: null

Same whitespace-only change as in dolma_llama.yaml above.
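Both files end with a train_weights mixture, of which only the tail is visible in these hunks. As a hedged illustration, here is how such relative weights are commonly turned into sampling probabilities; the normalization step is an assumption about how the trainer consumes them, not something this diff states:

# Tail of the train_weights mixture from the hunks above.
weights = {
    "dolma/stackexchange": 19.6,
    "dolma/starcoder": 263.8,
    "dolma/wiki": 7.4,
}
# Assumed convention: weights are relative and normalized to probabilities.
total = sum(weights.values())
probs = {name: w / total for name, w in weights.items()}
for name, p in sorted(probs.items(), key=lambda kv: -kv[1]):
    print(f"{name}: {p:.3f}")  # starcoder dominates this (partial) tail

The full mixture has many more entries (the hunks start near line 199 of each file), so these three probabilities are illustrative only.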
13 changes: 10 additions & 3 deletions config/llama3.1_tulu3_sft.yaml

@@ -30,24 +30,31 @@ trainer:
    type: wandb
    project: "marin"
    tags: ["dolma", "olmo", "llama"]
+  wandb:
+    project: "marin"
+    name: "llama3.1_tulu_sft_packed"

  mp: p=f32,c=bfloat16
  # same as 606 sft in marin
  train_batch_size: 128
  # number of steps until we hit stop iteration
-  num_train_steps: 1791 # 3,000,000,000,000 / 4,000,000 = 750,000
+  num_train_steps: 2574 # 3,000,000,000,000 / 4,000,000 = 750,000
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
  checkpointer:
    base_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/"
optimizer:
  learning_rate: 5e-6
  weight_decay: 0.0
  min_lr_ratio: 0.0
  lr_schedule: "linear"
  warmup: 0.03

-hf_save_steps: 1790
-hf_save_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/"
+hf_save_steps: 500
+hf_save_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/hf/"

initialize_from_hf: True
model_name_or_path: "meta-llama/Llama-3.1-8B"
epoch: 0
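Two things worth noting. First, hf_save_steps drops from 1790 (effectively once, at the end of the original 1791-step run) to 500, and the HF-format exports move into an hf/ subdirectory, keeping them apart from the Levanter checkpoints under the same base path. Second, the inline comment on num_train_steps is stale: it carries the 3T-tokens / 4M-tokens-per-step = 750,000-step arithmetic over from the pretraining configs, whereas this SFT run takes 2574 steps at batch size 128, i.e. roughly 2574 × 128 ≈ 330k packed sequences. The optimizer block pins down a simple schedule; below is a minimal sketch in plain Python of what those five settings describe, assuming warmup is a fraction of total steps and the linear decay ends at min_lr_ratio times the peak rate (a common convention, but treat the exact semantics as an assumption rather than something this diff confirms):

def lr_at_step(step: int,
               num_train_steps: int = 2574,
               learning_rate: float = 5e-6,
               warmup: float = 0.03,
               min_lr_ratio: float = 0.0) -> float:
    """Linear warmup, then linear decay, per the optimizer block above."""
    warmup_steps = int(warmup * num_train_steps)  # 0.03 * 2574 -> 77 steps
    min_lr = min_lr_ratio * learning_rate         # 0.0 -> decays all the way to zero
    if step < warmup_steps:
        # Ramp linearly from 0 up to the peak learning rate.
        return learning_rate * step / max(1, warmup_steps)
    # Decay linearly from the peak down to min_lr over the remaining steps.
    frac = (step - warmup_steps) / max(1, num_train_steps - warmup_steps)
    return learning_rate + frac * (min_lr - learning_rate)

With these values, warmup lasts about 77 steps and the rate reaches zero at step 2574.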
1 change: 0 additions & 1 deletion config/llama_7b_with_olmo_config_euwest4.yaml

@@ -24,7 +24,6 @@ trainer:
        until: 1000
      - every: 1000
        until: 40000
-python -m levanter.main.export_lm_to_hf --checkpoint_path "gs://marin-ckpt-eu-w4/checkpoints/olmo7b_seed0_datafix0/fgtbtvho/step-25" --output_dir "gs://marin-ckpt-eu-w4/checkpoints/olmo7b_seed0_datafix0/hf_100M_tokens"
  mp: p=f32,c=bfloat16
  train_batch_size: 2048
  num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000

The deleted line is a shell command (a levanter.main.export_lm_to_hf invocation) that had evidently been pasted into the YAML by accident; it belongs on a command line, and removing it keeps the config parseable.
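As a hedged aside on that removed command: the step-25 checkpoint is consistent with the hf_100M_tokens output name if each step consumes batch_size × seq_len tokens. The batch size of 2048 is in the config; the sequence length of 2048 is an assumption, since it does not appear in this diff.

# Rough token count behind the "hf_100M_tokens" directory name.
steps = 25         # checkpoint step-25 from the removed command
batch_size = 2048  # train_batch_size in this config
seq_len = 2048     # assumed; not stated anywhere in this diff
tokens_seen = steps * batch_size * seq_len
print(f"{tokens_seen:,} tokens")  # 104,857,600, i.e. about 100M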
