Commit

add configs
ahmeda14960 committed Feb 3, 2025
1 parent 75bcaee commit 0536c63
Showing 3 changed files with 19 additions and 9 deletions.
10 changes: 7 additions & 3 deletions config/debug_sft.yaml → config/llama3.1_tulu3_sft.yaml
@@ -12,10 +12,10 @@ model: # 7B class model
type: llama
seq_len: 4096
hidden_dim: 4096
intermediate_dim: 11008
intermediate_dim: 14336
num_layers: 32
num_heads: 32
num_kv_heads: 32
num_kv_heads: 8
use_flash_attention: True
flash_attention_block_size: 512
use_bias: false
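
A note on the model-block changes above: intermediate_dim 11008 → 14336 and num_kv_heads 32 → 8 move the config from a Llama-2-7B-style shape to the Llama 3.1 8B shape with grouped-query attention (each of the 8 KV heads is shared by 4 of the 32 query heads), so the untouched "# 7B class model" comment is now slightly stale. A minimal sanity-check sketch of the parameter count this block implies, assuming the Llama 3 vocabulary size of 128,256 (the vocabulary is not part of this config and is an assumption here):

# Rough parameter count implied by the new model block (Python sketch, not part of the commit).
hidden, inter, layers, heads, kv_heads = 4096, 14336, 32, 32, 8
head_dim = hidden // heads                       # 128
vocab = 128_256                                  # assumption: Llama 3 tokenizer size

attn = (hidden * heads * head_dim                # Q projection
        + 2 * hidden * kv_heads * head_dim       # K and V projections
        + heads * head_dim * hidden)             # output projection
mlp = 3 * hidden * inter                         # gate, up, down projections
embed = vocab * hidden                           # input embeddings only

total = layers * (attn + mlp) + embed
print(f"{total / 1e9:.2f}B")                     # ~7.50B; an untied LM head adds ~0.53B more, landing near 8B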
@@ -31,7 +31,8 @@ trainer:
mp: p=f32,c=bfloat16
# same as 606 sft in marin
train_batch_size: 128
num_train_steps: 7335 # 3,000,000,000,000 / 4,000,000 = 750,000
# number of steps until we hit stop iteration
num_train_steps: 1791
steps_per_eval: 1000
tensor_parallel_axes: ["mlp", "heads"]
fsdp_axis: "embed"
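
The num_train_steps change above replaces the pretraining-era step count (7335, whose "3,000,000,000,000 / 4,000,000 = 750,000" comment was already stale) with the number of optimizer steps one pass over the packed SFT data provides, as the new comment says. A small sketch of that arithmetic; the packed-sequence count is inferred from the configured values rather than taken from the commit:

import math

# Sketch (not from the commit): steps for one pass over the packed data.
train_batch_size = 128

def steps_for_one_epoch(num_packed_sequences: int) -> int:
    return math.ceil(num_packed_sequences / train_batch_size)

# Working backwards, num_train_steps = 1791 implies roughly 1791 * 128 = 229,248
# packed 4096-token sequences in this SFT mixture.
print(steps_for_one_epoch(229_248))  # 1791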
@@ -43,4 +44,7 @@ optimizer:
lr_schedule: "linear"
warmup: 0.03

hf_save_steps: 1790
hf_save_path: "gs://levanter-checkpoints/marin/llama_3.1_tulusft/"

epoch: 0
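
The new hf_save_steps / hf_save_path keys export a Hugging Face–format checkpoint to GCS every 1790 steps, i.e. exactly once, just before the 1791-step run stops. A sketch of pulling that export down and loading it; the step-numbered subdirectory name is an assumption about the exporter's layout, so check the bucket contents first:

import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the exporter writes a step-numbered subdirectory under hf_save_path.
remote = "gs://levanter-checkpoints/marin/llama_3.1_tulusft/step-1790"

subprocess.run(["gsutil", "-m", "cp", "-r", remote, "/tmp/"], check=True)

model = AutoModelForCausalLM.from_pretrained("/tmp/step-1790")
tokenizer = AutoTokenizer.from_pretrained("/tmp/step-1790")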
11 changes: 8 additions & 3 deletions config/llama3_openthoughts_sft.yaml
@@ -12,10 +12,10 @@ model: # 7B class model
type: llama
seq_len: 4096
hidden_dim: 4096
intermediate_dim: 11008
intermediate_dim: 14336
num_layers: 32
num_heads: 32
num_kv_heads: 32
num_kv_heads: 8
use_flash_attention: True
flash_attention_block_size: 512
use_bias: false
@@ -31,7 +31,8 @@ trainer:
mp: p=f32,c=bfloat16
# same as 606 sft in marin
train_batch_size: 128
num_train_steps: 7335 # 3,000,000,000,000 / 4,000,000 = 750,000
# number of steps until we hit stop iteration
num_train_steps: 802
steps_per_eval: 1000
tensor_parallel_axes: ["mlp", "heads"]
fsdp_axis: "embed"
@@ -43,4 +44,8 @@ optimizer:
lr_schedule: "linear"
warmup: 0.03


hf_save_steps: 801
hf_save_path: "gs://levanter-checkpoints/marin/tulusft_openthoughtsft/"

epoch: 0
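
The OpenThoughts config mirrors the Tulu 3 one: same model block and batch size, with num_train_steps = 802 and a single HF export at step 801, one step before the run ends. A rough upper bound on the token budget this implies, assuming every batch is a fully packed 128 × 4096-token batch:

# Token budget sketch: an upper bound, since the final batches may be partially packed.
train_batch_size, seq_len, num_train_steps = 128, 4096, 802

tokens_per_step = train_batch_size * seq_len       # 524,288
total_tokens = tokens_per_step * num_train_steps
print(f"{total_tokens:,}")                         # 420,478,976 (~0.42B tokens)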
7 changes: 4 additions & 3 deletions
@@ -1,13 +1,14 @@
# Model configuration
model:
type: llama
seq_len: 2048
seq_len: 4096
hidden_dim: 4096
intermediate_dim: 11008
intermediate_dim: 14336
num_layers: 32
num_heads: 32
num_kv_heads: 32
num_kv_heads: 8
use_flash_attention: true
flash_attention_block_size: 512
use_bias: false
use_layer_norm_weight: false
initializer_range: 0.02
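
The third changed file gets the same Llama 3.1 8B shape and additionally doubles seq_len from 2048 to 4096. Grouped-query attention keeps the longer context affordable; a sketch of the per-sequence KV cache at the new settings, assuming bf16 (2 bytes per element):

# Per-sequence KV-cache size sketch (bf16 assumed, not part of the commit).
num_layers, num_kv_heads, head_dim, seq_len = 32, 8, 128, 4096

kv_bytes = 2 * num_layers * num_kv_heads * head_dim * seq_len * 2   # keys + values, 2 bytes each
print(f"{kv_bytes / 2**20:.0f} MiB")   # 512 MiB; with the old 32 KV heads it would be 2048 MiB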
