Improve OLMo2 roundtrip test diagnostics; size k_norm by kv_heads * head_size

ahmeda14960 · ahmeda14960 · commit 5dd98ea91ec0 · 2025-03-16T21:58:38.000-07:00
diff --git a/src/levanter/models/olmo.py b/src/levanter/models/olmo.py
@@ -265,28 +265,24 @@ def init(config: Olmo2Config, *, key) -> "Olmo2Attention":
         use_bias = config.attention_bias
         Embed = config.Embed
         QHeadsPerGroup = hax.Axis("q_heads_per_group", config.num_heads // config.num_kv_heads)
+        HeadSize = config.HeadSize
 
         k_q, k_k, k_v, k_o, k_q_norm, k_k_norm = jrandom.split(key, 6)
         q_proj = hnn.Linear.init(
-            In=Embed, Out=(config.KVHeads, QHeadsPerGroup, config.HeadSize), key=k_q, use_bias=use_bias, out_first=True
-        )
-        k_proj = hnn.Linear.init(
-            In=Embed, Out=(config.KVHeads, config.HeadSize), key=k_k, use_bias=use_bias, out_first=True
-        )
-        v_proj = hnn.Linear.init(
-            In=Embed, Out=(config.KVHeads, config.HeadSize), key=k_v, use_bias=use_bias, out_first=True
-        )
-        o_proj = hnn.Linear.init(
-            In=(config.Heads, config.HeadSize), Out=Embed, key=k_o, use_bias=use_bias, out_first=True
+            In=Embed, Out=(config.KVHeads, QHeadsPerGroup, HeadSize), key=k_q, use_bias=use_bias, out_first=True
         )
+        k_proj = hnn.Linear.init(In=Embed, Out=(config.KVHeads, HeadSize), key=k_k, use_bias=use_bias, out_first=True)
+        v_proj = hnn.Linear.init(In=Embed, Out=(config.KVHeads, HeadSize), key=k_v, use_bias=use_bias, out_first=True)
+        o_proj = hnn.Linear.init(In=(config.Heads, HeadSize), Out=Embed, key=k_o, use_bias=use_bias, out_first=True)
 
-        # OLMo2 uses normalization over the entire hidden dimension for q and k
+        # q_norm spans config.Embed; k_norm below spans kv_heads * head_size (presumably matching HF; TODO confirm)
         q_norm = Olmo2RMSNorm.init(
             config.Embed, eps=config.layer_norm_epsilon, use_weight=config.use_layer_norm_weight
         )
-        k_norm = Olmo2RMSNorm.init(
-            config.Embed, eps=config.layer_norm_epsilon, use_weight=config.use_layer_norm_weight
-        )
+
+        # k_norm gets its own flattened axis (KVEmbedSize) of size kv_heads * head_size
+        KVEmbedSize = Axis("kv_embed", config.KVHeads.size * HeadSize.size)
+        k_norm = Olmo2RMSNorm.init(KVEmbedSize, eps=config.layer_norm_epsilon, use_weight=config.use_layer_norm_weight)
 
         return Olmo2Attention(config, q_proj, k_proj, v_proj, o_proj, q_norm, k_norm)
 
diff --git a/tests/test_olmo.py b/tests/test_olmo.py
@@ -214,20 +214,31 @@ def test_olmo2_roundtrip(scan_layers, num_kv_heads):
     torch_out = torch_model(input_torch)
     torch_out = torch_out.logits[0].detach().cpu().numpy()
 
-    # Add this before the roundtrip test fails
-    print("HF model params:", {k: v.shape for k, v in torch_model.state_dict().items() if "layers.0" in k})
-    print(
-        "Levanter model expected:",
-        {
-            "layers.0.mlp.gate_proj.weight": (config.hidden_dim, config.intermediate_dim),
-            "layers.0.mlp.up_proj.weight": (config.hidden_dim, config.intermediate_dim),
-            "layers.0.mlp.down_proj.weight": (config.intermediate_dim, config.hidden_dim),
-        },
-    )
     with tempfile.TemporaryDirectory() as tmpdir:
         # Save HF model
         torch_model.save_pretrained(f"{tmpdir}/torch_model")
 
+        # Debug aid: print the saved HF checkpoint's layer-0 parameter shapes before loading it into our model
+        torch_state_dict = torch.load(f"{tmpdir}/torch_model/pytorch_model.bin")
+        print("\nDetailed HF Shapes:")
+        for k, v in torch_state_dict.items():
+            if "layers.0" in k:
+                print(f"{k}: {v.shape}")
+
+        # Create a template model to inspect
+        template_model = Olmo2LMHeadModel.init(Vocab=Vocab, config=config, key=random.PRNGKey(0))
+        print("\nLevanter Model Parameter Structure:")
+        for layer_idx in range(config.num_layers):
+            print(f"Layer {layer_idx}:")
+
+            # Print the attention module's parameter shapes
+            attn = template_model.transformer.layers.blocks[layer_idx].self_attn
+            print(f"  q_proj: {attn.q_proj.weight.array.shape}")
+            print(f"  k_proj: {attn.k_proj.weight.array.shape}")
+            print(f"  v_proj: {attn.v_proj.weight.array.shape}")
+            print(f"  o_proj: {attn.o_proj.weight.array.shape}")
+            print(f"  q_norm: {attn.q_norm.weight.array.shape if attn.q_norm.weight is not None else None}")
+            print(f"  k_norm: {attn.k_norm.weight.array.shape if attn.k_norm.weight is not None else None}")
         # Load into our model
         model = converter.load_pretrained(
             Olmo2LMHeadModel, ref=f"{tmpdir}/torch_model", resize_vocab_to_match_tokenizer=False