@@ -21,14 +21,17 @@ class DoConfig:
   N: int  # number of transformer block layers
   V: int  # vocab size
   F: int  # FF inner dimension
-  kernel_init: nn.initializers.Initializer = nn.initializers.xavier_uniform()
-  embed_init: nn.initializers.Initializer = nn.initializers.variance_scaling(
-      1.0, 'fan_in', 'normal', out_axis=0
-  )
+  attention_init: nn.initializers.Initializer = nn.initializers.normal(stddev=0.02)
+  linear_init: nn.initializers.Initializer = nn.initializers.normal(stddev=0.02)
+  embed_init: nn.initializers.Initializer = nn.initializers.normal(stddev=0.02)
+  use_residual_scaling: bool = True
   dtype: jnp.dtype = jnp.float32
   rmsnorm_epsilon: float = 1e-6
   multiple_of: int = 256
-  tie_embeddings: bool = True  # Whether to tie input and output embeddings
+  tie_embeddings: bool = True  # Whether to tie input and output embed
+
+  def __post_init__(self):
+    self.residual_init = nn.initializers.normal(stddev=0.02 / jnp.sqrt(2 * self.N))
 
 
 class Mlp(nn.Module):
@@ -40,9 +43,8 @@ class Mlp(nn.Module):
   def __call__(self, x_BxLxD: jax.Array):
     cfg = self.cfg
-    # Use Xavier uniform initialization explicitly
+    # Use the initializers configured on DoConfig
-    xavier_init = nn.initializers.xavier_uniform()
     linear = partial(
-        nn.Dense, kernel_init=xavier_init, use_bias=False, dtype=cfg.dtype
+        nn.Dense, kernel_init=cfg.linear_init, use_bias=False, dtype=cfg.dtype
     )
     # Adjust hidden dimension to keep the number of parameters invariant to
     # the activation function used since the GLU MLP has 3 * hidden_dim * D
@@ -55,7 +57,7 @@ def __call__(self, x_BxLxD: jax.Array):
     x_BxLx2F = linear(2 * hidden_dim)(x_BxLxD)
     # Apply GLU activation
     x_BxLxF = nn.glu(x_BxLx2F, axis=-1)
-    x_BxLxD = linear(cfg.D)(x_BxLxF)
+    x_BxLxD = nn.Dense(cfg.D, use_bias=False, dtype=cfg.dtype, kernel_init=cfg.residual_init if cfg.use_residual_scaling else cfg.linear_init)(x_BxLxF)
     return x_BxLxD
 
 
@@ -122,7 +124,7 @@ def setup(self):
         nn.DenseGeneral,
         axis=-1,
         features=(cfg.H, self.Dh),
-        kernel_init=cfg.kernel_init,
+        kernel_init=cfg.attention_init,
         use_bias=False,
         dtype=cfg.dtype,
     )
@@ -134,7 +136,7 @@ def setup(self):
         features=cfg.D,
         name='attn_out_proj',
         # axis=(-2, -1), #
-        kernel_init=cfg.kernel_init,
+        kernel_init=cfg.residual_init if cfg.use_residual_scaling else cfg.linear_init,
         use_bias=False,
         dtype=cfg.dtype,
     )
@@ -265,6 +267,9 @@ def predict(self, y_BxL: jax.Array, k: int = 1):
 
     # Get the logits for the last token in each sequence
     next_token_logits = logits[:, -1, :]
+    last_token_id = y_BxL[:, -1]
+    # Prevent predicting the same token consecutively
+    next_token_logits = next_token_logits.at[jnp.arange(len(last_token_id)), last_token_id].set(float('-inf'))
 
     # Get the most likely token
     next_token = jnp.argmax(next_token_logits, axis=-1)
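
For reference, here is a minimal, self-contained sketch of the depth-scaled initialization these hunks introduce: ordinary weight matrices are drawn with stddev 0.02, while projections that write into the residual stream use 0.02 / sqrt(2N), which matches the GPT-2 recipe for keeping the residual-stream variance roughly constant with depth. TinyCfg and TinyBlock below are hypothetical stand-ins for illustration only, not the repo's DoConfig/Mlp/attention modules.

import dataclasses

import jax
import jax.numpy as jnp
from flax import linen as nn


@dataclasses.dataclass
class TinyCfg:  # hypothetical stand-in for DoConfig
  D: int = 64             # model width
  N: int = 12             # number of transformer blocks
  base_std: float = 0.02  # stddev for ordinary weight matrices

  def __post_init__(self):
    # Residual-branch projections get stddev 0.02 / sqrt(2N): each of the N
    # blocks writes into the residual stream twice (attention out-proj, MLP out-proj).
    self.linear_init = nn.initializers.normal(stddev=self.base_std)
    self.residual_init = nn.initializers.normal(
        stddev=self.base_std / jnp.sqrt(2 * self.N))


class TinyBlock(nn.Module):  # hypothetical toy block, not the repo's code
  cfg: TinyCfg

  @nn.compact
  def __call__(self, x_BxLxD: jax.Array):
    cfg = self.cfg
    h = nn.Dense(4 * cfg.D, use_bias=False, kernel_init=cfg.linear_init)(x_BxLxD)
    h = nn.gelu(h)
    # Only the projection writing back into the residual stream is depth-scaled.
    h = nn.Dense(cfg.D, use_bias=False, kernel_init=cfg.residual_init)(h)
    return x_BxLxD + h


cfg = TinyCfg()
params = TinyBlock(cfg).init(jax.random.PRNGKey(0), jnp.zeros((2, 8, cfg.D)))

This mirrors the flag-guarded choices in the diff: the attention output projection and the MLP down-projection pick residual_init when use_residual_scaling is set, while QKV, the MLP up-projection, and the embeddings keep the plain 0.02 normal initializer.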
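And a small sketch of the repetition mask added to predict(): the logit of each sequence's most recent token is set to -inf before the argmax, so greedy decoding can never emit the same token twice in a row. The shapes (B, L, V) and names y_BxL / next_token_logits follow the diff; the logits here are random stand-ins rather than model output.

import jax
import jax.numpy as jnp

B, L, V = 2, 5, 11                                   # toy batch, length, vocab sizes
key_ids, key_logits = jax.random.split(jax.random.PRNGKey(0))
y_BxL = jax.random.randint(key_ids, (B, L), 0, V)    # token ids decoded so far
logits_BxLxV = jax.random.normal(key_logits, (B, L, V))

next_token_logits = logits_BxLxV[:, -1, :]           # (B, V) logits at the last position
last_token_id = y_BxL[:, -1]                         # (B,) most recent token per sequence

# Functional update: -inf on each sequence's previously emitted token.
next_token_logits = next_token_logits.at[
    jnp.arange(B), last_token_id].set(-jnp.inf)

next_token = jnp.argmax(next_token_logits, axis=-1)  # (B,) greedy pick
assert not bool(jnp.any(next_token == last_token_id))

Note that this only blocks an immediate repeat; longer repetition loops (A B A B ...) would still need a window-based mask or a frequency/repetition penalty.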