Commit 8bc2b7a

Merge branch 'main' into push_causallm
2 parents: 9f46d77 + 22356c0

10 files changed: +36 -36 lines
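
The change is mechanical: every eqx.static_field() declaration is rewritten as eqx.field(static=True), the spelling current Equinox versions recommend (static_field has been deprecated in favor of field(static=True)). A minimal sketch of the pattern, assuming only standard Equinox behavior; the Mlp class, its field names, and the tanh activation below are illustrative, not taken from the diff:

    import equinox as eqx
    import jax.numpy as jnp
    from typing import Callable

    class Mlp(eqx.Module):
        weight: jnp.ndarray                     # array leaf, traced by JAX
        # old spelling: act: Callable = eqx.static_field()
        act: Callable = eqx.field(static=True)  # static metadata, kept out of the pytree leaves

    mlp = Mlp(weight=jnp.ones((2, 2)), act=jnp.tanh)

Static fields live in the treedef rather than the leaves, so behavior under jit, grad, and checkpointing is unchanged by the rename; only the declaration syntax differs.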

src/levanter/models/attention.py (+1 -1)

@@ -581,7 +581,7 @@ class AttentionMask(eqx.Module):
     """

-    is_causal: bool = eqx.static_field()
+    is_causal: bool = eqx.field(static=True)
     explicit_mask: Optional[NamedArray] = None
     segment_ids: Optional[NamedArray] = None
     # CF https://github.com/jax-ml/jax/blob/47858c4ac2fd4757a3b6fc5bb2981b71a71f00c2/jax/experimental/pallas/ops/tpu/flash_attention.py#L34

src/levanter/models/backpack.py (+5 -5)

@@ -97,7 +97,7 @@ def from_hf_config(cls, hf_config: PretrainedConfig):
 class BackpackMlp(eqx.Module):
     c_fc: hnn.Linear  # projection from Embed to Intermediate (typically 4x Embed)
     c_proj: hnn.Linear  # projection from Intermediate to Embed
-    act: Callable = eqx.static_field()
+    act: Callable = eqx.field(static=True)

     @staticmethod
     def init(

@@ -134,7 +134,7 @@ class WeightsOnlyAttention(ModuleWithStateDictSerialization):
     """

     # No projection
-    config: Gpt2Config = eqx.static_field()
+    config: Gpt2Config = eqx.field(static=True)

     c_attn: hnn.Linear  # input projection from [embed] -> [(q, k, v), heads, head_dim]
     dropout: hnn.Dropout

@@ -225,7 +225,7 @@ class BackpackSenses(eqx.Module):
     ln: hnn.LayerNorm
     final_mlp: BackpackMlp

-    Pos: Axis = eqx.static_field()
+    Pos: Axis = eqx.field(static=True)

     @staticmethod
     def init(

@@ -266,8 +266,8 @@ def sense_embed(self, input_embeds, *, key):


 class BackpackGpt2Embeddings(eqx.Module):
-    Vocab: Axis = eqx.static_field()
-    config: Gpt2Config = eqx.static_field()
+    Vocab: Axis = eqx.field(static=True)
+    config: Gpt2Config = eqx.field(static=True)

     token_embeddings: NamedArray
     position_embeddings: NamedArray

src/levanter/models/gemma.py (+2 -2)

@@ -226,7 +226,7 @@ def __call__(self, x: NamedArray) -> NamedArray:


 class GemmaDecoderLayer(ModuleWithStateDictSerialization):
-    config: GemmaConfig = eqx.static_field()
+    config: GemmaConfig = eqx.field(static=True)
     self_attn: LlamaAttention
     mlp: LlamaMlp
     input_layernorm: GemmaRMSNorm

@@ -267,7 +267,7 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *,


 class GemmaTransformer(ModuleWithStateDictSerialization):
-    config: GemmaConfig = eqx.static_field()
+    config: GemmaConfig = eqx.field(static=True)
     layers: BlockFoldable[GemmaDecoderLayer]
     norm: GemmaRMSNorm

src/levanter/models/gpt2.py (+5 -5)

@@ -130,7 +130,7 @@ def flops_per_token(self, vocab_size: int) -> Optional[float]:
 class Gpt2Mlp(eqx.Module):
     c_fc: hnn.Linear  # projection from Embed to Intermediate (typically 4x Embed)
     c_proj: hnn.Linear  # projection from Intermediate to Embed
-    act: Callable = eqx.static_field()
+    act: Callable = eqx.field(static=True)

     @staticmethod
     def init(Embed: Axis, Mlp: Axis, activation_fn, *, key, use_bias: bool = True) -> "Gpt2Mlp":

@@ -153,7 +153,7 @@ def __call__(self, x: NamedArray, *, key=None):


 class Gpt2Attention(eqx.Module):
-    config: Gpt2Config = eqx.static_field()
+    config: Gpt2Config = eqx.field(static=True)

     c_attn: hnn.Linear  # input projection from [embed] -> [(q, k, v), heads, head_dim]
     c_proj: hnn.Linear  # output projection from [heads, head_dim] -> [embed]

@@ -246,7 +246,7 @@ def __call__(self, x: NamedArray, mask: Optional[AttentionMask | NamedArray], la


 class Gpt2Transformer(ModuleWithStateDictSerialization):
-    config: Gpt2Config = eqx.static_field()
+    config: Gpt2Config = eqx.field(static=True)
     blocks: Stacked[Gpt2Block]
     ln_f: hnn.LayerNorm

@@ -274,8 +274,8 @@ def _state_dict_key_map(self) -> Dict[str, Optional[str]]:


 class Gpt2Embeddings(ModuleWithStateDictSerialization, eqx.Module):
-    Vocab: Axis = eqx.static_field()
-    config: Gpt2Config = eqx.static_field()
+    Vocab: Axis = eqx.field(static=True)
+    config: Gpt2Config = eqx.field(static=True)

     token_embeddings: hnn.Embedding
     position_embeddings: hnn.Embedding

src/levanter/models/llama.py (+8 -8)

@@ -181,7 +181,7 @@ class LlamaMlp(eqx.Module):
     gate_proj: hnn.Linear  # projection from Embed to Mlp
     up_proj: hnn.Linear  # projection from Embed to Mlp
     down_proj: hnn.Linear  # projection from Mlp to Embed
-    act: Callable = eqx.static_field()
+    act: Callable = eqx.field(static=True)

     @staticmethod
     def init(

@@ -207,7 +207,7 @@ def __call__(self, x: NamedArray, *, key=None) -> NamedArray:


 class LlamaAttention(eqx.Module):
-    config: LlamaConfig = eqx.static_field()
+    config: LlamaConfig = eqx.field(static=True)
     q_proj: hnn.Linear  # projection from Embed to query
     k_proj: hnn.Linear  # projection from Embed to key
     v_proj: hnn.Linear  # projection from Embed to value

@@ -276,12 +276,12 @@ class LlamaRMSNorm(eqx.Module):
     Similar to LayerNorm, but uses the RMS of the input along the specified axis (or axes) instead of variance.
     """

-    axis: AxisSpec = eqx.static_field()
+    axis: AxisSpec = eqx.field(static=True)
     weight: Optional[NamedArray]
     bias: Optional[NamedArray]

-    eps: float = eqx.static_field(default=1e-5)
-    dtype: Optional[jnp.dtype] = eqx.static_field(default=jnp.float32)
+    eps: float = eqx.field(static=True, default=1e-5)
+    dtype: Optional[jnp.dtype] = eqx.field(static=True, default=jnp.float32)

     @staticmethod
     def init(axis: AxisSpec, eps: float = 1e-6, use_weight: bool = True, use_bias: bool = True, dtype=jnp.float32):

@@ -316,7 +316,7 @@ def __call__(self, x: NamedArray) -> NamedArray:


 class LlamaDecoderLayer(eqx.Module):
-    config: LlamaConfig = eqx.static_field()
+    config: LlamaConfig = eqx.field(static=True)
     self_attn: LlamaAttention
     mlp: LlamaMlp
     input_layernorm: LlamaRMSNorm

@@ -357,7 +357,7 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *,


 class LlamaTransformer(eqx.Module):
-    config: LlamaConfig = eqx.static_field()
+    config: LlamaConfig = eqx.field(static=True)
     layers: BlockFoldable[LlamaDecoderLayer]
     norm: LlamaRMSNorm

@@ -392,7 +392,7 @@ class LlamaEmbedding(ModuleWithStateDictSerialization, eqx.Module):
     - Llama doesn't use dropout.
     """

-    Vocab: Axis = eqx.static_field()
+    Vocab: Axis = eqx.field(static=True)
     token_embeddings: hnn.Embedding

     @staticmethod
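
The LlamaRMSNorm hunk above also covers the keyword form: eqx.static_field(default=1e-5) becomes eqx.field(static=True, default=1e-5), with the default passed through unchanged. A quick sketch under that assumption (the Norm class below is illustrative, not Levanter's LlamaRMSNorm), showing that a static field with a default stays out of the pytree leaves:

    import equinox as eqx
    import jax
    import jax.numpy as jnp

    class Norm(eqx.Module):
        weight: jax.Array
        eps: float = eqx.field(static=True, default=1e-5)  # static, with a default value

    norm = Norm(weight=jnp.ones(4))
    print(norm.eps)                         # 1e-5, taken from the field default
    print(jax.tree_util.tree_leaves(norm))  # only `weight`; eps lives in the treedef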

src/levanter/models/qwen.py (+3 -3)

@@ -117,7 +117,7 @@ def flops_per_token(self, vocab_size: int):

 # Modified attention class for Qwen
 class QwenAttention(eqx.Module):
-    config: QwenConfig = eqx.static_field()
+    config: QwenConfig = eqx.field(static=True)
     q_proj: hnn.Linear
     k_proj: hnn.Linear
     v_proj: hnn.Linear

@@ -201,7 +201,7 @@ def __call__(

 # Modified decoder layer for Qwen
 class QwenDecoderLayer(eqx.Module):
-    config: QwenConfig = eqx.static_field()
+    config: QwenConfig = eqx.field(static=True)
     self_attn: QwenAttention
     mlp: LlamaMlp  # Can reuse Llama MLP as structure is similar
     input_layernorm: LlamaRMSNorm

@@ -242,7 +242,7 @@ def __call__(self, x: NamedArray, mask: Optional[NamedArray | AttentionMask], *,

 # Modified transformer for Qwen
 class QwenTransformer(LlamaTransformer):
-    config: QwenConfig = eqx.static_field()
+    config: QwenConfig = eqx.field(static=True)
     layers: BlockFoldable[QwenDecoderLayer]
     norm: LlamaRMSNorm

src/levanter/models/whisper.py (+6 -6)

@@ -123,7 +123,7 @@ def from_hf_config(cls, hf_config: HfConfig):
 class WhisperMlp(eqx.Module):
     fc1: hnn.Linear  # projection from Embed to Intermediate (typically 4x Embed)
     fc2: hnn.Linear  # projection from Intermediate to Embed
-    act: Callable = eqx.static_field()
+    act: Callable = eqx.field(static=True)

     @staticmethod
     def init(Embed: Axis, Mlp: Axis, activation_fn, *, key, use_bias: bool = True) -> "WhisperMlp":

@@ -146,7 +146,7 @@ def __call__(self, x: NamedArray, *, key=None):


 class WhisperAttention(eqx.Module):
-    config: WhisperConfig = eqx.static_field()
+    config: WhisperConfig = eqx.field(static=True)

     q_proj: hnn.Linear  # input projection from [embed] -> [q, heads, head_dim]
     k_proj: hnn.Linear  # input projection from [embed] -> [k, heads, head_dim]

@@ -296,10 +296,10 @@ def __call__(


 class WhisperEncoder(ModuleWithStateDictSerialization):
-    config: WhisperConfig = eqx.static_field()
+    config: WhisperConfig = eqx.field(static=True)
     conv1: hnn.Conv
     conv2: hnn.Conv
-    act: Callable = eqx.static_field()
+    act: Callable = eqx.field(static=True)

     transformer: WhisperTransformer

@@ -350,8 +350,8 @@ def _state_dict_key_map(self) -> Dict[str, Optional[str]]:


 class WhisperDecoderEmbeddings(eqx.Module):
-    Vocab: Axis = eqx.static_field()
-    config: WhisperConfig = eqx.static_field()
+    Vocab: Axis = eqx.field(static=True)
+    config: WhisperConfig = eqx.field(static=True)

     token_embeddings: hnn.Embedding
     position_embeddings: hnn.Embedding

src/levanter/optim/model_averaging.py (+1 -1)

@@ -33,7 +33,7 @@ class EmaModelAveraging(ModelAveraging[M]):
     """

     model: M
-    beta: float = eqx.static_field()
+    beta: float = eqx.field(static=True)

     def update(self: S, new_model: M, step: int) -> S:
         del step

tests/test_grad_accum.py (+3 -3)

@@ -18,9 +18,9 @@ class Mlp(eqx.Module):

     w_in: hax.NamedArray
     w_out: hax.NamedArray
-    In: hax.Axis = eqx.static_field()
-    Out: hax.Axis = eqx.static_field()
-    Mid: hax.Axis = eqx.static_field()
+    In: hax.Axis = eqx.field(static=True)
+    Out: hax.Axis = eqx.field(static=True)
+    Mid: hax.Axis = eqx.field(static=True)

     @staticmethod
     def init(In: hax.Axis, Out: hax.Axis, Mid: hax.Axis, *, key):

tests/test_utils.py (+2 -2)

@@ -33,8 +33,8 @@ class MLP(eqx.Module):
     """slightly less annoying MLP, used for testing purposes"""

     layers: List[nn.Linear]
-    activation: Callable = eqx.static_field()
-    final_activation: Callable = eqx.static_field()
+    activation: Callable = eqx.field(static=True)
+    final_activation: Callable = eqx.field(static=True)
     in_size: int = static_field()
     out_size: int = static_field()
     width_size: int = static_field()
