 from jax.sharding import PartitionSpec as P
 from utils import HasCache, TrainState
 
-from flax import linen as nn
 from flax import nnx
 from flax.training import checkpoints, common_utils
 
@@ -115,7 +114,7 @@ def compute_weighted_cross_entropy(
       targets, vocab_size, on_value=confidence, off_value=low_confidence
   )
 
-  loss = -jnp.sum(soft_targets * nn.log_softmax(logits), axis=-1)
+  loss = -jnp.sum(soft_targets * nnx.log_softmax(logits), axis=-1)
   loss = loss - normalizing_constant
 
   normalizing_factor = np.prod(targets.shape)
@@ -191,9 +190,9 @@ def train_step(
 
   dropout_rng = jax.random.fold_in(dropout_rng, state.step)
 
-  def loss_fn(params):
+  def loss_fn(params, other_variables):
     """loss function used for training."""
-    module = nnx.merge(state.graphdef, params)
+    module = nnx.merge(state.graphdef, params, other_variables)
     module.set_attributes(deterministic=False, decode=False)
     logits = module(
         inputs,
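
This is the core NNX pattern the change relies on: split a module into a static graphdef plus multiple state partitions, differentiate only the `nnx.Param` partition, and merge everything back inside the loss. A minimal sketch of that round trip, using a toy `nnx.Linear` rather than the example's `TransformerLM` (names here are illustrative):

```python
import jax
import jax.numpy as jnp
from flax import nnx

model = nnx.Linear(4, 4, rngs=nnx.Rngs(0))

# Split trainable parameters from everything else; the trailing `...`
# filter collects whatever the earlier filters did not match.
graphdef, params, other_variables = nnx.split(model, nnx.Param, ...)

def loss_fn(params, other_variables):
  # Rebuild a fully functional module from the static graph definition
  # plus both state partitions, as train_step's loss_fn does above.
  module = nnx.merge(graphdef, params, other_variables)
  out = module(jnp.ones((2, 4)))
  return (out**2).mean()

# Differentiating with respect to the first argument leaves
# other_variables untouched, which is why it is threaded through separately.
grads = jax.grad(loss_fn)(params, other_variables)
```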
@@ -211,7 +210,7 @@ def loss_fn(params):
   step = state.step
   lr = learning_rate_fn(step)
   grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
-  (_, logits), grads = grad_fn(state.params)
+  (_, logits), grads = grad_fn(state.params, state.other_variables)
   new_state = state.apply_gradients(grads=grads)
   metrics = compute_metrics(logits, inputs, weights)
   metrics['learning_rate'] = lr
@@ -221,14 +220,15 @@ def loss_fn(params):
 
 def eval_step(
     params: nnx.State,
+    other_variables: nnx.State,
     batch,
     graphdef: nnx.GraphDef[models.TransformerLM],
     label_smoothing=0.0,
 ):
   """Calculate evaluation metrics on a batch."""
   inputs = batch['inputs']
   weights = jnp.where(inputs > 0, 1.0, 0.0)
-  module = nnx.merge(graphdef, params)
+  module = nnx.merge(graphdef, params, other_variables)
   module.set_attributes(deterministic=True, decode=False)
   logits = module(inputs)
 
@@ -238,6 +238,7 @@ def eval_step(
 def predict_step(
     inputs,
     params: nnx.State,
+    other_variables: nnx.State,
     rngkey: jax.Array,
     graphdef: nnx.GraphDef[models.TransformerLM],
     eos_id: int,
@@ -247,20 +248,20 @@ def predict_step(
     top_k: int,
 ):
   """Predict language model on a batch."""
-  module = nnx.merge(graphdef, params)
+  module = nnx.merge(graphdef, params, other_variables)
 
   # TODO(cgarciae): check how pytorch does this.
   for _path, m in module.iter_modules():
     if isinstance(m, HasCache):
       input_shape = (inputs.shape[0], max_decode_len, config.emb_dim)
       m.init_cache(input_shape, dtype=config.dtype)
 
-  graphdef, params, cache = nnx.split(module, nnx.Param, nnx.Cache)
+  graphdef, params, cache, other_variables = nnx.split(module, nnx.Param, nnx.Cache, ...)
 
   def tokens_ids_to_logits(flat_ids, cache: nnx.State):
     """Token slice to logits from decoder model."""
     # --> [batch * beam, 1, vocab]
-    module = nnx.merge(graphdef, params, cache)
+    module = nnx.merge(graphdef, params, cache, other_variables)
     module.set_attributes(deterministic=True, decode=True)
     logits = module(flat_ids)
     cache = nnx.state(module, nnx.Cache)
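
After `init_cache` has created `nnx.Cache` variables, the module is re-split with three filters plus `...`, so the decode loop can carry the cache forward while params and any leftover variables stay fixed. A toy illustration of what each of the four results holds (the module here is hypothetical, not the example's decoder):

```python
import jax.numpy as jnp
from flax import nnx

class ToyDecoder(nnx.Module):
  def __init__(self):
    self.w = nnx.Param(jnp.ones((2, 2)))    # trainable weight
    self.kv = nnx.Cache(jnp.zeros((2, 2)))  # decode-time cache

m = ToyDecoder()

# graphdef: static structure; params: nnx.Param leaves; cache: nnx.Cache
# leaves; rest: anything else (e.g. batch stats or RNG state).
graphdef, params, cache, rest = nnx.split(m, nnx.Param, nnx.Cache, ...)

# tokens_ids_to_logits above re-merges all four pieces each step and then
# pulls the updated cache back out with nnx.state(module, nnx.Cache).
m2 = nnx.merge(graphdef, params, cache, rest)
```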
@@ -313,7 +314,7 @@ def evaluate(
   eval_iter = iter(eval_ds)  # pytype: disable=wrong-arg-types
   for _, eval_batch in zip(range(num_eval_steps), eval_iter):
     eval_batch = jax.tree.map(lambda x: x._numpy(), eval_batch)  # pylint: disable=protected-access
-    metrics = jit_eval_step(state.params, eval_batch, state.graphdef)
+    metrics = jit_eval_step(state.params, state.other_variables, eval_batch, state.graphdef)
     eval_metrics.append(metrics)
   eval_metrics = common_utils.stack_forest(eval_metrics)
   eval_metrics_sums = jax.tree.map(jnp.sum, eval_metrics)
@@ -330,6 +331,7 @@ def generate_prediction(
     jit_pred_step,
     graphdef: nnx.GraphDef[models.TransformerLM],
     params: nnx.State,
+    other_variables: nnx.State,
     tokenized_prompts,
     eos_id,
     inference_rng,
@@ -359,6 +361,7 @@ def generate_prediction(
       predicted = jit_pred_step(
           pred_batch,
           params,
+          other_variables,
           inference_rngs,
           graphdef,
           eos_id,
@@ -389,6 +392,7 @@ def train_and_evaluate(config: default.Config, workdir: str):
     workdir: Working directory for checkpoints and TF summaries. If this
       contains checkpoint training will be resumed from the latest checkpoint.
   """
+  workdir = os.path.abspath(workdir)
   tf.io.gfile.makedirs(workdir)
 
   vocab_path = config.vocab_path
@@ -440,18 +444,15 @@ def encode_strings(strs, max_len):
       max_len=max(config.max_target_length, config.max_eval_target_length),
       dropout_rate=config.dropout_rate,
       attention_dropout_rate=config.attention_dropout_rate,
-      kernel_init=nn.initializers.xavier_uniform(),
-      bias_init=nn.initializers.normal(stddev=1e-6),
+      kernel_init=nnx.initializers.xavier_uniform(),
+      bias_init=nnx.initializers.normal(stddev=1e-6),
       axis_rules=config.axis_rules,
   )
 
   # Mesh definition
   devices_array = utils.create_device_mesh(config)
   mesh = Mesh(devices_array, config.mesh_axes)
 
-  # print(mesh.shape)
-  # exit()
-
   start_step = 0
   rng = jax.random.PRNGKey(config.seed)
   rng, init_rng = jax.random.split(rng)
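
The initializer swap is mechanical: `nnx.initializers` mirrors linen's `nn.initializers`, and each factory returns a function with the usual `init(key, shape, dtype)` signature. A quick sanity check, with shapes chosen only for illustration:

```python
import jax
import jax.numpy as jnp
from flax import nnx

kernel_init = nnx.initializers.xavier_uniform()
bias_init = nnx.initializers.normal(stddev=1e-6)

key = jax.random.PRNGKey(0)
w = kernel_init(key, (512, 2048), jnp.float32)  # fan-in/fan-out scaled weights
b = bias_init(key, (2048,), jnp.float32)        # near-zero biases
```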
@@ -498,18 +499,19 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
           None,
       ),  # type: ignore
       out_shardings=(state_sharding, None),  # type: ignore
-      static_argnums=(2, 3),
+      static_argnames=("learning_rate_fn", "label_smoothing"),
       donate_argnums=0,
   )
 
   jit_eval_step = jax.jit(
       eval_step,
       in_shardings=(
           state_sharding.params,
+          state_sharding.other_variables,
           data_sharding,
       ),  # type: ignore
       out_shardings=None,  # type: ignore
-      static_argnums=(2, 3),
+      static_argnames=("graphdef", "label_smoothing"),
   )
 
   # Since the inputs and rngkey args for predict_step will be batched,
520
522
in_axes = (
521
523
0 ,
522
524
jax .tree .map (lambda x : None , state .params ),
525
+ jax .tree .map (lambda x : None , state .other_variables ),
523
526
0 ,
524
527
None ,
525
528
None ,
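
`jax.vmap` accepts a pytree of axis specifications per argument, and mapping `None` over every leaf of `state.params` (and now `state.other_variables`) broadcasts the whole state to each batch element instead of slicing it along an axis. A small sketch with a hypothetical two-leaf parameter pytree:

```python
import jax
import jax.numpy as jnp

params = {'w': jnp.ones((4, 4)), 'b': jnp.zeros((4,))}

def apply(x, params):
  return x @ params['w'] + params['b']

# Batch over x (axis 0) while broadcasting params unchanged: the in_axes
# entry for a pytree argument may itself be a pytree of ints / None.
batched_apply = jax.vmap(
    apply, in_axes=(0, jax.tree.map(lambda _: None, params))
)
ys = batched_apply(jnp.ones((8, 4)), params)  # ys.shape == (8, 4)
```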
@@ -532,10 +535,11 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
       in_shardings=(
           data_sharding,
           state_sharding.params,
+          state_sharding.other_variables,
           data_sharding,
       ),  # type: ignore
       out_shardings=data_sharding,  # type: ignore
-      static_argnums=tuple(range(3, 9)),
+      static_argnums=tuple(range(4, 10)),
   )
 
   # Main Train Loop
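
Unlike the train/eval steps, `jit_pred_step` keeps positional `static_argnums`, so inserting `other_variables` as the third positional parameter pushes every static argument up by one: `range(3, 9)` (graphdef through top_k in the old signature) becomes `range(4, 10)`. A toy stand-in showing the same renumbering (the trailing parameters here are placeholders, not the example's full signature):

```python
import jax

def predict(inputs, params, other_variables, rngkey, graphdef, eos_id):
  # Only the first four arguments are traced; graphdef and eos_id
  # (positions 4 and 5) are treated as compile-time constants.
  del rngkey, graphdef, eos_id
  return inputs * params + other_variables

# Before other_variables existed these would have been positions 3 and 4.
jit_predict = jax.jit(predict, static_argnums=(4, 5))
```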
@@ -575,7 +579,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
         h(step)
 
       # Periodic metric handling.
-      if step % config.eval_every_steps == 0 or is_last_step:
+      if (step > 0 and step % config.eval_every_steps == 0) or is_last_step:
         with report_progress.timed('training_metrics'):
           logging.info('Gathering training metrics.')
           train_metrics = common_utils.stack_forest(train_metrics)
@@ -609,6 +613,7 @@ def constructor(config: models.TransformerConfig, key: jax.Array):
             jit_pred_step=jit_pred_step,
             graphdef=state.graphdef,
             params=state.params,
+            other_variables=state.other_variables,
             tokenized_prompts=tokenized_prompts,
             eos_id=eos_id,
             inference_rng=inference_rng,