This repository was archived by the owner on Dec 20, 2024. It is now read-only.

[Feature] Model Freezing ❄️ #189

Status: Open — wants to merge 59 commits into develop.

Changes from 1 commit (of 59).

Commits (all by icedoom888):
7e238e4  Introduced resume flag and checkpoint loading for transfer learning, …  (Oct 9, 2024)
08671d7  Added len of dataset computed dynamically  (Oct 10, 2024)
e2bd868  debugging validation  (Oct 22, 2024)
557a9f0  Merge branch 'develop' into feature/transfer-learning  (Oct 22, 2024)
544dddc  Small changes  (Oct 24, 2024)
a85619d  Removed prints  (Oct 25, 2024)
b87f795  Merge branch 'develop' into feature/transfer-learning  (Oct 25, 2024)
c8ce0b0  Not working  (Nov 18, 2024)
135eac5  small changes  (Nov 18, 2024)
3cebf18  Rebased on develop  (Nov 26, 2024)
db2a14f  Imputer changes  (Nov 26, 2024)
57f9026  Added sanitization of checkpoint, effective batch size, git pre-commit  (Nov 26, 2024)
039c16f  gpc  (Nov 26, 2024)
463c6a9  gpc  (Nov 26, 2024)
062f552  Merge branch 'develop' into feature/transfer-learning  (Nov 26, 2024)
2f4dd65  rebased on develop  (Nov 26, 2024)
c6d7519  New implementation: do not store modified checkpoint, load it directl…  (Nov 27, 2024)
bca0355  Added logging  (Nov 28, 2024)
aa6f207  Rebased on develop  (Nov 28, 2024)
7894cc0  Transfer learning working: implemented checkpoint cleaning with large…  (Nov 29, 2024)
eff4539  Reverted some changes concerning imputer issues  (Dec 3, 2024)
c1f854f  Reverted some changes concerning imputer issues  (Dec 3, 2024)
338387d  Cleaned code for final review  (Dec 3, 2024)
f739bf4  Changed changelog and assigned TODO correctly  (Dec 3, 2024)
7fd9a92  Changed changelog and assigned TODO correctly  (Dec 3, 2024)
315d59d  Merge branch 'develop' into feature/transfer-learning  (Dec 3, 2024)
1ac34d8  Addressed review: copy checkpoint before removing metadata file  (Dec 3, 2024)
b7697a1  Merge remote-tracking branch 'refs/remotes/origin/feature/transfer-le…  (Dec 3, 2024)
22ddeab  Merge branch 'develop' into feature/transfer-learning  (Dec 3, 2024)
0d4fa51  gpc passed  (Dec 3, 2024)
3265892  Removed logger in debugging mode  (Dec 4, 2024)
c325a9e  removed dataset length due to checkpointing issues  (Dec 5, 2024)
4709d46  Reintroduced correct config on graphtransformer  (Dec 5, 2024)
b0023f9  gpc passed  (Dec 5, 2024)
17d02f7  Merge branch 'develop' into feature/transfer-learning  (Dec 5, 2024)
6a8ac97  Removed patch for issue #57, code expects patched checkpoint already  (Dec 5, 2024)
4cd24bd  Merge branch 'develop' into feature/transfer-learning  (Dec 5, 2024)
355cca1  Removed new path name for patched checkpoint (ignoring fully issue #5…  (Dec 5, 2024)
b875ea0  Adapted changelog  (Dec 5, 2024)
b9b611b  Added Freezing functionality  (Dec 5, 2024)
0f0dff0  Added Freezing functionality  (Dec 5, 2024)
03c4adb  Tested ✅ waiting for transfer learning merge to happen  (Dec 6, 2024)
7d51c75  Switched logging to info from debug  (Dec 6, 2024)
7063407  Merge branch 'feature/transfer-learning' into feature/model_freezing  (Dec 6, 2024)
37f6090  Rebased on transfer learning develop  (Dec 6, 2024)
8c7d54c  GPC passed  (Dec 6, 2024)
4bce6f1  Changelog updated  (Dec 6, 2024)
bd32096  Completed merge and code check  (Dec 6, 2024)
da5fffb  Rebased on latest changes  (Dec 11, 2024)
6aac548  gpc  (Dec 11, 2024)
a7ab588  Merge branch 'develop' into feature/model_freezing  (Dec 17, 2024)
8478689  Changed docstring and pytorch lightning freeze  (Dec 17, 2024)
2eb2140  Addressed review  (Dec 17, 2024)
742a7a8  Changes for review  (Dec 17, 2024)
0b8a407  Refactor CHANGELOG  (Dec 17, 2024)
498a792  Merge branch 'develop' into feature/model_freezing  (Dec 18, 2024)
8797fb3  Rebased on develop  (Dec 18, 2024)
7705a7e  Added documentation  (Dec 18, 2024)
463bec4  Added documentation  (Dec 18, 2024)
Commit e2bd86804aec2501e627240613904cd791bb2464 — "debugging validation" (icedoom888, committed Oct 22, 2024)
src/anemoi/training/data/datamodule.py — 5 changes: 3 additions & 2 deletions

@@ -162,10 +162,11 @@ def _get_dataset(
         rollout: int = 1,
         label: str = "generic",
     ) -> NativeGridDataset:
+
         r = max(rollout, self.rollout)

-        # Compute effective batch size
-        effective_bs = self.config.dataloader.batch_size[label] *\
+        # Compute effective batch size
+        effective_bs = self.config.dataloader.batch_size['training'] *\
             self.config.hardware.num_gpus_per_node *\
             self.config.hardware.num_nodes //\
             self.config.hardware.num_gpus_per_model
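
The effective batch size computed above is the per-replica batch size scaled by the number of data-parallel model replicas: each model instance spans `num_gpus_per_model` GPUs, so the replica count is `num_gpus_per_node * num_nodes // num_gpus_per_model`. A worked example, using the config field names from the diff but entirely hypothetical values:

```python
# Hypothetical values, for illustration only.
batch_size = 2          # config.dataloader.batch_size['training']
num_gpus_per_node = 4
num_nodes = 2
num_gpus_per_model = 2  # each model instance is sharded across 2 GPUs

# 8 GPUs total / 2 GPUs per model = 4 data-parallel replicas, each with batch 2.
effective_bs = batch_size * num_gpus_per_node * num_nodes // num_gpus_per_model
print(effective_bs)  # 8
```

Note the change from `batch_size[label]` to `batch_size['training']`: the computation now always uses the training batch size, regardless of the dataset label passed in.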
src/anemoi/training/train/forecaster.py — 9 changes: 8 additions & 1 deletion

@@ -75,6 +75,8 @@ def __init__(
             config=DotDict(map_config_to_primitives(OmegaConf.to_container(config, resolve=True))),
         )

+        self.model = torch.compile(self.model)
+
         self.data_indices = data_indices

         self.save_hyperparameters()
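
A side note on the `torch.compile` call added above: in PyTorch 2.x it returns an optimized wrapper module rather than compiling in place, which is why the result is assigned back to `self.model`. One practical consequence, possibly related to the checkpoint-cleaning commits in this PR, is that the wrapper's `state_dict` keys gain a `_orig_mod.` prefix. A minimal sketch:

```python
import torch

model = torch.nn.Linear(8, 8)
compiled = torch.compile(model)        # wrapper; `model` itself is left unchanged
out = compiled(torch.randn(2, 8))      # first call triggers graph capture and compilation
print(list(compiled.state_dict())[0])  # '_orig_mod.weight'
```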
@@ -321,8 +323,11 @@ def on_train_epoch_end(self) -> None:
         self.rollout = min(self.rollout, self.rollout_max)

     def validation_step(self, batch: torch.Tensor, batch_idx: int) -> None:
+        print('I am doing validation!!!')
         with torch.no_grad():
             val_loss, metrics, y_preds = self._step(batch, batch_idx, validation_mode=True)
+        print('Done step..')
+        print('Logging..')
         self.log(
             "val_wmse",
             val_loss,
@@ -333,7 +338,8 @@ def validation_step(self, batch: torch.Tensor, batch_idx: int) -> None:
             batch_size=batch.shape[0],
             sync_dist=True,
         )
-        for mname, mvalue in metrics.items():
+        for i, (mname, mvalue) in enumerate(metrics.items()):
+            print(i)
             self.log(
                 "val_" + mname,
                 mvalue,
@@ -344,6 +350,7 @@ def validation_step(self, batch: torch.Tensor, batch_idx: int) -> None:
                 batch_size=batch.shape[0],
                 sync_dist=True,
             )
+        print('Done')
         return val_loss, y_preds

     def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict]]:
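
Finally, since the branch's goal is freezing, one detail that usually pairs with it (not shown in this commit) is building the optimizer over trainable parameters only, so frozen weights get no updates and no optimizer state. A minimal sketch, with an illustrative function name rather than the PR's actual `configure_optimizers` body:

```python
import torch


def build_optimizer(model: torch.nn.Module, lr: float = 1e-3) -> torch.optim.AdamW:
    """Create an optimizer that skips frozen (requires_grad=False) parameters."""
    trainable = (p for p in model.parameters() if p.requires_grad)
    return torch.optim.AdamW(trainable, lr=lr)
```

(Passing frozen parameters to the optimizer is usually harmless, since their gradients stay `None` and are skipped, but filtering them out keeps optimizer state and checkpoint size down.)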