
Commit 834feaa

Metric Logging, Fixes and Tests for full finetune Recipe (#304)
1 parent aaf43de commit 834feaa

File tree

5 files changed: +155 -6 lines changed

.gitignore (+3)

@@ -184,3 +184,6 @@ cover/
 
 # VSCode
 .vscode/
+
+# wandb
+wandb/

recipes/full_finetune.py (+42 -4)

@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import argparse
 import os
 import sys
 
@@ -65,18 +66,27 @@ def __init__(self, params: FullFinetuneParams) -> None:
 
         # logging attributes
         self._output_dir = params.output_dir
+        self._metric_logger = utils.get_metric_logger(
+            metric_logger_type=params.metric_logger_type,
+            project=params.project,
+            log_dir=params.output_dir,
+        )
+        self._log_every_n_steps = (
+            params.log_every_n_steps if params.log_every_n_steps else 1
+        )
 
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
         _, rank = utils.get_world_size_and_rank()
         self._is_rank_zero = rank == 0
 
         # These are public properties which are updated by the checkpoint loader
-        # when ``resume_from_checkpoint`` is `True`
+        # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = utils.set_seed(seed=params.seed)
         self.epochs_run = 0
         self.total_epochs = params.epochs
         self.max_steps_per_epoch = params.max_steps_per_epoch
+        self.total_training_steps = 0
 
         self._resume_from_checkpoint = params.resume_from_checkpoint
 
@@ -143,6 +153,20 @@ def setup(self, params: FullFinetuneParams) -> None:
         else:
             self._grad_scaler = GradScaler(enabled=False)
 
+        # Finally update the recipe state which can only be correctly set after all of the
+        # other components have been initialized and updated.
+
+        # Number of training steps in each epoch depends on the number of batches produced
+        # by the dataloader and the max_steps_per_epoch param set by the user and is used
+        # for logging and tracking training state. This should be computed after the dataloader
+        # has been setup
+        steps_per_epoch = len(self._dataloader)
+        if self.max_steps_per_epoch and self.max_steps_per_epoch < len(
+            self._dataloader
+        ):
+            steps_per_epoch = self.max_steps_per_epoch
+        self.total_training_steps = self.epochs_run * steps_per_epoch
+
     def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None:
         """
         Updates the recipe state from checkpoint.

@@ -333,6 +357,8 @@ def train(self) -> None:
                     and idx == self.max_steps_per_epoch
                 ):
                     break
+
+                self.total_training_steps += 1
                 self._optimizer.zero_grad()
 
                 input_ids, labels = batch

@@ -350,13 +376,26 @@
 
                 pbar.set_description(f"{curr_epoch+1}|{idx+1}|Loss: {loss.item()}")
 
+                if self.total_training_steps % self._log_every_n_steps == 0:
+                    self._metric_logger.log_dict(
+                        {
+                            "loss": loss.item(),
+                            "lr": self._optimizer.param_groups[0]["lr"],
+                            "gpu_resources": torch.cuda.memory_allocated(),
+                        },
+                        step=self.total_training_steps,
+                    )
+
                 self._grad_scaler.scale(loss).backward()
                 self._grad_scaler.step(self._optimizer)
                 self._grad_scaler.update()
 
             self.epochs_run += 1
             self.save_checkpoint(epoch=curr_epoch)
 
+    def cleanup(self) -> None:
+        self._metric_logger.close()
+
 
 def recipe_main() -> None:
     """

@@ -368,13 +407,11 @@ def recipe_main() -> None:
        - Overwritten by arguments from the command-line using ``TuneArgumentParser``
    """
    parser = utils.TuneArgumentParser(
-        description=recipe.__doc__,
+        description=FullFinetuneParams.__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    args, _ = parser.parse_known_args()
-    parser.log_args(args)
    args = vars(args)
-
    recipe_params = FullFinetuneParams(**args)
 
    # Env variables set by torch run; only need to initialize process group

@@ -383,6 +420,7 @@
    recipe = FullFinetuneRecipe(params=recipe_params)
    recipe.setup(params=recipe_params)
    recipe.train()
+    recipe.cleanup()
 
 
 if __name__ == "__main__":
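
The recipe-state bookkeeping added in setup() and train() is plain arithmetic; the sketch below is a standalone illustration (the compute_steps_per_epoch helper name and the batch count are made up, and the numbers mirror the resume test further down: 2 steps per epoch, resuming after 3 epochs).

# Sketch of the recipe-state arithmetic from setup(): the dataloader length is
# optionally capped by max_steps_per_epoch, and on resume total_training_steps
# restarts from epochs_run * steps_per_epoch.
def compute_steps_per_epoch(num_batches, max_steps_per_epoch=None):
    if max_steps_per_epoch and max_steps_per_epoch < num_batches:
        return max_steps_per_epoch
    return num_batches

steps_per_epoch = compute_steps_per_epoch(num_batches=26, max_steps_per_epoch=2)
epochs_run = 3  # e.g. resuming from the checkpoint written after epoch 3
total_training_steps = epochs_run * steps_per_epoch
assert total_training_steps == 6  # the value asserted in test_training_state_on_resume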

recipes/params.py (+1)

@@ -86,6 +86,7 @@ class FullFinetuneParams:
     output_dir: str = "/tmp/full_finetune_output"
     metric_logger_type: str = "disk"
     project: Optional[str] = None
+    log_every_n_steps: Optional[int] = None
 
     def __post_init__(self):
         for param in fields(self):
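
The new field defaults to None, which the recipe's __init__ treats as "log every step" before the modulo check in train(). A minimal standalone sketch of that behaviour (the should_log helper is illustrative, not part of the torchtune API):

# None -> fall back to logging on every step; otherwise log every N-th step.
def should_log(total_training_steps, log_every_n_steps=None):
    effective = log_every_n_steps if log_every_n_steps else 1
    return total_training_steps % effective == 0

assert all(should_log(step) for step in range(1, 5))  # default: every step
assert [s for s in range(1, 7) if should_log(s, log_every_n_steps=3)] == [3, 6]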

recipes/tests/test_finetune_llm.py (+109 -1)

@@ -5,6 +5,8 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
+import os
+import tempfile
 from typing import Dict, Optional
 
 import pytest

@@ -152,7 +154,45 @@ def test_finetune_errors(self, capsys, pytestconfig):
         ):
             finetune_llm.recipe(FullFinetuneParams(**kwargs_values))
 
-    def test_finetune_llm_loss_refactored(self, capsys, pytestconfig):
+
+class TestFullFinetuneRecipe:
+    def _fetch_loss_values(self, output) -> Dict[str, float]:
+        lines = output.splitlines()
+        loss_values = {}
+        for line in lines:
+            if "Loss:" in line:
+                splits = line.split("Loss:")
+                loss_value = float(splits[1].split(":")[0])
+                loss_values[splits[0]] = loss_value
+        return loss_values
+
+    def _fetch_expected_loss_values(self, ckpt) -> Dict[str, float]:
+        small_test_ckpt_loss_values = {
+            "1|1|": 10.5074,
+            "1|2|": 10.5563,
+            "2|1|": 10.5152,
+            "2|2|": 10.4851,
+        }
+        llama2_7b_ckpt_loss_values = {
+            "1|1|": 1.1333,
+            "1|2|": 1.1199,
+            "2|1|": 1.2614,
+            "2|2|": 0.9486,
+        }
+        if ckpt == "small_test_ckpt":
+            return small_test_ckpt_loss_values
+        if ckpt == "llama2_7b":
+            return llama2_7b_ckpt_loss_values
+        raise ValueError(f"Unknown ckpt {ckpt}")
+
+    def _fetch_ckpt_model_path(self, ckpt) -> str:
+        if ckpt == "small_test_ckpt":
+            return "/tmp/test-artifacts/small-ckpt-01242024"
+        if ckpt == "llama2_7b":
+            return "/tmp/test-artifacts/llama2-7b-01242024"
+        raise ValueError(f"Unknown ckpt {ckpt}")
+
+    def test_loss(self, capsys, pytestconfig):
         large_scale = pytestconfig.getoption("--large-scale")
         ckpt = "llama2_7b" if large_scale else "small_test_ckpt"
         expected_loss_values = self._fetch_expected_loss_values(ckpt)

@@ -195,3 +235,71 @@ def test_finetune_llm_loss_refactored(self, capsys, pytestconfig):
             assert key in expected_loss_values
             expected_loss_value = expected_loss_values[key]
             assert value == pytest.approx(expected_loss_value, abs=0.001)
+
+    def test_training_state_on_resume(self):
+        """
+        Test whether the recipe state is correctly updated on resume. Since this
+        is model agnostic, we should run this on the small model only. The test
+        consists of two stages:
+            - Train a model for 4 epochs
+            - Resume training after epoch 3 and check training state.
+        """
+
+        model_ckpt = "small_test_ckpt"
+        expected_loss_values = self._fetch_expected_loss_values(model_ckpt)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+
+            kwargs_values = {
+                "dataset": "alpaca",
+                "seed": 9,
+                "shuffle": True,
+                "model": model_ckpt,
+                "model_checkpoint": self._fetch_ckpt_model_path(model_ckpt),
+                "tokenizer": "llama2_tokenizer",
+                "tokenizer_checkpoint": "/tmp/test-artifacts/tokenizer.model",
+                "epochs": 4,
+                "max_steps_per_epoch": 2,
+                "output_dir": tmpdirname,
+                "device": "cpu",
+                "resume_from_checkpoint": False,
+                "enable_fsdp": False,
+            }
+
+            recipe_params = FullFinetuneParams(**kwargs_values)
+
+            recipe = FullFinetuneRecipe(recipe_params)
+            recipe.setup(params=recipe_params)
+            recipe.train()
+            recipe.cleanup()
+
+            # In the new run, remove seed and max_steps_per_epoch and
+            # check if these are correctly inferred from the checkpoint
+            # Note this will raise some warnings in the logs, but is a
+            # stronger test
+            kwargs_values_resume = {
+                "dataset": "alpaca",
+                "shuffle": True,
+                "model": model_ckpt,
+                "model_checkpoint": os.path.join(tmpdirname, "model_2.ckpt"),
+                "tokenizer": "llama2_tokenizer",
+                "tokenizer_checkpoint": "/tmp/test-artifacts/tokenizer.model",
+                "epochs": 4,
+                "output_dir": tmpdirname,
+                "device": "cpu",
+                "resume_from_checkpoint": True,  # set to True to resume
+                "enable_fsdp": False,
+            }
+
+            recipe_params = FullFinetuneParams(**kwargs_values_resume)
+
+            recipe = FullFinetuneRecipe(recipe_params)
+            recipe.setup(params=recipe_params)
+
+            assert recipe.epochs_run == 3
+            assert recipe.seed == kwargs_values["seed"]
+            assert recipe.max_steps_per_epoch == kwargs_values["max_steps_per_epoch"]
+            assert recipe.total_epochs == kwargs_values["epochs"]
+            assert recipe.total_training_steps == (
+                3 * kwargs_values["max_steps_per_epoch"]
+            )
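
The _fetch_loss_values helper above depends on the pbar description format set in train(), i.e. "{epoch}|{step}|Loss: {value}". A standalone sketch of that parsing on captured output (the sample strings below are made up for illustration):

# Parse "epoch|step|Loss: value" lines the same way _fetch_loss_values does.
output = "1|1|Loss: 10.5074\n1|2|Loss: 10.5563\nsome unrelated log line\n"
loss_values = {}
for line in output.splitlines():
    if "Loss:" in line:
        prefix, rest = line.split("Loss:")
        loss_values[prefix] = float(rest.split(":")[0])

assert loss_values == {"1|1|": 10.5074, "1|2|": 10.5563}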

torchtune/utils/metric_logging.py (-1)

@@ -155,7 +155,6 @@ def __init__(
                 "``wandb`` package not found. Please install wandb using `pip install wandb` to use WandBLogger."
                 "Alternatively, use the ``StdoutLogger``, which can be specified by setting metric_logger_type='stdout'."
             ) from e
-
         self._wandb = wandb
         self._wandb.init(
             project=project,
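
For context, a minimal sketch of the logger lifecycle this commit wires into the recipe: create via utils.get_metric_logger, emit per-step metrics with log_dict, close in cleanup(). It uses metric_logger_type="stdout" so the optional wandb dependency mentioned in the error message is not required; the import path is an assumption based on the recipe's use of utils.get_metric_logger.

# Assumed import path; the recipe itself calls utils.get_metric_logger.
from torchtune import utils

logger = utils.get_metric_logger(
    metric_logger_type="stdout",  # "disk" and "wandb" are the other types referenced in this commit
    project=None,
    log_dir="/tmp/full_finetune_output",
)
for step in range(1, 4):
    # Same call shape as the recipe's per-step logging in train()
    logger.log_dict({"loss": 1.0 / step, "lr": 2e-5}, step=step)
logger.close()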
