
Commit d067dd8

Git Theta Low Memory Mode (#234)
The original file, piped into the filter from git, is stored in a temporary file before being read by the checkpoint plugin. When cleaning parameter groups, we free the memory for each group after it has been written to disk and converted to metadata. This is about as much as we can do until the DL frameworks' native checkpoint formats support streaming.
1 parent 019e124 commit d067dd8

8 files changed: +54, -11 lines

git_theta/__init__.py (+1)

@@ -7,6 +7,7 @@
     lsh,
     metadata,
     params,
+    scripts,
     theta,
     updates,
     utils,

git_theta/checkpoints/pickled_dict_checkpoint.py (+7 -2)

@@ -7,6 +7,7 @@
 from git_theta.checkpoints import Checkpoint


+# TODO: We should rename this back to being Torch-related, as we do things like check if values are torch.Tensors.
 class PickledDictCheckpoint(Checkpoint):
     """Class for wrapping pickled dict checkpoints, commonly used with PyTorch."""

@@ -29,7 +30,9 @@ def load(cls, checkpoint_path):
         if isinstance(checkpoint_path, io.IOBase):
             checkpoint_path = io.BytesIO(checkpoint_path.read())

-        model_dict = torch.load(checkpoint_path)
+        # Map all values to the CPU, as they may have been saved on the GPU and
+        # we don't know if the same GPU topology is available now.
+        model_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
         if not isinstance(model_dict, dict):
             raise ValueError("Supplied PyTorch checkpoint must be a dict.")
         if not all(isinstance(k, str) for k in model_dict.keys()):

@@ -40,7 +43,9 @@ def load(cls, checkpoint_path):

     @classmethod
     def from_framework(cls, model_dict):
-        return cls({k: v.cpu().numpy() for k, v in model_dict.items()})
+        # If tensors were saved with gradient tracking we need to detach them
+        # before converting them to numpy arrays.
+        return cls({k: v.cpu().detach().numpy() for k, v in model_dict.items()})

     def to_framework(self):
         return {k: torch.as_tensor(v) for k, v in self.items()}
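Both fixes guard against real PyTorch failure modes. Here is a minimal standalone sketch (not part of the commit) demonstrating them: loading a GPU-saved checkpoint on a CPU-only machine fails without map_location, and a tensor stored with requires_grad=True refuses .numpy() until it is detached.

import io

import torch

buf = io.BytesIO()
# Save a tensor that still has gradient tracking enabled.
torch.save({"w": torch.ones(3, requires_grad=True)}, buf)
buf.seek(0)

# map_location keeps the load working even if "w" had been saved on a GPU
# that is absent on the current machine.
state = torch.load(buf, map_location=torch.device("cpu"))

# state["w"].numpy() would raise "Can't call numpy() on Tensor that
# requires grad"; detaching first yields a plain numpy array.
array = state["w"].cpu().detach().numpy()
print(array)  # [1. 1. 1.]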

git_theta/filters.py (+20 -1)

@@ -29,9 +29,9 @@ def clean(
         update_serializer, EnvVarConstants.UPDATE_DATA_PATH
     )
     prev_metadata = metadata.Metadata.from_commit(repo, path, "HEAD").flatten()
+    logger = logging.getLogger("git_theta")

     async def _clean(param_keys, new_param):
-        logger = logging.getLogger("git_theta")
         logger.debug(f"Cleaning {'/'.join(param_keys)}")
         # Get the metadata from the previous version of the parameter
         param_metadata = prev_metadata.get(param_keys)

@@ -104,10 +104,29 @@ async def _clean(param_keys, new_param):
             theta_metadata=new_theta_metadata,
         )
         logger.debug(f"Finished Cleaning {'/'.join(param_keys)}")
+        del new_param
         return param_keys, new_param_metadata

     # Sort the keys so we don't get changing diffs based on serialization order.
     sorted_checkpoint = dict(sorted(checkpoint.flatten().items()))
+    if EnvVarConstants.LOW_MEMORY:
+        # Run one parameter at a time and delete the old values as you go.
+        # TODO: Is it possible/better to process the keys based on the size
+        # of the tensor and re-sort later? Then you could do things like delete
+        # all the small ones before you have to process the large one.
+        logger.warning(
+            "Running Git-Theta in Low Memory Mode; no concurrency will be used, and references to parameter weights will be freed after use."
+        )
+        meta = {}
+        for k in list(sorted_checkpoint.keys()):
+            # Pop the param from the dict; removing the dict's reference
+            # will allow the tensor to be gc'd.
+            v = sorted_checkpoint.pop(k)
+            param_name, param_meta = async_utils.run(_clean(k, v))
+            meta[param_name] = param_meta
+            # Drop our reference to the value to allow it to be gc'd.
+            del v
+        return metadata.Metadata(meta).unflatten()
     return metadata.Metadata(
         **async_utils.run(
             async_utils.run_map(
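The low-memory branch trades concurrency for a bounded working set. A simplified sketch (hypothetical names, not from the commit) of the pop-and-free pattern it relies on; in CPython, dropping the last reference is enough to free each tensor before the next one is processed:

def process_sequentially(checkpoint: dict, process) -> dict:
    results = {}
    # Iterate over a snapshot of the keys since the dict is mutated below.
    for key in list(checkpoint.keys()):
        value = checkpoint.pop(key)  # the dict no longer holds a reference
        results[key] = process(key, value)
        del value  # drop the local reference so the value can be gc'd
    return results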

git_theta/git_utils.py (+2 -5)

@@ -351,11 +351,8 @@ def get_file_version(repo, path: str, commit_hash_or_tag: Union[str, git.Commit]
         # GitPython can take commit sha1's or tags (or commit objects) here and
         # it gives the same results.
         tree = repo.commit(commit_hash_or_tag).tree
-        if path in tree:
-            return tree[path]
-        else:
-            return None
-    except git.BadName:
+        return tree[path]
+    except (git.BadName, KeyError):
         return None

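The refactor folds the membership test into the lookup: GitPython's Tree raises KeyError for a missing path, just as Repo.commit raises git.BadName for an unknown revision, so a single except clause now covers both. A hedged sketch of the same EAFP pattern, assuming it runs inside some git checkout:

import git

repo = git.Repo(".")
try:
    # Tree.__getitem__ raises KeyError when the path is absent.
    blob = repo.commit("HEAD").tree["no/such/file"]
except (git.BadName, KeyError):
    blob = None
print(blob)  # None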

git_theta/scripts/{git_theta.py → git_theta_cli.py}

File renamed without changes.

git_theta/scripts/git_theta_filter.py (+22 -2)

@@ -4,7 +4,6 @@
 import logging
 import os
 import sys
-import tempfile

 import git_theta
 from git_theta import checkpoints, git_utils, metadata

@@ -39,7 +38,28 @@ def run_clean(args):
     logger.debug(f"Running clean filter on {args.file}")
     repo = git_utils.get_git_repo()
     checkpoint_handler = checkpoints.get_checkpoint_handler()
-    model_checkpoint = checkpoint_handler.from_file(sys.stdin.buffer)
+    if EnvVarConstants.LOW_MEMORY:
+        logger.warning(
+            "Running Git-Theta in low memory mode. No concurrency is supported and the original checkpoint will be transiently stored in a temporary file."
+        )
+        temp_file = f".{args.file}-temp-checkpoint"
+        try:
+            # In some places there isn't enough space when writing to the
+            # default tempfile location.
+            logger.debug(f"Writing checkpoint to {temp_file}")
+            with open(temp_file, "w+b") as tmp:
+                tmp.write(sys.stdin.buffer.read())
+                logger.debug(f"Reading checkpoint from {temp_file}")
+                # We write and then seek instead of write,close,open because this
+                # was originally written to use the tempfile lib, but there were
+                # space issues. We keep that paradigm as we may switch back eventually.
+                tmp.seek(0)
+                model_checkpoint = checkpoint_handler.from_file(tmp)
+        finally:
+            # Make sure we always remove the temp checkpoint file.
+            os.remove(temp_file)
+    else:
+        model_checkpoint = checkpoint_handler.from_file(sys.stdin.buffer)
     new_metadata = clean(model_checkpoint, repo, args.file)
     new_metadata.write(sys.stdout)
     # If we had side-loaded information, write it out so we don't get false
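For contrast, a sketch of the tempfile-based approach the comment says was abandoned: tempfile.TemporaryFile places data in the system temp directory (often a small tmpfs), which is the likely source of the space issues with multi-gigabyte checkpoints.

import sys
import tempfile

# The file lives in the system temp dir and is deleted automatically when the
# context manager exits, but that dir may be too small for a large checkpoint.
with tempfile.TemporaryFile() as tmp:
    tmp.write(sys.stdin.buffer.read())
    tmp.seek(0)  # the same write-then-seek paradigm the code above keeps
    data = tmp.read()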

git_theta/utils.py (+1)

@@ -82,6 +82,7 @@ class EnvVarConstants:
     MAX_CONCURRENCY = EnvVar(name="GIT_THETA_MAX_CONCURRENCY", default=-1)
     MANUAL_MERGE = EnvVar(name="GIT_THETA_MANUAL_MERGE", default=False)
     LOG_LEVEL = EnvVar(name="GIT_THETA_LOG_LEVEL", default="DEBUG")
+    LOW_MEMORY = EnvVar(name="GIT_THETA_LOW_MEMORY", default=False)


 def flatten(
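With the flag registered alongside the other settings, low-memory mode is presumably toggled through the environment like any other GIT_THETA_* variable (the exact truthy values depend on how EnvVar parses them):

# Assumed usage; from a shell, for a single command:
#   GIT_THETA_LOW_MEMORY=1 git add model.pt
# Or from Python, before the filter machinery runs:
import os

os.environ["GIT_THETA_LOW_MEMORY"] = "1"  # read back via EnvVarConstants.LOW_MEMORY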

setup.py (+1 -1)

@@ -108,7 +108,7 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
     },
     entry_points={
         "console_scripts": [
-            "git-theta = git_theta.scripts.git_theta:main",
+            "git-theta = git_theta.scripts.git_theta_cli:main",
             "git-theta-filter = git_theta.scripts.git_theta_filter:main",
             "git-theta-merge = git_theta.scripts.git_theta_merge:main",
             "git-theta-diff = git_theta.scripts.git_theta_diff:main",
