r-three · blester125 · May 13, 2024 · craffel · Jun 12, 2024 · craffel
diff --git a/git_theta/checkpoints/__init__.py b/git_theta/checkpoints/__init__.py
@@ -4,4 +4,5 @@
     Checkpoint,
     get_checkpoint_handler,
     get_checkpoint_handler_name,
+    sniff_checkpoint,
 )
diff --git a/git_theta/checkpoints/base.py b/git_theta/checkpoints/base.py
@@ -157,3 +157,21 @@ def get_checkpoint_handler(checkpoint_type: Optional[str] = None) -> Checkpoint:
     checkpoint_type = get_checkpoint_handler_name(checkpoint_type)
     discovered_plugins = entry_points(group="git_theta.plugins.checkpoints")
     return discovered_plugins[checkpoint_type].load()
+
+
+def sniff_checkpoint(checkpoint_path: str) -> str:
+    """Infer the checkpoint type of a file by asking each sniffer plugin.
+
+    Every entry point registered under the
+    ``git_theta.plugins.checkpoint.sniffers`` group is tried in discovery
+    order. The name of the first entry point whose sniffer returns a truthy
+    value for ``checkpoint_path`` is returned.
+
+    Parameters
+    ----------
+    checkpoint_path
+        Path of the checkpoint file; passed unchanged to each sniffer.
+
+    Returns
+    -------
+    str
+        The entry point name of the first sniffer that accepted the path.
+
+    Raises
+    ------
+    ValueError
+        If no registered sniffer recognizes ``checkpoint_path``.
+    """
+    logger = logging.getLogger("git_theta")
+    logger.debug(
+        f"Sniffing {checkpoint_path} to infer which deep learning framework it is."
+    )
+    for entry_point in entry_points(group="git_theta.plugins.checkpoint.sniffers"):
+        ckpt_type = entry_point.name
+        logger.debug(f"Checking if {checkpoint_path} is a {ckpt_type} checkpoint.")
+        # Load each sniffer only when it is about to be called, so a match
+        # does not pay for importing the plugins that come after it.
+        ckpt_sniffer = entry_point.load()
+        if ckpt_sniffer(checkpoint_path):
+            logger.debug(
+                f"Determined that {checkpoint_path} is a {ckpt_type} checkpoint."
+            )
+            return ckpt_type
+    raise ValueError(f"Couldn't determine checkpoint type for {checkpoint_path}")
diff --git a/git_theta/scripts/git_theta_filter.py b/git_theta/scripts/git_theta_filter.py
@@ -36,6 +36,8 @@ def run_clean(args):
     """
     logger = logging.getLogger("git_theta")
     logger.debug(f"Running clean filter on {args.file}")
+    if EnvVarConstants.CHECKPOINT_TYPE == "sniff":
+        # Replace the "sniff" placeholder with the type inferred from the path
+        # of the file being filtered.
+        EnvVarConstants.CHECKPOINT_TYPE = checkpoints.sniff_checkpoint(args.file)
     repo = git_utils.get_git_repo()
     checkpoint_handler = checkpoints.get_checkpoint_handler()
     if EnvVarConstants.LOW_MEMORY:
@@ -74,6 +76,8 @@ def run_smudge(args):
     """
     logger = logging.getLogger("git_theta")
     logger.debug(f"Running smudge filter on {args.file}")
+    if EnvVarConstants.CHECKPOINT_TYPE == "sniff":
+        # Replace the "sniff" placeholder with the type inferred from the path
+        # of the file being filtered.
+        EnvVarConstants.CHECKPOINT_TYPE = checkpoints.sniff_checkpoint(args.file)
 
     repo = git_utils.get_git_repo()
     curr_metadata = metadata.Metadata.from_file(sys.stdin)

diff --git a/git_theta/utils.py b/git_theta/utils.py
@@ -71,7 +71,7 @@ def __get__(self, obj, objtype=None):
 
 
 class EnvVarConstants:
-    CHECKPOINT_TYPE = EnvVar(name="GIT_THETA_CHECKPOINT_TYPE", default="pytorch")
+    CHECKPOINT_TYPE = EnvVar(name="GIT_THETA_CHECKPOINT_TYPE", default="sniff")
     UPDATE_TYPE = EnvVar(name="GIT_THETA_UPDATE_TYPE", default="dense")
     UPDATE_DATA_PATH = EnvVar(name="GIT_THETA_UPDATE_DATA_PATH", default="")
     PARAMETER_ATOL = EnvVar(name="GIT_THETA_PARAMETER_ATOL", default=1e-8)

diff --git a/plugins/checkpoints/flax/git_theta_checkpoints_flax/__init__.py b/plugins/checkpoints/flax/git_theta_checkpoints_flax/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.2.0"
diff --git a/git_theta/checkpoints/flax_checkpoint.py → ...git_theta_checkpoints_flax/checkpoints.py b/git_theta/checkpoints/flax_checkpoint.py → ...git_theta_checkpoints_flax/checkpoints.py
diff --git a/plugins/checkpoints/flax/git_theta_checkpoints_flax/sniffer.py b/plugins/checkpoints/flax/git_theta_checkpoints_flax/sniffer.py
@@ -0,0 +1,9 @@
+"""Infer if a checkpoint is flax based.
+
+We put this in a different file to avoid importing dl frameworks for file sniffing.
+"""
+
+
+def flax_sniffer(checkpoint_path: str) -> bool:
+    """Report whether ``checkpoint_path`` names a flax checkpoint.
+
+    Only the name is inspected: the path must end in ``.flax``.
+    """
+    # TODO: Check if the actual value is msgpack based on magic numbers?
+    expected_suffix = ".flax"
+    return checkpoint_path.endswith(expected_suffix)
diff --git a/plugins/checkpoints/flax/setup.py b/plugins/checkpoints/flax/setup.py
@@ -0,0 +1,65 @@
+"""Plugin to support the flax checkpoint format."""
+
+import ast
+import os
+
+from setuptools import setup
+
+
+def get_version(file_name: str, version_variable: str = "__version__") -> str:
+    """Find the version by walking the AST to avoid duplication.
+
+    Parameters
+    ----------
+    file_name : str
+        The file we are parsing to get the version string from.
+    version_variable : str
+        The variable name that holds the version string.
+
+    Raises
+    ------
+    ValueError
+        If there was no assignment of a string literal to version_variable
+        in file_name.
+
+    Returns
+    -------
+    version_string : str
+        The version string parsed from file_name.
+    """
+    with open(file_name, encoding="utf-8") as f:
+        tree = ast.parse(f.read())
+    # Look at all assignment nodes that happen in the ast. If the variable
+    # name matches the given parameter, grab the value (which will be
+    # the version string we are looking for).
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Assign):
+            continue
+        # Targets can be attributes, subscripts or tuples, none of which have
+        # an ``id``; only a plain name can be the version variable.
+        assigns_version = any(
+            isinstance(target, ast.Name) and target.id == version_variable
+            for target in node.targets
+        )
+        if not assigns_version:
+            continue
+        # Read the literal through ``ast.Constant.value`` rather than the
+        # deprecated ``.s`` alias.
+        value = node.value
+        if isinstance(value, ast.Constant) and isinstance(value.value, str):
+            return value.value
+    raise ValueError(
+        f"Could not find an assignment to {version_variable} " f"within '{file_name}'"
+    )
+
+
+# Packages that must be installed alongside this plugin.
+_INSTALL_REQUIRES = [
+    # "git_theta",
+    "flax",
+    "jax",
+]
+
+# Entry points advertised by this plugin, keyed by entry point group: one
+# group for the checkpoint class and one for the checkpoint sniffer.
+_ENTRY_POINTS = {
+    "git_theta.plugins.checkpoints": [
+        "flax = git_theta_checkpoints_flax.checkpoints:FlaxCheckpoint",
+        "flax-checkpoint = git_theta_checkpoints_flax.checkpoints:FlaxCheckpoint",
+    ],
+    "git_theta.plugins.checkpoint.sniffers": [
+        "flax = git_theta_checkpoints_flax.sniffer:flax_sniffer",
+    ],
+}
+
+setup(
+    name="git_theta_checkpoints_flax",
+    description="Plugin to support the flax checkpoint format.",
+    install_requires=_INSTALL_REQUIRES,
+    # The version is read from the package so it is only written in one place.
+    version=get_version("git_theta_checkpoints_flax/__init__.py"),
+    packages=["git_theta_checkpoints_flax"],
+    author="Brian Lester",
+    entry_points=_ENTRY_POINTS,
+)
diff --git a/plugins/checkpoints/pytorch/git_theta_checkpoints_pytorch/__init__.py b/plugins/checkpoints/pytorch/git_theta_checkpoints_pytorch/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.2.0"
diff --git a/...ta/checkpoints/pickled_dict_checkpoint.py → ..._theta_checkpoints_pytorch/checkpoints.py b/...ta/checkpoints/pickled_dict_checkpoint.py → ..._theta_checkpoints_pytorch/checkpoints.py
diff --git a/plugins/checkpoints/pytorch/git_theta_checkpoints_pytorch/sniffer.py b/plugins/checkpoints/pytorch/git_theta_checkpoints_pytorch/sniffer.py
@@ -0,0 +1,15 @@
+"""Infer if a checkpoint is pytorch based.
+
+We put this in a different file to avoid importing dl frameworks for file sniffing.
+"""
+
+import re
+
+
+def pytorch_sniffer(checkpoint_path: str) -> bool:
+    """Guess from its name whether ``checkpoint_path`` is a pytorch checkpoint.
+
+    Parameters
+    ----------
+    checkpoint_path : str
+        Path of the checkpoint file. Only the name is inspected; the file is
+        never opened.
+
+    Returns
+    -------
+    bool
+        True if the file is named ``pytorch_model.bin`` (in any directory) or
+        the path ends in ``.pt``, ``.pth`` or ``.pyt``.
+    """
+    import os
+
+    # Many checkpoints on HuggingFace Hub are named this. Compare only the
+    # final path component so the file is still recognized inside a directory.
+    if os.path.basename(checkpoint_path) == "pytorch_model.bin":
+        return True
+    # ``.pt`` and ``.pth`` are the usual pytorch extensions; ``.pyt`` is also
+    # accepted.
+    return re.search(r"\.(?:py?t|pth)$", checkpoint_path) is not None
diff --git a/plugins/checkpoints/pytorch/setup.py b/plugins/checkpoints/pytorch/setup.py
@@ -0,0 +1,64 @@
+"""Plugin to support the pytorch checkpoint format."""
+
+import ast
+import os
+
+from setuptools import setup
+
+
+def get_version(file_name: str, version_variable: str = "__version__") -> str:
+    """Find the version by walking the AST to avoid duplication.
+
+    Parameters
+    ----------
+    file_name : str
+        The file we are parsing to get the version string from.
+    version_variable : str
+        The variable name that holds the version string.
+
+    Raises
+    ------
+    ValueError
+        If there was no assignment of a string literal to version_variable
+        in file_name.
+
+    Returns
+    -------
+    version_string : str
+        The version string parsed from file_name.
+    """
+    with open(file_name, encoding="utf-8") as f:
+        tree = ast.parse(f.read())
+    # Look at all assignment nodes that happen in the ast. If the variable
+    # name matches the given parameter, grab the value (which will be
+    # the version string we are looking for).
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Assign):
+            continue
+        # Targets can be attributes, subscripts or tuples, none of which have
+        # an ``id``; only a plain name can be the version variable.
+        assigns_version = any(
+            isinstance(target, ast.Name) and target.id == version_variable
+            for target in node.targets
+        )
+        if not assigns_version:
+            continue
+        # Read the literal through ``ast.Constant.value`` rather than the
+        # deprecated ``.s`` alias.
+        value = node.value
+        if isinstance(value, ast.Constant) and isinstance(value.value, str):
+            return value.value
+    raise ValueError(
+        f"Could not find an assignment to {version_variable} " f"within '{file_name}'"
+    )
+
+
+# Packages that must be installed alongside this plugin.
+_INSTALL_REQUIRES = [
+    # "git_theta",
+    "torch",
+]
+
+# Entry points advertised by this plugin, keyed by entry point group: one
+# group for the checkpoint class and one for the checkpoint sniffer.
+_ENTRY_POINTS = {
+    "git_theta.plugins.checkpoints": [
+        "pytorch = git_theta_checkpoints_pytorch.checkpoints:PickledDictCheckpoint",
+        "pickled-dict = git_theta_checkpoints_pytorch.checkpoints:PickledDictCheckpoint",
+    ],
+    "git_theta.plugins.checkpoint.sniffers": [
+        "pytorch = git_theta_checkpoints_pytorch.sniffer:pytorch_sniffer",
+    ],
+}
+
+setup(
+    name="git_theta_checkpoints_pytorch",
+    description="Plugin to support the pytorch checkpoint format.",
+    install_requires=_INSTALL_REQUIRES,
+    # The version is read from the package so it is only written in one place.
+    version=get_version("git_theta_checkpoints_pytorch/__init__.py"),
+    packages=["git_theta_checkpoints_pytorch"],
+    author="Brian Lester",
+    entry_points=_ENTRY_POINTS,
+)
diff --git a/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/__init__.py b/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.2.0"
diff --git a/...eta/checkpoints/safetensors_checkpoint.py → ...ta_checkpoints_safetensors/checkpoints.py b/...eta/checkpoints/safetensors_checkpoint.py → ...ta_checkpoints_safetensors/checkpoints.py
diff --git a/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/flycheck_checkpoints.py b/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/flycheck_checkpoints.py
@@ -0,0 +1,44 @@
+"""Checkpoint using the HF safetensors format.
+
+safetensors has the ability to write model checkpoint from "dl-native" -> "safetensors"
+and read "safetensors" -> any "dl-native" framework, not just the one that was
+used to write it. Therefore, we read/write with their numpy API.
+"""
+
+import safetensors.numpy
+from file_or_name import file_or_name
+
+from git_theta.checkpoints import Checkpoint
+
+
+# TODO(bdlester): Can we leverage the lazying loading ability to make things faster?
+class SafeTensorsCheckpoint(Checkpoint):
+    """Checkpoint backend for the safetensors format.
+
+    See https://github.com/huggingface/safetensors. Both reading and writing
+    go through the ``safetensors.numpy`` API.
+    """
+
+    name: str = "safetensors"
+
+    @classmethod
+    @file_or_name(checkpoint_path="rb")
+    def load(cls, checkpoint_path: str):
+        """Deserialize the bytes read from ``checkpoint_path``."""
+        # The numpy flavour is used on purpose: whichever dl framework the
+        # user works with downstream, we only want numpy arrays back.
+        serialized = checkpoint_path.read()
+        return safetensors.numpy.load(serialized)
+
+    @file_or_name(checkpoint_path="wb")
+    def save(self, checkpoint_path: str):
+        """Serialize this checkpoint and write it to ``checkpoint_path``."""
+        # git theta uses numpy internally, so the numpy api is used to save,
+        # regardless of the framework that originally wrote the checkpoint.
+        serialized = safetensors.numpy.save(self.to_framework())
+        checkpoint_path.write(serialized)
+
+    def to_framework(self):
+        """Return the checkpoint itself; no conversion is applied."""
+        return self
+
+    @classmethod
+    def from_framework(cls, model_dict):
+        """Build a checkpoint by passing ``model_dict`` to the constructor."""
+        return cls(model_dict)
+
+
+def safetensors_sniffer(checkpoint_path: str) -> bool:
+    """Report whether ``checkpoint_path`` names a safetensors checkpoint.
+
+    Only the name is inspected: the path must end in ``.safetensors``.
+    """
+    expected_suffix = ".safetensors"
+    return checkpoint_path.endswith(expected_suffix)
diff --git a/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/sniffer.py b/plugins/checkpoints/safetensors/git_theta_checkpoints_safetensors/sniffer.py
@@ -0,0 +1,8 @@
+"""Infer if a checkpoint is safetensors based.
+
+We put this in a different file to avoid importing dl frameworks for file sniffing.
+"""
+
+
+def safetensors_sniffer(checkpoint_path: str) -> bool:
+    """Report whether ``checkpoint_path`` names a safetensors checkpoint.
+
+    Only the name is inspected: the path must end in ``.safetensors``.
+    """
+    expected_suffix = ".safetensors"
+    return checkpoint_path.endswith(expected_suffix)
diff --git a/plugins/checkpoints/safetensors/setup.py b/plugins/checkpoints/safetensors/setup.py
@@ -0,0 +1,64 @@
+"""Plugin to support the safetensors checkpoint format."""
+
+import ast
+import os
+
+from setuptools import setup
+
+
+def get_version(file_name: str, version_variable: str = "__version__") -> str:
+    """Find the version by walking the AST to avoid duplication.
+
+    Parameters
+    ----------
+    file_name : str
+        The file we are parsing to get the version string from.
+    version_variable : str
+        The variable name that holds the version string.
+
+    Raises
+    ------
+    ValueError
+        If there was no assignment of a string literal to version_variable
+        in file_name.
+
+    Returns
+    -------
+    version_string : str
+        The version string parsed from file_name.
+    """
+    with open(file_name, encoding="utf-8") as f:
+        tree = ast.parse(f.read())
+    # Look at all assignment nodes that happen in the ast. If the variable
+    # name matches the given parameter, grab the value (which will be
+    # the version string we are looking for).
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Assign):
+            continue
+        # Targets can be attributes, subscripts or tuples, none of which have
+        # an ``id``; only a plain name can be the version variable.
+        assigns_version = any(
+            isinstance(target, ast.Name) and target.id == version_variable
+            for target in node.targets
+        )
+        if not assigns_version:
+            continue
+        # Read the literal through ``ast.Constant.value`` rather than the
+        # deprecated ``.s`` alias.
+        value = node.value
+        if isinstance(value, ast.Constant) and isinstance(value.value, str):
+            return value.value
+    raise ValueError(
+        f"Could not find an assignment to {version_variable} " f"within '{file_name}'"
+    )
+
+
+# Packages that must be installed alongside this plugin.
+_INSTALL_REQUIRES = [
+    # "git_theta",
+    "safetensors",
+]
+
+# Entry points advertised by this plugin, keyed by entry point group: one
+# group for the checkpoint class and one for the checkpoint sniffer.
+_ENTRY_POINTS = {
+    "git_theta.plugins.checkpoints": [
+        "safetensors = git_theta_checkpoints_safetensors.checkpoints:SafeTensorsCheckpoint",
+        "safetensors-checkpoint = git_theta_checkpoints_safetensors.checkpoints:SafeTensorsCheckpoint",
+    ],
+    "git_theta.plugins.checkpoint.sniffers": [
+        "safetensors = git_theta_checkpoints_safetensors.sniffer:safetensors_sniffer",
+    ],
+}
+
+setup(
+    name="git_theta_checkpoints_safetensors",
+    description="Plugin to support the safetensors checkpoint format.",
+    install_requires=_INSTALL_REQUIRES,
+    # The version is read from the package so it is only written in one place.
+    version=get_version("git_theta_checkpoints_safetensors/__init__.py"),
+    packages=["git_theta_checkpoints_safetensors"],
+    author="Brian Lester",
+    entry_points=_ENTRY_POINTS,
+)
diff --git a/plugins/checkpoints/tensorflow/git_theta_checkpoints_tensorflow/__init__.py b/plugins/checkpoints/tensorflow/git_theta_checkpoints_tensorflow/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.2.0"
diff --git a/...heta/checkpoints/tensorflow_checkpoint.py → ...eta_checkpoints_tensorflow/checkpoints.py b/...heta/checkpoints/tensorflow_checkpoint.py → ...eta_checkpoints_tensorflow/checkpoints.py
diff --git a/plugins/checkpoints/tensorflow/git_theta_checkpoints_tensorflow/sniffer.py b/plugins/checkpoints/tensorflow/git_theta_checkpoints_tensorflow/sniffer.py
@@ -0,0 +1,14 @@
+"""Infer if a checkpoint is tensorflow based.
+
+We put this in a different file to avoid importing dl frameworks for file sniffing.
+"""
+
+
+def tensorflow_sniffer(checkpoint_path: str) -> bool:
+    """Report whether ``checkpoint_path`` names a tensorflow checkpoint.
+
+    Only the name is inspected: the path must end in ``.tf``.
+    """
+    expected_suffix = ".tf"
+    return checkpoint_path.endswith(expected_suffix)
+
+
+# TODO: Add support for detecting saved models.
+def saved_model_sniffer(checkpoint_path: str) -> bool:
+    """Placeholder sniffer for saved models.
+
+    ``checkpoint_path`` is ignored and the result is always ``False``.
+    """
+    # We don't support saved models yet.
+    is_saved_model = False
+    return is_saved_model