Skip to content

Commit

Permalink
feat: checksum model + fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dmyersturnbull committed Oct 24, 2021
1 parent e2b3eca commit 57504da
Show file tree
Hide file tree
Showing 26 changed files with 925 additions and 761 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and
[Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.15.1] - unreleased
## [0.16.0] - 2021-10-23

### Added

Expand All @@ -12,8 +12,14 @@ Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and

### Changed

- `Checksums`
- Simplified `.suffixes` (breaking change to an uncommon function)

### Fixed

- Duplicate IO methods
- Checksum path resolution

## [0.15.0] - 2021-10-16

### Added
Expand Down
2 changes: 2 additions & 0 deletions docs/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ There is a final type, defined to have no typing rules, that can be constructed
variant via :meth:`typeddfs._entries.TypedDfs.wrap` to give it the additional methods.

.. code-block:: python
from typeddfs import TypedDfs
MyDf = TypedDfs.typed("MyDf").build()
Expand Down Expand Up @@ -62,6 +63,7 @@ that the returned DataFrame might not conform to your requirements.
Call :meth:`typeddfs.abs_dfs.AbsDf.retype` at the end to reorganize and verify.

.. code-block:: python
from typeddfs import TypedDfs
MyDf = TypedDfs.typed("MyDf").require("valid").build()
Expand Down
236 changes: 141 additions & 95 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "typeddfs"
version = "0.15.0"
version = "0.16.0"
description = "Pandas DataFrame subclasses that enforce structure and can self-organize."
authors = ["Douglas Myers-Turnbull"]
maintainers = ["dmyersturnbull"]
Expand Down
117 changes: 94 additions & 23 deletions tests/test_checksums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,108 @@

import pytest

from typeddfs.df_errors import (
HashContradictsExistingError,
HashExistsError,
HashFilenameMissingError,
)
from typeddfs.utils.checksums import ChecksumMappingOpt, Checksums
from typeddfs.df_errors import HashExistsError, HashFilenameMissingError
from typeddfs.utils.checksums import ChecksumMapping, Checksums


class TestBuilders:
class TestChecksums:
def test_mapping(self):
    """A single-entry mapping supports equality, lookup, and hash-file line rendering."""
    target = Path("my_file.txt")
    hash_file = Path(".") / f".{Path('').name}"
    mapping = ChecksumMapping(hash_file, {target: "aabb"})
    assert mapping == mapping
    assert mapping[target] == "aabb"
    expected_line = "aabb *my_file.txt"
    assert mapping.line(target) == expected_line
    assert mapping.lines() == [expected_line]

def test_get_algorithm(self):
assert Checksums.get_algorithm("sha-256") == "sha256"
assert Checksums.resolve_algorithm("sha-256") == "sha256"

def test_update(self):
assert Checksums().get_updated_hashes({}, {}) == ChecksumMappingOpt({})
original = {Path("cat"): "0x5", "ok": "0x63"}
update = {"cat": None, "other": "wads"}
expected = {
Path("cat").resolve(): None,
Path("ok").resolve(): "0x63",
Path("other").resolve(): "wads",
home = Path(__file__).parent / "resources"
original = ChecksumMapping(
home / ".resources", {home / Path("cat"): "0x5", home / Path("ok"): "0x63"}
)
update = {home / "cat": None, home / "other": "wads"}
assert original.update(update).entries == {
home / "ok": "0x63",
home / "other": "wads",
}
assert Checksums().get_updated_hashes(original, update) == ChecksumMappingOpt(expected)
with pytest.raises(HashExistsError):
Checksums().get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=False)
assert Checksums().get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=None) == {
Path("x").resolve(): "5"
assert original.remove(home / "ok").entries == {home / "cat": "0x5"}
assert original.remove([home / "ok", home / "cat"]).entries == {}
with pytest.raises(HashFilenameMissingError):
original.remove("does not exist")
assert original.remove("does not exist", missing_ok=True).entries == {
home / "cat": "0x5",
home / "ok": "0x63",
}
assert original.append({home / "yay": "hi"}).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "0x63",
home / Path("yay"): "hi",
}
with pytest.raises(HashContradictsExistingError):
Checksums().get_updated_hashes({"x": "5"}, {"x": "4"}, overwrite=None)
assert original.update({home / "ok": "5"}, overwrite=True).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "5",
}
assert original.update({home / "ok": "0x63"}, overwrite=None).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "0x63",
}
with pytest.raises(HashExistsError):
original.update({home / "ok": "0x63"}, overwrite=False)
with pytest.raises(HashExistsError):
original.update({home / "ok": "5"}, overwrite=None)
with pytest.raises(HashFilenameMissingError):
Checksums().get_updated_hashes({}, {"x": "4"}, missing_ok=False)
original.update({home / "x": "4"}, missing_ok=False)
assert original.update(original.get) == original

def test_append(self):
    """append returns a new mapping with the extra entry, leaves the receiver unchanged, and rejects duplicates."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1", root / "y": "0x2"})
    appended = mapping.append({root / "z": "0x3"})
    assert appended.entries == {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    # The receiver must be untouched: append is non-mutating.
    assert mapping.entries == {root / "x": "0x1", root / "y": "0x2"}
    # Appending a path that is already present is an error.
    with pytest.raises(HashExistsError):
        mapping.append({root / "x": "0x1"})

def test_add(self):
    """`+` merges a mapping with a plain dict or another mapping; unresolvable keys raise ValueError."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1", root / "y": "0x2"})
    merged_entries = {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    # Adding a plain dict of path -> hash:
    assert (mapping + {root / "z": "0x3"}).entries == merged_entries
    # Adding another ChecksumMapping:
    other = ChecksumMapping.new(root, {root / "z": "0x3"})
    assert (mapping + other).entries == merged_entries
    # A bare (relative, non-Path) key cannot be merged.
    with pytest.raises(ValueError):
        mapping + dict(x="aaa")

def test_sub(self):
    """`-` removes entries whether given a dict, a set of paths, or a single path; absent paths are ignored."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(
        root / ".resources", {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    )
    remaining = {root / "x": "0x1", root / "y": "0x2"}
    assert (mapping - {root / "z": "0x3"}).entries == remaining
    assert (mapping - {root / "z"}).entries == remaining
    assert (mapping - root / "z").entries == remaining
    # Subtracting a path that was never present ("m") is silently tolerated.
    assert (mapping - {root / "z", root / "m"}).entries == remaining

def test_resolve(self):
    """resolve() canonicalizes the hash-file path and all entry paths; unresolve() round-trips back."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1"})
    canonical = mapping.resolve()
    assert canonical.hash_path == (root / ".resources").resolve()
    assert canonical.entries == {root.resolve() / "x": "0x1"}
    # unresolve() is the exact inverse of resolve() here.
    assert canonical.unresolve() == mapping

def test_guess_algorithm(self):
    """The hash algorithm is inferred from the hash-file suffix."""
    cases = [("my_file.sha256", "sha256"), ("my_file.sha1", "sha1")]
    for filename, algorithm in cases:
        assert Checksums.guess_algorithm(filename) == algorithm


if __name__ == "__main__":
Expand Down
16 changes: 8 additions & 8 deletions tests/test_fancy_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,11 @@ def test_file_hash(self):
# \n vs \r\n is an issue, so we can't check the exact hash
with tmpfile(".csv") as path:
df.write_file(path, file_hash=True)
hash_file = Checksums().get_hash_file(path)
hash_file = Checksums().get_filesum_of_file(path)
assert hash_file.exists()
got = Checksums().parse_hash_file_resolved(hash_file)
assert list(got.keys()) == [path.resolve()]
hit = got[path.resolve()]
got = Checksums().load_filesum_of_file(path)
assert got.file_path == path
hit = got.hash_value
assert len(hit) == 64
t.read_file(path, file_hash=True)
t.read_file(path, hex_hash=hit)
Expand All @@ -379,13 +379,13 @@ def test_dir_hash(self):
t = TypedDfBuilder("a").reserve("x", "y").build()
df = t.convert(pd.DataFrame([pd.Series(dict(x="cat", y="kitten"))]))
with tmpfile(".csv") as path:
hash_dir = Checksums().get_hash_dir(path)
hash_dir = Checksums().get_dirsum_of_file(path)
hash_dir.unlink(missing_ok=True)
df.write_file(path, dir_hash=True)
assert hash_dir.exists()
got = Checksums().parse_hash_file_resolved(hash_dir)
assert list(got.keys()) == [path.resolve()]
hit = got[path.resolve()]
got = Checksums().load_dirsum_exact(hash_dir)
assert list(got.keys()) == [path]
hit = got[path]
assert len(hit) == 64
t.read_file(path, dir_hash=True)
t.read_file(path, hex_hash=hit)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def test_strip_control_chars(self):
assert Utils.strip_control_chars("ℶℶ\u202Cℶℶ") == "ℶℶℶℶ"
assert Utils.strip_control_chars("\u202C") == ""

def test_dots_and_dicts(self):
    """dict_to_dots flattens nested keys with '.' separators, and dots_to_dict round-trips back."""
    nested = {"abc": {"xyz": "123"}, "zzz": ["456", "789"]}
    flattened = Utils.dict_to_dots(nested)
    assert flattened == {"abc.xyz": "123", "zzz": ["456", "789"]}
    assert Utils.dots_to_dict(flattened) == nested


if __name__ == "__main__":
pytest.main()
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ commands =
poetry run pre-commit run check-toml
poetry run pre-commit run check-yaml
poetry run pre-commit run check-json
poetry run pytest --cov-report term-missing --cov=typeddfs tests/
poetry run pytest -vv --cov-report term-missing --cov=typeddfs tests/
poetry run bandit -r typeddfs
poetry run bandit -r tests --skip B101
- poetry run flake8 typeddfs
Expand Down
2 changes: 2 additions & 0 deletions typeddfs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from typeddfs._entries import (
AffinityMatrixDf,
BaseDf,
ChecksumFile,
ChecksumMapping,
Checksums,
CompressionFormat,
FileFormat,
Expand Down
6 changes: 5 additions & 1 deletion typeddfs/_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from typeddfs.typed_dfs import TypedDf
from typeddfs.untyped_dfs import UntypedDf
from typeddfs.utils import Utils
from typeddfs.utils.checksum_models import ChecksumFile, ChecksumMapping
from typeddfs.utils.checksums import Checksums

logger = logging.getLogger(Path(__file__).parent.name)
Expand Down Expand Up @@ -79,6 +80,7 @@ def example(cls) -> Type[TypedDf]:
"""
Creates a new example TypedDf subclass.
The class has:
- required index "key"
- required column "value"
- reserved column "note"
Expand Down Expand Up @@ -112,7 +114,7 @@ def wrap(cls, df: pd.DataFrame) -> FinalDf:
def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
"""
Creates a new type with flexible requirements.
The class will enforce contstraints and subclass :class:`typeddfs.typed_dfs.TypedDf`.
The class will enforce constraints and subclass :class:`typeddfs.typed_dfs.TypedDf`.
Args:
name: The name that will be used for the new class
Expand Down Expand Up @@ -192,6 +194,8 @@ class New(UntypedDf):
__all__ = [
"TypedDfs",
"Utils",
"ChecksumFile",
"ChecksumMapping",
"Checksums",
"FrozeSet",
"FrozeDict",
Expand Down
5 changes: 4 additions & 1 deletion typeddfs/_mixins/_dataclass_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@ def _create_dataclass(cls, fields: Sequence[Tuple[str, Type[Any]]]) -> Type[Type
unsafe_hash=True,
order=cls.get_typing().order_dataclass,
)
_get_type = lambda: cls.__class__

def _get_type(x):
return cls.__class__

_get_type.__name__ = "get_df_type"
clazz.get_df_type = _get_type

Expand Down
4 changes: 0 additions & 4 deletions typeddfs/_mixins/_feather_parquet_hdf_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@


class _FeatherParquetHdfMixin:
"""
DataFrame that supports
"""

@classmethod
def read_feather(cls, *args, **kwargs) -> __qualname__:
# feather does not support MultiIndex, so reset index and use convert()
Expand Down
9 changes: 0 additions & 9 deletions typeddfs/_mixins/_flexwf_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,6 @@


class _FlexwfMixin:
@classmethod
def read_fwf(cls, *args, **kwargs) -> __qualname__:
try:
return cls._convert_typed(pd.read_fwf(*args, **kwargs))
except pd.errors.EmptyDataError:
# TODO: Figure out what EmptyDataError means
# df = pd.DataFrame()
return cls.new_df()

@classmethod
def read_flexwf(
cls,
Expand Down
4 changes: 2 additions & 2 deletions typeddfs/_mixins/_ini_like_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def read_toml(
Args:
path_or_buff: Path or buffer
aot: The name of the array of tables (i.e. ``[[ table ]]`)
aot: The name of the array of tables (i.e. ``[[ table ]]``)
If None, finds the unique outermost TOML key, implying ``aot_only``.
aot_only: Fail if any outermost keys other than the AOT are found
kwargs: Passed to ``Utils.read``
Expand Down Expand Up @@ -130,7 +130,7 @@ def to_toml(
Args:
path_or_buff: Path or buffer
aot: The name of the array of tables (i.e. ``[[ table ]]`)
aot: The name of the array of tables (i.e. ``[[ table ]]``)
comment: Comment line(s) to add at the top of the document
mode: 'w' (write) or 'a' (append)
kwargs: Passed to :meth:`typeddfs.utils.Utils.write`
Expand Down
Loading

0 comments on commit 57504da

Please sign in to comment.