Skip to content

Commit

Permalink
feat: checksum model + fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dmyersturnbull committed Oct 24, 2021
1 parent e2b3eca commit 57504da
Show file tree
Hide file tree
Showing 26 changed files with 925 additions and 761 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and
[Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.15.1] - unreleased
## [0.16.0] - 2021-10-23

### Added

Expand All @@ -12,8 +12,14 @@ Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and

### Changed

- `Checksums`
- Simplified `.suffixes` (breaking change to an uncommon function)

### Fixed

- Duplicate IO methods
- Checksum path resolution

## [0.15.0] - 2021-10-16

### Added
Expand Down
2 changes: 2 additions & 0 deletions docs/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ There is a final type, defined to have no typing rules, that can be constructed
variant via :meth:`typeddfs._entries.TypedDfs.wrap` to give it the additional methods.

.. code-block:: python
from typeddfs import TypedDfs
MyDf = TypedDfs.typed("MyDf").build()
Expand Down Expand Up @@ -62,6 +63,7 @@ that the returned DataFrame might not conform to your requirements.
Call :meth:`typeddfs.abs_dfs.AbsDf.retype` at the end to reorganize and verify.

.. code-block:: python
from typeddfs import TypedDfs
MyDf = TypedDfs.typed("MyDf").require("valid").build()
Expand Down
236 changes: 141 additions & 95 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "typeddfs"
version = "0.15.0"
version = "0.16.0"
description = "Pandas DataFrame subclasses that enforce structure and can self-organize."
authors = ["Douglas Myers-Turnbull"]
maintainers = ["dmyersturnbull"]
Expand Down
117 changes: 94 additions & 23 deletions tests/test_checksums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,108 @@

import pytest

from typeddfs.df_errors import (
HashContradictsExistingError,
HashExistsError,
HashFilenameMissingError,
)
from typeddfs.utils.checksums import ChecksumMappingOpt, Checksums
from typeddfs.df_errors import HashExistsError, HashFilenameMissingError
from typeddfs.utils.checksums import ChecksumMapping, Checksums


class TestBuilders:
class TestChecksums:
def test_mapping(self):
    """A single-entry mapping supports equality, lookup, and hash-file line rendering."""
    target = Path("my_file.txt")
    hash_file = Path(".") / f".{Path('').name}"
    mapping = ChecksumMapping(hash_file, {target: "aabb"})
    assert mapping == mapping
    assert mapping[target] == "aabb"
    expected_line = "aabb *my_file.txt"
    assert mapping.line(target) == expected_line
    assert mapping.lines() == [expected_line]

def test_get_algorithm(self):
assert Checksums.get_algorithm("sha-256") == "sha256"
assert Checksums.resolve_algorithm("sha-256") == "sha256"

def test_update(self):
assert Checksums().get_updated_hashes({}, {}) == ChecksumMappingOpt({})
original = {Path("cat"): "0x5", "ok": "0x63"}
update = {"cat": None, "other": "wads"}
expected = {
Path("cat").resolve(): None,
Path("ok").resolve(): "0x63",
Path("other").resolve(): "wads",
home = Path(__file__).parent / "resources"
original = ChecksumMapping(
home / ".resources", {home / Path("cat"): "0x5", home / Path("ok"): "0x63"}
)
update = {home / "cat": None, home / "other": "wads"}
assert original.update(update).entries == {
home / "ok": "0x63",
home / "other": "wads",
}
assert Checksums().get_updated_hashes(original, update) == ChecksumMappingOpt(expected)
with pytest.raises(HashExistsError):
Checksums().get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=False)
assert Checksums().get_updated_hashes({"x": "5"}, {"x": "5"}, overwrite=None) == {
Path("x").resolve(): "5"
assert original.remove(home / "ok").entries == {home / "cat": "0x5"}
assert original.remove([home / "ok", home / "cat"]).entries == {}
with pytest.raises(HashFilenameMissingError):
original.remove("does not exist")
assert original.remove("does not exist", missing_ok=True).entries == {
home / "cat": "0x5",
home / "ok": "0x63",
}
assert original.append({home / "yay": "hi"}).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "0x63",
home / Path("yay"): "hi",
}
with pytest.raises(HashContradictsExistingError):
Checksums().get_updated_hashes({"x": "5"}, {"x": "4"}, overwrite=None)
assert original.update({home / "ok": "5"}, overwrite=True).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "5",
}
assert original.update({home / "ok": "0x63"}, overwrite=None).entries == {
home / Path("cat"): "0x5",
home / Path("ok"): "0x63",
}
with pytest.raises(HashExistsError):
original.update({home / "ok": "0x63"}, overwrite=False)
with pytest.raises(HashExistsError):
original.update({home / "ok": "5"}, overwrite=None)
with pytest.raises(HashFilenameMissingError):
Checksums().get_updated_hashes({}, {"x": "4"}, missing_ok=False)
original.update({home / "x": "4"}, missing_ok=False)
assert original.update(original.get) == original

def test_append(self):
    """append returns a new mapping with the extra entry, leaves the receiver unchanged, and rejects duplicates."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1", root / "y": "0x2"})
    appended = mapping.append({root / "z": "0x3"})
    assert appended.entries == {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    # The receiver must be untouched: append is non-mutating.
    assert mapping.entries == {root / "x": "0x1", root / "y": "0x2"}
    # Appending a path that is already present is an error.
    with pytest.raises(HashExistsError):
        mapping.append({root / "x": "0x1"})

def test_add(self):
    """`+` merges a mapping with a plain dict or another mapping; unresolvable keys raise ValueError."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1", root / "y": "0x2"})
    merged_entries = {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    # Adding a plain dict of path -> hash:
    assert (mapping + {root / "z": "0x3"}).entries == merged_entries
    # Adding another ChecksumMapping:
    other = ChecksumMapping.new(root, {root / "z": "0x3"})
    assert (mapping + other).entries == merged_entries
    # A bare (relative, non-Path) key cannot be merged.
    with pytest.raises(ValueError):
        mapping + dict(x="aaa")

def test_sub(self):
    """`-` removes entries whether given a dict, a set of paths, or a single path; absent paths are ignored."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(
        root / ".resources", {root / "x": "0x1", root / "y": "0x2", root / "z": "0x3"}
    )
    remaining = {root / "x": "0x1", root / "y": "0x2"}
    assert (mapping - {root / "z": "0x3"}).entries == remaining
    assert (mapping - {root / "z"}).entries == remaining
    assert (mapping - root / "z").entries == remaining
    # Subtracting a path that was never present ("m") is silently tolerated.
    assert (mapping - {root / "z", root / "m"}).entries == remaining

def test_resolve(self):
    """resolve() canonicalizes the hash-file path and all entry paths; unresolve() round-trips back."""
    root = Path(__file__).parent / "resources"
    mapping = ChecksumMapping.new(root / ".resources", {root / "x": "0x1"})
    canonical = mapping.resolve()
    assert canonical.hash_path == (root / ".resources").resolve()
    assert canonical.entries == {root.resolve() / "x": "0x1"}
    # unresolve() is the exact inverse of resolve() here.
    assert canonical.unresolve() == mapping

def test_guess_algorithm(self):
    """The hash algorithm is inferred from the hash-file suffix."""
    cases = [("my_file.sha256", "sha256"), ("my_file.sha1", "sha1")]
    for filename, algorithm in cases:
        assert Checksums.guess_algorithm(filename) == algorithm


if __name__ == "__main__":
Expand Down
16 changes: 8 additions & 8 deletions tests/test_fancy_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,11 @@ def test_file_hash(self):
# \n vs \r\n is an issue, so we can't check the exact hash
with tmpfile(".csv") as path:
df.write_file(path, file_hash=True)
hash_file = Checksums().get_hash_file(path)
hash_file = Checksums().get_filesum_of_file(path)
assert hash_file.exists()
got = Checksums().parse_hash_file_resolved(hash_file)
assert list(got.keys()) == [path.resolve()]
hit = got[path.resolve()]
got = Checksums().load_filesum_of_file(path)
assert got.file_path == path
hit = got.hash_value
assert len(hit) == 64
t.read_file(path, file_hash=True)
t.read_file(path, hex_hash=hit)
Expand All @@ -379,13 +379,13 @@ def test_dir_hash(self):
t = TypedDfBuilder("a").reserve("x", "y").build()
df = t.convert(pd.DataFrame([pd.Series(dict(x="cat", y="kitten"))]))
with tmpfile(".csv") as path:
hash_dir = Checksums().get_hash_dir(path)
hash_dir = Checksums().get_dirsum_of_file(path)
hash_dir.unlink(missing_ok=True)
df.write_file(path, dir_hash=True)
assert hash_dir.exists()
got = Checksums().parse_hash_file_resolved(hash_dir)
assert list(got.keys()) == [path.resolve()]
hit = got[path.resolve()]
got = Checksums().load_dirsum_exact(hash_dir)
assert list(got.keys()) == [path]
hit = got[path]
assert len(hit) == 64
t.read_file(path, dir_hash=True)
t.read_file(path, hex_hash=hit)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def test_strip_control_chars(self):
assert Utils.strip_control_chars("ℶℶ\u202Cℶℶ") == "ℶℶℶℶ"
assert Utils.strip_control_chars("\u202C") == ""

def test_dots_and_dicts(self):
    """dict_to_dots flattens nested keys with '.' separators, and dots_to_dict round-trips back."""
    nested = {"abc": {"xyz": "123"}, "zzz": ["456", "789"]}
    flattened = Utils.dict_to_dots(nested)
    assert flattened == {"abc.xyz": "123", "zzz": ["456", "789"]}
    assert Utils.dots_to_dict(flattened) == nested


if __name__ == "__main__":
pytest.main()
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ commands =
poetry run pre-commit run check-toml
poetry run pre-commit run check-yaml
poetry run pre-commit run check-json
poetry run pytest --cov-report term-missing --cov=typeddfs tests/
poetry run pytest -vv --cov-report term-missing --cov=typeddfs tests/
poetry run bandit -r typeddfs
poetry run bandit -r tests --skip B101
- poetry run flake8 typeddfs
Expand Down
2 changes: 2 additions & 0 deletions typeddfs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from typeddfs._entries import (
AffinityMatrixDf,
BaseDf,
ChecksumFile,
ChecksumMapping,
Checksums,
CompressionFormat,
FileFormat,
Expand Down
6 changes: 5 additions & 1 deletion typeddfs/_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from typeddfs.typed_dfs import TypedDf
from typeddfs.untyped_dfs import UntypedDf
from typeddfs.utils import Utils
from typeddfs.utils.checksum_models import ChecksumFile, ChecksumMapping
from typeddfs.utils.checksums import Checksums

logger = logging.getLogger(Path(__file__).parent.name)
Expand Down Expand Up @@ -79,6 +80,7 @@ def example(cls) -> Type[TypedDf]:
"""
Creates a new example TypedDf subclass.
The class has:
- required index "key"
- required column "value"
- reserved column "note"
Expand Down Expand Up @@ -112,7 +114,7 @@ def wrap(cls, df: pd.DataFrame) -> FinalDf:
def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
"""
Creates a new type with flexible requirements.
The class will enforce contstraints and subclass :class:`typeddfs.typed_dfs.TypedDf`.
The class will enforce constraints and subclass :class:`typeddfs.typed_dfs.TypedDf`.
Args:
name: The name that will be used for the new class
Expand Down Expand Up @@ -192,6 +194,8 @@ class New(UntypedDf):
__all__ = [
"TypedDfs",
"Utils",
"ChecksumFile",
"ChecksumMapping",
"Checksums",
"FrozeSet",
"FrozeDict",
Expand Down
5 changes: 4 additions & 1 deletion typeddfs/_mixins/_dataclass_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@ def _create_dataclass(cls, fields: Sequence[Tuple[str, Type[Any]]]) -> Type[Type
unsafe_hash=True,
order=cls.get_typing().order_dataclass,
)
_get_type = lambda: cls.__class__

def _get_type(x):
return cls.__class__

_get_type.__name__ = "get_df_type"
clazz.get_df_type = _get_type

Expand Down
4 changes: 0 additions & 4 deletions typeddfs/_mixins/_feather_parquet_hdf_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@


class _FeatherParquetHdfMixin:
"""
DataFrame that supports
"""

@classmethod
def read_feather(cls, *args, **kwargs) -> __qualname__:
# feather does not support MultiIndex, so reset index and use convert()
Expand Down
9 changes: 0 additions & 9 deletions typeddfs/_mixins/_flexwf_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,6 @@


class _FlexwfMixin:
@classmethod
def read_fwf(cls, *args, **kwargs) -> __qualname__:
try:
return cls._convert_typed(pd.read_fwf(*args, **kwargs))
except pd.errors.EmptyDataError:
# TODO: Figure out what EmptyDataError means
# df = pd.DataFrame()
return cls.new_df()

@classmethod
def read_flexwf(
cls,
Expand Down
4 changes: 2 additions & 2 deletions typeddfs/_mixins/_ini_like_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def read_toml(
Args:
path_or_buff: Path or buffer
aot: The name of the array of tables (i.e. ``[[ table ]]`)
aot: The name of the array of tables (i.e. ``[[ table ]]``)
If None, finds the unique outermost TOML key, implying ``aot_only``.
aot_only: Fail if any outermost keys other than the AOT are found
kwargs: Passed to ``Utils.read``
Expand Down Expand Up @@ -130,7 +130,7 @@ def to_toml(
Args:
path_or_buff: Path or buffer
aot: The name of the array of tables (i.e. ``[[ table ]]`)
aot: The name of the array of tables (i.e. ``[[ table ]]``)
comment: Comment line(s) to add at the top of the document
mode: 'w' (write) or 'a' (append)
kwargs: Passed to :meth:`typeddfs.utils.Utils.write`
Expand Down
Loading

0 comments on commit 57504da

Please sign in to comment.