From 3d9b5ffdef05cbf9507f9493233e5f0a47b2dfd6 Mon Sep 17 00:00:00 2001
From: Taylor Turner <taylorfturner@gmail.com>
Date: Tue, 16 Jan 2024 11:19:35 -0500
Subject: [PATCH 01/17] add downloads tile (#1085)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 3ba4ee51b..1df9a2ea3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/DataProfiler)
 ![GitHub](https://img.shields.io/github/license/CapitalOne/DataProfiler)
 ![GitHub last commit](https://img.shields.io/github/last-commit/CapitalOne/DataProfiler)
+[![Downloads](https://static.pepy.tech/badge/dataprofiler)](https://pepy.tech/project/dataprofiler)
 
 <p text-align="left">
     <picture>

From 516c6f52b74e13786abbd429c4141b2163c2d58d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Fri, 2 Feb 2024 13:06:06 -0500
Subject: [PATCH 02/17] Add Python 3.11 to GHA

---
 .github/workflows/publish-python-package.yml | 2 +-
 .github/workflows/test-python-package.yml    | 2 +-
 tox.ini                                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml
index 75b9a41e2..4ed9e1bf3 100644
--- a/.github/workflows/publish-python-package.yml
+++ b/.github/workflows/publish-python-package.yml
@@ -20,7 +20,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.10'
+        python-version: '3.11'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml
index fa84b3d3a..5b7d6e5ab 100644
--- a/.github/workflows/test-python-package.yml
+++ b/.github/workflows/test-python-package.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8, 3.9, "3.10"]
+        python-version: [3.8, 3.9, "3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/tox.ini b/tox.ini
index 55fa50147..18c327525 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py37, py38, py39, py310, docs, pypi-description, manifest, precom
+envlist = py38, py39, py310, py311, docs, pypi-description, manifest, precom
 
 
 [testenv]

From f41111034b415c600387a1006f541e928cf9db49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 09:12:10 -0400
Subject: [PATCH 03/17] Replace snappy with cramjam (#1091)

* add downloads tile (#1085)

* Replace snappy with cramjam

* Delete test_no_snappy

---------

Co-authored-by: Taylor Turner <taylorfturner@gmail.com>
---
 .pre-commit-config.yaml                  |  2 +-
 dataprofiler/__init__.py                 | 16 ----------
 dataprofiler/tests/test_data_profiler.py | 40 ------------------------
 requirements.txt                         |  2 +-
 4 files changed, 2 insertions(+), 58 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 203e62b1f..b1d3ca62a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -55,7 +55,7 @@ repos:
             pyarrow>=1.0.1,
             chardet>=3.0.4,
             fastavro>=1.0.0.post1,
-            python-snappy>=0.5.4,
+            cramjam>=2.7.0,
             charset-normalizer>=1.3.6,
             psutil>=4.0.0,
             scipy>=1.4.1,
diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py
index 2e89d3e2b..5f218bd85 100644
--- a/dataprofiler/__init__.py
+++ b/dataprofiler/__init__.py
@@ -20,22 +20,6 @@
 from .validators.base_validators import Validator
 from .version import __version__
 
-try:
-    import snappy
-except ImportError:
-    import warnings
-
-    warnings.warn(
-        "Snappy must be installed to use parquet/avro datasets."
-        "\n\n"
-        "For macOS use Homebrew:\n"
-        "\t`brew install snappy`"
-        "\n\n"
-        "For linux use apt-get:\n`"
-        "\tsudo apt-get -y install libsnappy-dev`\n",
-        ImportWarning,
-    )
-
 
 def set_seed(seed=None):
     # also check it's an integer
diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py
index ef7664cea..9ebdfa039 100644
--- a/dataprofiler/tests/test_data_profiler.py
+++ b/dataprofiler/tests/test_data_profiler.py
@@ -56,46 +56,6 @@ def test_data_profiling(self):
             self.assertIsNotNone(profile.profile)
             self.assertIsNotNone(profile.report())
 
-    def test_no_snappy(self):
-        import importlib
-        import sys
-        import types
-
-        orig_import = __import__
-        # necessary for any wrapper around the library to test if snappy caught
-        # as an issue
-
-        def reload_data_profiler():
-            """Recursively reload modules."""
-            sys_modules = sys.modules.copy()
-            for module_name, module in sys_modules.items():
-                # Only reload top level of the dataprofiler
-                if "dataprofiler" in module_name and len(module_name.split(".")) < 3:
-                    if isinstance(module, types.ModuleType):
-                        importlib.reload(module)
-
-        def import_mock(name, *args, **kwargs):
-            if name == "snappy":
-                raise ImportError("test")
-            return orig_import(name, *args, **kwargs)
-
-        with mock.patch("builtins.__import__", side_effect=import_mock):
-            with self.assertWarns(ImportWarning) as w:
-                import dataprofiler
-
-                reload_data_profiler()
-
-        self.assertEqual(
-            str(w.warning),
-            "Snappy must be installed to use parquet/avro datasets."
-            "\n\n"
-            "For macOS use Homebrew:\n"
-            "\t`brew install snappy`"
-            "\n\n"
-            "For linux use apt-get:\n`"
-            "\tsudo apt-get -y install libsnappy-dev`\n",
-        )
-
     def test_no_tensorflow(self):
         import sys
 
diff --git a/requirements.txt b/requirements.txt
index a45dc34ae..405f808b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ pytz>=2020.1
 pyarrow>=1.0.1
 chardet>=3.0.4
 fastavro>=1.0.0.post1
-python-snappy>=0.5.4
+cramjam>=2.7.0
 charset-normalizer>=1.3.6
 psutil>=4.0.0
 scipy>=1.10.0

From f814ab7e9e0030a847bf5c7cd0d19765c60b7e1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 11:01:49 -0400
Subject: [PATCH 04/17] Update dask modules

---
 requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index df4be852e..073628420 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
 coverage>=5.0.1
-dask>=2.29.0
+dask[dask-expr]>=2020.12.0
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From eb9d89ef0ba43041f6745e70cfab92dac6f4325a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 11:14:09 -0400
Subject: [PATCH 05/17] Install dask dataframe

---
 requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 073628420..4281e6060 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
 coverage>=5.0.1
-dask[dask-expr]>=2020.12.0
+dask[dask-expr,dataframe]>=2020.12.0
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From d23c4851c5a38f27af7fdeb2c3b8c3851e8a3420 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 11:39:21 -0400
Subject: [PATCH 06/17] Update dask modules in precommit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b1d3ca62a..c3ecf7f5b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -93,7 +93,7 @@ repos:
 
             # requirements-test.txt
             coverage>=5.0.1,
-            dask>=2.29.0,
+            dask[dask-expr,dataframe]>=2020.12.0,
             fsspec>=0.3.3,
             pytest>=6.0.1,
             pytest-cov>=2.8.1,

From 63acf45c08ad8514f9eee9014e5858e44d58efbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 11:44:41 -0400
Subject: [PATCH 07/17] Correct copy/paste error


From bd1874e101c0b8d11ad02f1b6c8dfc69e3e2deb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Thu, 14 Mar 2024 11:48:36 -0400
Subject: [PATCH 08/17] Try again to clear Unicode


From b9f7a8a154c357a54edab248e9591388602dc261 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 18 Mar 2024 13:53:09 -0400
Subject: [PATCH 09/17] Rolled back pre-commit dask version

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c3ecf7f5b..b1d3ca62a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -93,7 +93,7 @@ repos:
 
             # requirements-test.txt
             coverage>=5.0.1,
-            dask[dask-expr,dataframe]>=2020.12.0,
+            dask>=2.29.0,
             fsspec>=0.3.3,
             pytest>=6.0.1,
             pytest-cov>=2.8.1,

From 95bba8289e5e27b7f93e3d91bce73881654fbeae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Sun, 9 Jun 2024 14:04:24 -0400
Subject: [PATCH 10/17] Add py311 to tox

---
 tox.ini | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tox.ini b/tox.ini
index dc3a7c6c6..90d06af06 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,5 @@
 [tox]
-envlist = py39, py310, pypi-description, manifest, precom
-
+envlist = py39, py310, py311, pypi-description, manifest, precom
 
 
 [testenv]

From c507ade53436fc82d763bdd81713667b7c03d64b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Sun, 9 Jun 2024 14:05:44 -0400
Subject: [PATCH 11/17] Bump dask to 2024.4.1

---
 requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 60ef71bc4..725b23849 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
 coverage>=5.0.1
-dask[dask-expr,dataframe]>=2.29.0,<2024.2.0
+dask[dask-expr,dataframe]>=2024.4.1
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From 56b8b2cf1043044a29dd9e8e5d1a4feabcbb47e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 10 Jun 2024 10:51:24 -0400
Subject: [PATCH 12/17] Bump python-snappy to 0.7.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 152b5eb36..b3df933ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ pytz>=2020.1
 pyarrow>=1.0.1
 chardet>=3.0.4
 fastavro>=1.1.0
-python-snappy>=0.5.4
+python-snappy>=0.7.1
 charset-normalizer>=1.3.6
 psutil>=4.0.0
 scipy>=1.10.0

From e118ee84ad6130b26efead02946f5eb1736c6b09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 10 Jun 2024 16:51:10 -0400
Subject: [PATCH 13/17] Rewrite labeler test

---
 .../tests/labelers/test_labeler_utils.py      | 43 ++++++++-----------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py
index f59a43e3f..361959449 100644
--- a/dataprofiler/tests/labelers/test_labeler_utils.py
+++ b/dataprofiler/tests/labelers/test_labeler_utils.py
@@ -1,6 +1,7 @@
 import logging
 import unittest
 from unittest import mock
+import tempfile
 
 import numpy as np
 import pandas as pd
@@ -235,9 +236,7 @@ def test_verbose(self):
         self.assertIn("f1-score ", log_output)
         self.assertIn("F1 Score: ", log_output)
 
-    @mock.patch("dataprofiler.labelers.labeler_utils.classification_report")
-    @mock.patch("pandas.DataFrame")
-    def test_save_conf_mat(self, mock_dataframe, mock_report):
+    def test_save_conf_mat(self):
 
         # ideally mock out the actual contents written to file, but
         # would be difficult to get this completely worked out.
@@ -248,29 +247,25 @@ def test_save_conf_mat(self, mock_dataframe, mock_report):
                 [0, 1, 2],
             ]
         )
-        expected_row_col_names = dict(
-            columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"],
-            index=["true:PAD", "true:UNKNOWN", "true:OTHER"],
-        )
-        mock_instance_df = mock.Mock(spec=pd.DataFrame)()
-        mock_dataframe.return_value = mock_instance_df
+        expected_columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"]
+        expected_index=["true:PAD", "true:UNKNOWN", "true:OTHER"]
 
-        # still omit bc confusion mat should include all despite omit
-        f1, f1_report = labeler_utils.evaluate_accuracy(
-            self.y_pred,
-            self.y_true,
-            self.num_labels,
-            self.reverse_label_mapping,
-            omitted_labels=["PAD"],
-            verbose=False,
-            confusion_matrix_file="test.csv",
-        )
-
-        self.assertTrue((mock_dataframe.call_args[0][0] == expected_conf_mat).all())
-        self.assertDictEqual(expected_row_col_names, mock_dataframe.call_args[1])
-
-        mock_instance_df.to_csv.assert_called()
+        with tempfile.NamedTemporaryFile() as tmpFile:
+            # still omit bc confusion mat should include all despite omit
+            f1, f1_report = labeler_utils.evaluate_accuracy(
+                self.y_pred,
+                self.y_true,
+                self.num_labels,
+                self.reverse_label_mapping,
+                omitted_labels=["PAD"],
+                verbose=False,
+                confusion_matrix_file=tmpFile.name,
+            )
 
+            df1 = pd.read_csv(tmpFile.name, index_col=0)
+            self.assertListEqual(list(df1.columns), expected_columns)
+            self.assertListEqual(list(df1.index), expected_index)
+            np.testing.assert_array_equal(df1.values, expected_conf_mat)
 
 class TestTFFunctions(unittest.TestCase):
     def test_get_tf_layer_index_from_name(self):

From 9021a7e5e21b76ef48f60c38251130ba8add3efc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 10 Jun 2024 17:03:51 -0400
Subject: [PATCH 14/17] Correct isort

---
 dataprofiler/tests/labelers/test_labeler_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py
index 361959449..cbac15c9b 100644
--- a/dataprofiler/tests/labelers/test_labeler_utils.py
+++ b/dataprofiler/tests/labelers/test_labeler_utils.py
@@ -1,7 +1,7 @@
 import logging
+import tempfile
 import unittest
 from unittest import mock
-import tempfile
 
 import numpy as np
 import pandas as pd

From c59403466350dfc54064abb92caa332bb74d5619 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 10 Jun 2024 17:09:37 -0400
Subject: [PATCH 15/17] Satisfy black

---
 dataprofiler/tests/labelers/test_labeler_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py
index cbac15c9b..3a3b56ea2 100644
--- a/dataprofiler/tests/labelers/test_labeler_utils.py
+++ b/dataprofiler/tests/labelers/test_labeler_utils.py
@@ -247,8 +247,8 @@ def test_save_conf_mat(self):
                 [0, 1, 2],
             ]
         )
-        expected_columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"]
-        expected_index=["true:PAD", "true:UNKNOWN", "true:OTHER"]
+        expected_columns = ["pred:PAD", "pred:UNKNOWN", "pred:OTHER"]
+        expected_index = ["true:PAD", "true:UNKNOWN", "true:OTHER"]
 
         with tempfile.NamedTemporaryFile() as tmpFile:
             # still omit bc confusion mat should include all despite omit
@@ -267,6 +267,7 @@ def test_save_conf_mat(self):
             self.assertListEqual(list(df1.index), expected_index)
             np.testing.assert_array_equal(df1.values, expected_conf_mat)
 
+
 class TestTFFunctions(unittest.TestCase):
     def test_get_tf_layer_index_from_name(self):
         model = tf.keras.Sequential()

From 94236da642dd1b1b71f79e4f5f73b33ba453f4c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Mon, 10 Jun 2024 17:19:41 -0400
Subject: [PATCH 16/17] Satisfy flake8: remove unused mock import

---
 dataprofiler/tests/labelers/test_labeler_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py
index 3a3b56ea2..c14fca54f 100644
--- a/dataprofiler/tests/labelers/test_labeler_utils.py
+++ b/dataprofiler/tests/labelers/test_labeler_utils.py
@@ -1,7 +1,6 @@
 import logging
 import tempfile
 import unittest
-from unittest import mock
 
 import numpy as np
 import pandas as pd

From 6ca2c875408e4f08faf78fda858799df45cea8b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= <gliptak@gmail.com>
Date: Tue, 11 Jun 2024 09:39:46 -0400
Subject: [PATCH 17/17] Sync pre-commit python-snappy pin with requirements.txt

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f36c52663..ee9bddf6a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -55,7 +55,7 @@ repos:
             pyarrow>=1.0.1,
             chardet>=3.0.4,
             fastavro>=1.0.0.post1,
-            cramjam>=2.7.0,
+            python-snappy>=0.7.1,
             charset-normalizer>=1.3.6,
             psutil>=4.0.0,
             scipy>=1.4.1,