ing-bank · mbaak · Jan 20, 2025 · Dec 6, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python: ['3.8', '3.9', '3.10', '3.11']
+        python: ['3.9', '3.10', '3.11', '3.12']
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -35,21 +35,23 @@ jobs:
     - name: Test with pytest
       run: |
         pytest -m "not spark"
-        
+
   test_spark:
     strategy:
       matrix:
         include:
-#          - SPARK_VERSION: "2.4.8"
-#            HADOOP_VERSION: "2.7"
-#            JAVA_VERSION: "8"
-#            python: "3.7"
-#            os: ubuntu-latest
           - SPARK_VERSION: "3.3.2"
             HADOOP_VERSION: "3"
             JAVA_VERSION: "11"
-            python: "3.8"
+            python: "3.9"
             os: ubuntu-latest
+            dependency_constraints: '"pandas<2" "numpy<2"'
+          - SPARK_VERSION: "3.5.4"
+            HADOOP_VERSION: "3"
+            JAVA_VERSION: "11"
+            python: "3.12"
+            os: ubuntu-latest
+            dependency_constraints: '"pandas>=2" "numpy>=2"'
     runs-on: ${{ matrix.os }}
     name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }}
 
@@ -67,10 +69,9 @@ jobs:
           /home/runner/work/spark.tgz
           ~/.cache/pip
         key: ${{ runner.os }}-spark-${{ matrix.SPARK_VERSION }}-hadoop${{ matrix.HADOOP_VERSION }}-java${{ matrix.JAVA_VERSION }}-${{ hashFiles('**/pyproject.toml') }}
-    - name: Install dependencies
+    - name: Install pip and setuptools
       run: |
         python -m pip install --upgrade pip setuptools
-        pip install -e .[test]
     - name: Download spark
       if: steps.cache-spark.outputs.cache-hit != 'true'
       env:
@@ -93,6 +94,12 @@ jobs:
         # https://github.com/python-poetry/poetry/issues/6792
         pip3 install "pypandoc<1.8" 
         pip install "pyspark==${SPARK_VERSION}"
+    - name: Install Spark-related dependency versions
+      run: |
+        pip install ${{ matrix.dependency_constraints }}
+    - name: Install project dependencies
+      run: |
+        pip install -e .[test]
     - name: Test with pytest (spark-specific)
       env:
         BUILD_DIR: "/home/runner/work/" #${{ github.workspace }}
@@ -106,15 +113,15 @@ jobs:
 
   examples:
     runs-on: ubuntu-latest
-    needs: 
+    needs:
     - test
     - test_spark
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: 3.9
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -130,17 +137,17 @@ jobs:
           python flight_delays.py
           cd synthetic_data_streams
           python hyperplane.py
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: synthetic-report
           path: examples/test_data_report.html
           if-no-files-found: error
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: flight-delays-report
           path: examples/flight_delays_report.html
           if-no-files-found: error
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: hyperplane-1-report
           path: examples/synthetic_data_streams/reports/hyperplane_1.html

diff --git a/.gitignore b/.gitignore
@@ -147,3 +147,5 @@ docs/build
 # Developer's playground
 /playground/
 .ruff_cache/
+
+notebooks/report.html
diff --git a/docs/source/developing.rst b/docs/source/developing.rst
@@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
 .. code-block:: bash
 
   cd popmon/
-  pip install -r requirements-test.txt
-  python setup.py test
+  pip install -e .[test]
+  pytest
 
 That's it!
 

diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py
@@ -83,7 +83,7 @@ def expanding_mean(df, shift: int = 1):
     :param int shift: size of shift. default is 1.
     :return: df with expanding means of columns
     """
-    return df.shift(shift).expanding().mean()
+    return df.shift(shift).expanding().mean(numeric_only=True)
 
 
 def expanding_std(df, shift: int = 1):
@@ -95,7 +95,7 @@ def expanding_std(df, shift: int = 1):
     :param int shift: size of shift. default is 1.
     :return: df with expanding std of columns
     """
-    return df.shift(shift).expanding().std()
+    return df.shift(shift).expanding().std(numeric_only=True)
 
 
 def expanding_apply(df, func, shift: int = 1, *args, **kwargs):
@@ -123,7 +123,7 @@ def rolling_std(df, window, shift: int = 1):
     :param int window: size of rolling window.
     :return: df with rolling std of columns
     """
-    return df.shift(shift).rolling(window).std()
+    return df.shift(shift).rolling(window).std(numeric_only=True)
 
 
 def rolling_mean(df, window, shift: int = 1):
@@ -136,7 +136,7 @@ def rolling_mean(df, window, shift: int = 1):
     :param int window: size of rolling window.
     :return: df with rolling mean of columns
     """
-    return df.shift(shift).rolling(window).mean()
+    return df.shift(shift).rolling(window).mean(numeric_only=True)
 
 
 def rolling_apply(df, window, func, shift: int = 1, *args, **kwargs):

diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py
@@ -186,9 +186,7 @@ def replace(bl):
     if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
         return np.nan
     if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
-        if not np.all(
-            [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
-        ):
+        if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
             return np.nan
         # all strings from hereon
         n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()

diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py
@@ -16,7 +16,7 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
+from functools import partial
 
 import numpy as np
 import pandas as pd
@@ -233,8 +233,8 @@ def __init__(
         :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
         """
         super().__init__(
-            np.mean,
-            np.std,
+            partial(pd.DataFrame.mean, numeric_only=True),
+            partial(pd.DataFrame.std, numeric_only=True, ddof=0),
             reference_key,
             assign_to_key,
             store_key,

diff --git a/popmon/notebooks/__init__.py b/popmon/notebooks/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/popmon/resources.py b/popmon/resources.py
@@ -20,37 +20,30 @@
 
 # Resources lookup file for popmon
 import json
-import pathlib
+from importlib import resources
 
 from jinja2 import Environment, FileSystemLoader
-from pkg_resources import resource_filename
 
-import popmon
+from popmon import notebooks, test_data, visualization
 
 # data files that are shipped with popmon.
-_DATA = {
-    _.name: _
-    for _ in pathlib.Path(resource_filename(popmon.__name__, "test_data")).glob("*")
-}
+_DATA = {_.name: _ for _ in resources.files(test_data).iterdir()}
 
 # Tutorial notebooks
 _NOTEBOOK = {
-    _.name: _
-    for _ in pathlib.Path(resource_filename(popmon.__name__, "notebooks")).glob(
-        "*.ipynb"
-    )
+    p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb"
 }
 
 # Resource types
 _RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK}
 
 # Environment for visualization templates' directory
-_TEMPLATES_ENV = Environment(
-    loader=FileSystemLoader(
-        resource_filename(popmon.__name__, "visualization/templates")
-    ),
-    autoescape=True,
-)
+ref = resources.files(visualization) / "templates"
+with resources.as_file(ref) as templates_dir_path:
+    _TEMPLATES_ENV = Environment(
+        loader=FileSystemLoader(templates_dir_path),
+        autoescape=True,
+    )
 _TEMPLATES_ENV.filters["fmt_metric"] = lambda x: x.replace("_", " ")
 
 

diff --git a/popmon/test_data/__init__.py b/popmon/test_data/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2023 ING Analytics Wholesale Banking
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,14 +17,14 @@ keywords = [
     "ipython"
 ]
 readme = "README.rst"
-requires-python = ">=3.7"
+requires-python = ">=3.9"
 authors = [{name = "ING Analytics Wholesale Banking", email = "[email protected]"}]
 license = {type = "MIT", file = "LICENSE"}
 dependencies = [
     "numpy>=1.18.0",
-    "pandas>=0.25.1,<2",
+    "pandas>=0.25.1",
     "scipy>=1.5.2",
-    "histogrammar>=1.0.32",
+    "histogrammar>=1.0.34",
     "phik",
     "jinja2",
     "tqdm",

diff --git a/requirements.txt b/requirements.txt
@@ -1,12 +1,13 @@
 numpy>=1.18.0
 pandas>=0.25.1
 scipy>=1.5.2
-histogrammar>=1.0.32
+histogrammar>=1.0.34
 phik
 jinja2
 tqdm
 plotly>=5.8.0
 joblib>=0.14.0
 htmlmin
-pydantic
-typing_extensions
+pydantic>=2
+pydantic-settings
+typing_extensions
diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py
@@ -13,6 +13,11 @@
 from popmon.base import Pipeline
 
 
+def mean(x):
+    """Column-wise np.mean: reduce ``x`` along axis 0 (one mean per column for 2-D input)."""
+    return np.mean(x, axis=0)
+
+
 def get_test_data():
     df = pd.DataFrame()
     df["a"] = np.arange(100)
@@ -25,7 +30,7 @@ def test_pull():
 
     module1 = ApplyFunc(apply_to_key="to_profile")
     module1.add_apply_func(np.std, suffix="_std", entire=True)
-    module1.add_apply_func(np.mean, suffix="_mean", entire=True)
+    module1.add_apply_func(mean, suffix="_mean", entire=True)
 
     module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
     module2.add_apply_func(
@@ -57,7 +62,7 @@ def func(x):
     )
 
     module.add_apply_func(np.std, entire=True)
-    module.add_apply_func(np.mean, entire=True)
+    module.add_apply_func(mean, entire=True)
     module.add_apply_func(func)
 
     datastore = module.transform(datastore)
@@ -77,7 +82,7 @@ def test_variance_comparer():
         apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
     )
     module1.add_apply_func(np.std, suffix="_std", entire=True)
-    module1.add_apply_func(np.mean, suffix="_mean", entire=True)
+    module1.add_apply_func(mean, suffix="_mean", entire=True)
 
     module2 = ApplyFunc(
         apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
@@ -171,7 +176,7 @@ def test_apply_func():
 
     apply_funcs = [
         {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
-        {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
+        {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
     ]
 
     d = apply_func(
@@ -195,7 +200,7 @@ def test_apply_func_array():
 
     apply_funcs = [
         {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
-        {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
+        {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
     ]
 
     f, p = apply_func_array(