From 03b5fe79fffde377ebc85dfc36873c57e7882f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20C=C3=A9spedes=20Sisniega?= Date: Mon, 14 Aug 2023 12:53:02 +0200 Subject: [PATCH 1/4] Add unit test MMD univariate data --- frouros/tests/unit/detectors/__init__.py | 1 + .../unit/detectors/data_drift/__init__.py | 1 + .../detectors/data_drift/batch/__init__.py | 1 + .../batch/distance_based/__init__.py | 1 + .../batch/distance_based/test_mmd.py | 49 +++++++++++++++++++ 5 files changed, 53 insertions(+) create mode 100644 frouros/tests/unit/detectors/__init__.py create mode 100644 frouros/tests/unit/detectors/data_drift/__init__.py create mode 100644 frouros/tests/unit/detectors/data_drift/batch/__init__.py create mode 100644 frouros/tests/unit/detectors/data_drift/batch/distance_based/__init__.py create mode 100644 frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py diff --git a/frouros/tests/unit/detectors/__init__.py b/frouros/tests/unit/detectors/__init__.py new file mode 100644 index 0000000..ff97033 --- /dev/null +++ b/frouros/tests/unit/detectors/__init__.py @@ -0,0 +1 @@ +"""Detectors test init.""" diff --git a/frouros/tests/unit/detectors/data_drift/__init__.py b/frouros/tests/unit/detectors/data_drift/__init__.py new file mode 100644 index 0000000..f4b2a65 --- /dev/null +++ b/frouros/tests/unit/detectors/data_drift/__init__.py @@ -0,0 +1 @@ +"""Data drift detectors test init.""" diff --git a/frouros/tests/unit/detectors/data_drift/batch/__init__.py b/frouros/tests/unit/detectors/data_drift/batch/__init__.py new file mode 100644 index 0000000..4eb94a3 --- /dev/null +++ b/frouros/tests/unit/detectors/data_drift/batch/__init__.py @@ -0,0 +1 @@ +"""Batch data drift detectors test init.""" diff --git a/frouros/tests/unit/detectors/data_drift/batch/distance_based/__init__.py b/frouros/tests/unit/detectors/data_drift/batch/distance_based/__init__.py new file mode 100644 index 0000000..6f6178d --- /dev/null +++ b/frouros/tests/unit/detectors/data_drift/batch/distance_based/__init__.py @@ -0,0 +1 @@ +"""Distance based batch data drift detectors test init.""" diff --git a/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py new file mode 100644 index 0000000..ba3cddb --- /dev/null +++ b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py @@ -0,0 +1,49 @@ +"""Test MMD.""" + +from functools import partial +from typing import Tuple + +import numpy as np # type: ignore +import pytest # type: ignore + +from frouros.detectors.data_drift import MMD +from frouros.utils.kernels import rbf_kernel + + +@pytest.mark.parametrize( + "distribution_p, distribution_q, expected_distance", + [ + ((0, 1, 100), (0, 1, 100), 0.00052755), # (mean, std, size) + ((0, 1, 100), (0, 1, 10), -0.03200193), + ((0, 1, 10), (0, 1, 100), 0.07154671), + ((2, 1, 100), (0, 1, 100), 0.43377622), + ((2, 1, 100), (0, 1, 10), 0.23051378), + ((2, 1, 10), (0, 1, 100), 0.62530767), + ], +) +def test_mmd_batch_univariate( + distribution_p: Tuple[float, float, int], + distribution_q: Tuple[float, float, int], + expected_distance: float, +) -> None: + """Test MMD batch with univariate data. + + :param distribution_p: mean, std and size of samples from distribution p + :type distribution_p: Tuple[float, float, int] + :param distribution_q: mean, std and size of samples from distribution q + :type distribution_q: Tuple[float, float, int] + :param expected_distance: expected distance value + :type expected_distance: float + """ + np.random.seed(seed=31) + X_ref = np.random.normal(*distribution_p) # noqa: N806 + X_test = np.random.normal(*distribution_q) # noqa: N806 + + detector = MMD( + kernel=partial(rbf_kernel, sigma=0.5), + ) + _ = detector.fit(X=X_ref) + + result = detector.compare(X=X_test)[0] + + assert np.isclose(result.distance, expected_distance) From d2ef3e62b18e5d79cb62c026393dfe78d8891ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20C=C3=A9spedes=20Sisniega?= Date: Mon, 14 Aug 2023 16:02:32 +0200 Subject: [PATCH 2/4] Precompute kernel matrix of reference data in MMD --- .../data_drift/batch/distance_based/mmd.py | 108 +++++++++++------- 1 file changed, 69 insertions(+), 39 deletions(-) diff --git a/frouros/detectors/data_drift/batch/distance_based/mmd.py b/frouros/detectors/data_drift/batch/distance_based/mmd.py index 038c44f..8310c84 100644 --- a/frouros/detectors/data_drift/batch/distance_based/mmd.py +++ b/frouros/detectors/data_drift/batch/distance_based/mmd.py @@ -64,6 +64,7 @@ def __init__( # noqa: D107 ) self.kernel = kernel self.chunk_size = chunk_size + self._expected_k_xx = None @property def chunk_size(self) -> Optional[int]: @@ -122,11 +123,47 @@ def _distance_measure( Y=X, kernel=self.kernel, chunk_size=self.chunk_size, + expected_k_xx=self._expected_k_xx, **kwargs, ) distance_test = DistanceResult(distance=mmd) return distance_test + def _fit( + self, + X: np.ndarray, # noqa: N803 + ) -> None: + super()._fit(X=X) + # Add dimension only for the kernel calculation (if dim == 1) + if X.ndim == 1: + X = np.expand_dims(X, axis=1) # noqa: N806 + x_num_samples = len(self.X_ref) # type: ignore # noqa: N806 + + chunk_size_x = ( + x_num_samples + if self.chunk_size is None + else self.chunk_size # type: ignore + ) + + x_chunks = self._get_chunks( # noqa: N806 + data=X, + chunk_size=chunk_size_x, # type: ignore + ) + x_chunks_combinations = itertools.product(x_chunks, repeat=2) # noqa: N806 + + k_xx_sum = ( + self._compute_kernel( + chunk_combinations=x_chunks_combinations, # type: ignore + kernel=self.kernel, + ) + # Remove diagonal (j!=i case) + - x_num_samples # type: ignore + ) + + self._expected_k_xx = k_xx_sum / ( # type: ignore + x_num_samples * (x_num_samples - 1) # type: ignore + ) + @staticmethod def _compute_kernel(chunk_combinations: Generator, kernel: Callable) -> float: k_sum = np.array([kernel(*chunk).sum() for chunk in chunk_combinations]).sum() @@ -159,13 +196,39 @@ def _mmd( # pylint: disable=too-many-locals if "chunk_size" in kwargs and kwargs["chunk_size"] is not None else x_num_samples ) - x_chunks, x_chunks_copy = itertools.tee( # noqa: N806 - MMD._get_chunks( + + # If expected_k_xx is provided, we don't need to compute it again + if "expected_k_xx" in kwargs: + x_chunks_copy = MMD._get_chunks( # noqa: N806 data=X, chunk_size=chunk_size_x, # type: ignore - ), - 2, - ) + ) + expected_k_xx = kwargs["expected_k_xx"] + else: + # Compute expected_k_xx + x_chunks, x_chunks_copy = itertools.tee( # noqa: N806 + MMD._get_chunks( + data=X, + chunk_size=chunk_size_x, # type: ignore + ), + 2, + ) + x_chunks_combinations = itertools.product( # noqa: N806 + x_chunks, + repeat=2, + ) + k_xx_sum = ( + MMD._compute_kernel( + chunk_combinations=x_chunks_combinations, # type: ignore + kernel=kernel, + ) + # Remove diagonal (j!=i case) + - x_num_samples # type: ignore + ) + expected_k_xx = k_xx_sum / ( # type: ignore + x_num_samples * (x_num_samples - 1) # type: ignore + ) + y_num_samples = len(Y) # noqa: N806 chunk_size_y = ( kwargs["chunk_size"] @@ -179,10 +242,6 @@ def _mmd( # pylint: disable=too-many-locals ), 2, ) - x_chunks_combinations = itertools.product( # noqa: N806 - x_chunks, - repeat=2, - ) y_chunks_combinations = itertools.product( # noqa: N806 y_chunks, repeat=2, @@ -192,35 +251,6 @@ def _mmd( # pylint: disable=too-many-locals y_chunks_copy, ) - if kwargs.get("verbose", False): - num_chunks_x = math.ceil(x_num_samples / chunk_size_x) # type: ignore - num_chunks_y = math.ceil(y_num_samples / chunk_size_y) # type: ignore - num_chunks_x_combinations = num_chunks_x**2 - num_chunks_y_combinations = num_chunks_y**2 - num_chunks_xy = ( - math.ceil(len(X) / chunk_size_x) * num_chunks_y # type: ignore - ) - x_chunks_combinations = tqdm.tqdm( - x_chunks_combinations, - total=num_chunks_x_combinations, - ) - y_chunks_combinations = tqdm.tqdm( - y_chunks_combinations, - total=num_chunks_y_combinations, - ) - xy_chunks_combinations = tqdm.tqdm( - xy_chunks_combinations, - total=num_chunks_xy, - ) - - k_xx_sum = ( - MMD._compute_kernel( - chunk_combinations=x_chunks_combinations, # type: ignore - kernel=kernel, - ) - # Remove diagonal (j!=i case) - - x_num_samples # type: ignore - ) k_yy_sum = ( MMD._compute_kernel( chunk_combinations=y_chunks_combinations, # type: ignore @@ -234,7 +264,7 @@ def _mmd( # pylint: disable=too-many-locals kernel=kernel, ) mmd = ( - +k_xx_sum / (x_num_samples * (x_num_samples - 1)) + + expected_k_xx + k_yy_sum / (y_num_samples * (y_num_samples - 1)) - 2 * k_xy_sum / (x_num_samples * y_num_samples) # type: ignore ) From f137843081b4cc72cdb9196f2c158f486acf94a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20C=C3=A9spedes=20Sisniega?= Date: Mon, 14 Aug 2023 16:06:59 +0200 Subject: [PATCH 3/4] Add unit test MMD precomputed --- .../batch/distance_based/test_mmd.py | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py index ba3cddb..8eed867 100644 --- a/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py +++ b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py @@ -1,7 +1,7 @@ """Test MMD.""" from functools import partial -from typing import Tuple +from typing import Optional, Tuple import numpy as np # type: ignore import pytest # type: ignore @@ -47,3 +47,58 @@ def test_mmd_batch_univariate( result = detector.compare(X=X_test)[0] assert np.isclose(result.distance, expected_distance) + + +@pytest.mark.parametrize( + "distribution_p, distribution_q, chunk_size", + [ + ((0, 1, 100), (0, 1, 100), None), # (mean, std, size) + ((0, 1, 100), (0, 1, 100), 2), + ((0, 1, 100), (0, 1, 100), 10), + ((0, 1, 100), (0, 1, 10), None), + ((0, 1, 100), (0, 1, 10), 2), + ((0, 1, 100), (0, 1, 10), 10), + ((0, 1, 10), (0, 1, 100), None), + ((0, 1, 10), (0, 1, 100), 2), + ((0, 1, 10), (0, 1, 100), 10), + ], +) +def test_mmd_batch_precomputed_expected_k_xx( + distribution_p: Tuple[float, float, int], + distribution_q: Tuple[float, float, int], + chunk_size: Optional[int], +) -> None: + """Test MMD batch with precomputed expected k_xx. + + :param distribution_p: mean, std and size of samples from distribution p + :type distribution_p: Tuple[float, float, int] + :param distribution_q: mean, std and size of samples from distribution q + :type distribution_q: Tuple[float, float, int] + :param chunk_size: chunk size + :type chunk_size: Optional[int] + """ + np.random.seed(seed=31) + X_ref = np.random.normal(*distribution_p) # noqa: N806 + X_test = np.random.normal(*distribution_q) # noqa: N806 + + kernel = partial(rbf_kernel, sigma=0.5) + + detector = MMD( + kernel=kernel, + chunk_size=chunk_size, + ) + _ = detector.fit(X=X_ref) + + # Computes mmd using precomputed expected k_xx + precomputed_distance = detector.compare(X=X_test)[0].distance + + # Computes mmd from scratch + scratch_distance = MMD._mmd( + X=X_ref, + Y=X_test, + kernel=kernel, + chunk_size=chunk_size, + ) + + assert np.isclose(precomputed_distance, scratch_distance) + From 3598aa26524a32628557ed3366ff9a9580eaac06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20C=C3=A9spedes=20Sisniega?= Date: Mon, 14 Aug 2023 16:27:18 +0200 Subject: [PATCH 4/4] Fix PEP8 --- .../data_drift/batch/distance_based/mmd.py | 42 +++++++++---------- .../batch/distance_based/test_mmd.py | 3 +- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/frouros/detectors/data_drift/batch/distance_based/mmd.py b/frouros/detectors/data_drift/batch/distance_based/mmd.py index 8310c84..9dc9641 100644 --- a/frouros/detectors/data_drift/batch/distance_based/mmd.py +++ b/frouros/detectors/data_drift/batch/distance_based/mmd.py @@ -1,11 +1,9 @@ """MMD (Maximum Mean Discrepancy) module.""" import itertools -import math from typing import Callable, Generator, Optional, List, Union import numpy as np # type: ignore -import tqdm # type: ignore from frouros.callbacks.batch.base import BaseCallbackBatch from frouros.detectors.data_drift.base import MultivariateData @@ -137,7 +135,7 @@ def _fit( # Add dimension only for the kernel calculation (if dim == 1) if X.ndim == 1: X = np.expand_dims(X, axis=1) # noqa: N806 - x_num_samples = len(self.X_ref) # type: ignore # noqa: N806 + x_num_samples = len(self.X_ref) # type: ignore chunk_size_x = ( x_num_samples @@ -147,7 +145,7 @@ def _fit( x_chunks = self._get_chunks( # noqa: N806 data=X, - chunk_size=chunk_size_x, # type: ignore + chunk_size=chunk_size_x, ) x_chunks_combinations = itertools.product(x_chunks, repeat=2) # noqa: N806 @@ -157,11 +155,11 @@ def _fit( kernel=self.kernel, ) # Remove diagonal (j!=i case) - - x_num_samples # type: ignore + - x_num_samples ) self._expected_k_xx = k_xx_sum / ( # type: ignore - x_num_samples * (x_num_samples - 1) # type: ignore + x_num_samples * (x_num_samples - 1) ) @staticmethod @@ -201,33 +199,31 @@ def _mmd( # pylint: disable=too-many-locals if "expected_k_xx" in kwargs: x_chunks_copy = MMD._get_chunks( # noqa: N806 data=X, - chunk_size=chunk_size_x, # type: ignore + chunk_size=chunk_size_x, ) expected_k_xx = kwargs["expected_k_xx"] else: # Compute expected_k_xx - x_chunks, x_chunks_copy = itertools.tee( # noqa: N806 + x_chunks, x_chunks_copy = itertools.tee( # type: ignore MMD._get_chunks( data=X, - chunk_size=chunk_size_x, # type: ignore + chunk_size=chunk_size_x, ), 2, ) - x_chunks_combinations = itertools.product( # noqa: N806 + x_chunks_combinations = itertools.product( # type: ignore x_chunks, repeat=2, ) k_xx_sum = ( - MMD._compute_kernel( - chunk_combinations=x_chunks_combinations, # type: ignore - kernel=kernel, - ) - # Remove diagonal (j!=i case) - - x_num_samples # type: ignore - ) - expected_k_xx = k_xx_sum / ( # type: ignore - x_num_samples * (x_num_samples - 1) # type: ignore + MMD._compute_kernel( + chunk_combinations=x_chunks_combinations, # type: ignore + kernel=kernel, + ) + # Remove diagonal (j!=i case) + - x_num_samples ) + expected_k_xx = k_xx_sum / (x_num_samples * (x_num_samples - 1)) y_num_samples = len(Y) # noqa: N806 chunk_size_y = ( @@ -238,7 +234,7 @@ def _mmd( # pylint: disable=too-many-locals y_chunks, y_chunks_copy = itertools.tee( # noqa: N806 MMD._get_chunks( data=Y, - chunk_size=chunk_size_y, # type: ignore + chunk_size=chunk_size_y, ), 2, ) @@ -257,15 +253,15 @@ def _mmd( # pylint: disable=too-many-locals kernel=kernel, ) # Remove diagonal (j!=i case) - - y_num_samples # type: ignore + - y_num_samples ) k_xy_sum = MMD._compute_kernel( chunk_combinations=xy_chunks_combinations, # type: ignore kernel=kernel, ) mmd = ( - + expected_k_xx + +expected_k_xx + k_yy_sum / (y_num_samples * (y_num_samples - 1)) - - 2 * k_xy_sum / (x_num_samples * y_num_samples) # type: ignore + - 2 * k_xy_sum / (x_num_samples * y_num_samples) ) return mmd diff --git a/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py index 8eed867..b275809 100644 --- a/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py +++ b/frouros/tests/unit/detectors/data_drift/batch/distance_based/test_mmd.py @@ -93,7 +93,7 @@ def test_mmd_batch_precomputed_expected_k_xx( precomputed_distance = detector.compare(X=X_test)[0].distance # Computes mmd from scratch - scratch_distance = MMD._mmd( + scratch_distance = MMD._mmd( # pylint: disable=protected-access X=X_ref, Y=X_test, kernel=kernel, @@ -101,4 +101,3 @@ def test_mmd_batch_precomputed_expected_k_xx( ) assert np.isclose(precomputed_distance, scratch_distance) -