From 68ee7abd6ed505b052522f3fe8616f192ace842d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 17 Apr 2024 13:39:25 +0200 Subject: [PATCH] [bug] fix sample_weight check for IncrementalBasicStatistics (#1799) * Update incremental_basic_statistics.py * formatting' * Update incremental_basic_statistics.py * Update test_incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update incremental_basic_statistics.py * Update test_incremental_basic_statistics.py --- .../incremental_basic_statistics.py | 34 +++++++++++++------ .../test_incremental_basic_statistics.py | 20 +++++------ 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py index 1362d8d406..805bc4d716 100644 --- a/sklearnex/basic_statistics/incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/incremental_basic_statistics.py @@ -17,6 +17,7 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_array, gen_batches +from sklearn.utils.validation import _check_sample_weight from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -139,7 +140,7 @@ def _onedal_finalize_fit(self): self._onedal_estimator.finalize_fit() self._need_to_finalize = False - def _onedal_partial_fit(self, X, weights, queue): + def _onedal_partial_fit(self, X, sample_weight=None, queue=None): first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0 if sklearn_check_version("1.0"): @@ -152,9 +153,11 @@ def _onedal_partial_fit(self, X, weights, queue): X = check_array( X, dtype=[np.float64, np.float32], - copy=self.copy_X, ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + if first_pass: self.n_samples_seen_ = X.shape[0] self.n_features_in_ = X.shape[1] @@ -168,15 +171,18 @@ def _onedal_partial_fit(self, X, weights, queue): self._onedal_estimator = self._onedal_incremental_basic_statistics( **onedal_params ) - self._onedal_estimator.partial_fit(X, weights, queue) + self._onedal_estimator.partial_fit(X, sample_weight, queue) self._need_to_finalize = True - def _onedal_fit(self, X, weights, queue=None): + def _onedal_fit(self, X, sample_weight=None, queue=None): if sklearn_check_version("1.0"): X = self._validate_data(X, dtype=[np.float64, np.float32]) else: X = check_array(X, dtype=[np.float64, np.float32]) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + n_samples, n_features = X.shape if self.batch_size is None: self.batch_size_ = 5 * n_features @@ -189,7 +195,7 @@ def _onedal_fit(self, X, weights, queue=None): for batch in gen_batches(X.shape[0], self.batch_size_): X_batch = X[batch] - weights_batch = weights[batch] if weights is not None else None + weights_batch = sample_weight[batch] if sample_weight is not None else None self._onedal_partial_fit(X_batch, weights_batch, queue=queue) if sklearn_check_version("1.2"): @@ -217,7 +223,7 @@ def __getattr__(self, attr): f"'{self.__class__.__name__}' object has no attribute '{attr}'" ) - def partial_fit(self, X, weights=None): + def partial_fit(self, X, sample_weight=None): """Incremental fit with X. All of X is processed as a single batch. Parameters @@ -226,7 +232,10 @@ def partial_fit(self, X, weights=None): Data for compute, where `n_samples` is the number of samples and `n_features` is the number of features. - weights : array-like of shape (n_samples,) + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None Weights for compute weighted statistics, where `n_samples` is the number of samples. Returns @@ -242,11 +251,11 @@ def partial_fit(self, X, weights=None): "sklearn": None, }, X, - weights, + sample_weight, ) return self - def fit(self, X, weights=None): + def fit(self, X, y=None, sample_weight=None): """Compute statistics with X, using minibatches of size batch_size. Parameters @@ -255,7 +264,10 @@ def fit(self, X, weights=None): Data for compute, where `n_samples` is the number of samples and `n_features` is the number of features. - weights : array-like of shape (n_samples,) + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None Weights for compute weighted statistics, where `n_samples` is the number of samples. Returns @@ -271,6 +283,6 @@ def fit(self, X, weights=None): "sklearn": None, }, X, - weights, + sample_weight, ) return self diff --git a/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py b/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py index e9cb93e26a..9df1a3b16c 100644 --- a/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/tests/test_incremental_basic_statistics.py @@ -52,7 +52,7 @@ def test_partial_fit_multiple_options_on_gold_data(dataframe, queue, weighted, d weights_split_df = _convert_to_dataframe( weights_split[i], sycl_queue=queue, target_df=dataframe ) - result = incbs.partial_fit(X_split_df, weights_split_df) + result = incbs.partial_fit(X_split_df, sample_weight=weights_split_df) else: result = incbs.partial_fit(X_split_df) @@ -103,7 +103,7 @@ def test_partial_fit_single_option_on_random_data( weights_split_df = _convert_to_dataframe( weights_split[i], sycl_queue=queue, target_df=dataframe ) - result = incbs.partial_fit(X_split_df, weights_split_df) + result = incbs.partial_fit(X_split_df, sample_weight=weights_split_df) else: result = incbs.partial_fit(X_split_df) @@ -146,7 +146,7 @@ def test_partial_fit_multiple_options_on_random_data( weights_split_df = _convert_to_dataframe( weights_split[i], sycl_queue=queue, target_df=dataframe ) - result = incbs.partial_fit(X_split_df, weights_split_df) + result = incbs.partial_fit(X_split_df, sample_weight=weights_split_df) else: result = incbs.partial_fit(X_split_df) @@ -199,7 +199,7 @@ def test_partial_fit_all_option_on_random_data( weights_split_df = _convert_to_dataframe( weights_split[i], sycl_queue=queue, target_df=dataframe ) - result = incbs.partial_fit(X_split_df, weights_split_df) + result = incbs.partial_fit(X_split_df, sample_weight=weights_split_df) else: result = incbs.partial_fit(X_split_df) @@ -233,7 +233,7 @@ def test_fit_multiple_options_on_gold_data(dataframe, queue, weighted, dtype): incbs = IncrementalBasicStatistics(batch_size=1) if weighted: - result = incbs.fit(X_df, weights_df) + result = incbs.fit(X_df, sample_weight=weights_df) else: result = incbs.fit(X_df) @@ -272,7 +272,7 @@ def test_fit_single_option_on_random_data( X = X.astype(dtype=dtype) X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) if weighted: - weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) + weights = gen.uniform(low=-0.5, high=1.0, size=row_count) weights = weights.astype(dtype=dtype) weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) incbs = IncrementalBasicStatistics( @@ -280,7 +280,7 @@ def test_fit_single_option_on_random_data( ) if weighted: - result = incbs.fit(X_df, weights_df) + result = incbs.fit(X_df, sample_weight=weights_df) else: result = incbs.fit(X_df) @@ -311,7 +311,7 @@ def test_partial_fit_multiple_options_on_random_data( X = X.astype(dtype=dtype) X_df = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) if weighted: - weights = gen.uniform(low=-0.5, high=+1.0, size=row_count) + weights = gen.uniform(low=-0.5, high=1.0, size=row_count) weights = weights.astype(dtype=dtype) weights_df = _convert_to_dataframe(weights, sycl_queue=queue, target_df=dataframe) incbs = IncrementalBasicStatistics( @@ -319,7 +319,7 @@ def test_partial_fit_multiple_options_on_random_data( ) if weighted: - result = incbs.fit(X_df, weights_df) + result = incbs.fit(X_df, sample_weight=weights_df) else: result = incbs.fit(X_df) @@ -366,7 +366,7 @@ def test_fit_all_option_on_random_data( incbs = IncrementalBasicStatistics(result_options="all", batch_size=batch_size) if weighted: - result = incbs.fit(X_df, weights_df) + result = incbs.fit(X_df, sample_weight=weights_df) else: result = incbs.fit(X_df)