fix cv generator in feature selection classes (#803)

solegalli · web-flow · commit af7189334a6f · 2024-08-27T17:17:45.000+02:00
* fix cv generator in feature selection classes

* add tests

* fix error in shuffle features

* fix style
diff --git a/feature_engine/selection/shuffle_features.py b/feature_engine/selection/shuffle_features.py
@@ -1,3 +1,4 @@
+from types import GeneratorType
 from typing import List, MutableSequence, Union
 
 import numpy as np
@@ -221,12 +222,14 @@ def fit(
         # check that there are more than 1 variable to select from
         self._check_variable_number()
 
+        cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv
+
         # train model with all features and cross-validation
         model = cross_validate(
             self.estimator,
             X[self.variables_],
             y,
-            cv=self.cv,
+            cv=cv,
             return_estimator=True,
             scoring=self.scoring,
             params={"sample_weight": sample_weight},
@@ -236,7 +239,7 @@ def fit(
         self.initial_model_performance_ = model["test_score"].mean()
 
         # extract the validation folds
-        cv_ = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))
+        cv_ = check_cv(cv, y=y, classifier=is_classifier(self.estimator))
         validation_indices = [val_index for _, val_index in cv_.split(X, y)]
 
         # get performance metric
diff --git a/feature_engine/selection/smart_correlation_selection.py b/feature_engine/selection/smart_correlation_selection.py
@@ -1,3 +1,4 @@
+from types import GeneratorType
 from typing import List, Union
 
 import pandas as pd
@@ -317,13 +318,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None):
         # select best performing feature according to estimator
         if self.selection_method == "model_performance":
             correlated_dict = dict()
+            cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv
             for feature_group in correlated_groups:
                 feature_performance, _ = single_feature_performance(
                     X,
                     y,
                     feature_group,
                     self.estimator,
-                    self.cv,
+                    cv,
                     self.scoring,
                 )
                 # get most important feature
diff --git a/feature_engine/selection/target_mean_selection.py b/feature_engine/selection/target_mean_selection.py
@@ -1,3 +1,4 @@
+from types import GeneratorType
 from typing import List, Union
 
 import pandas as pd
@@ -299,6 +300,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
 
         self.feature_performance_ = {}
 
+        cv = list(self.cv) if isinstance(self.cv, GeneratorType) else self.cv
+
         for variable in self.variables_:
             # clone estimator
             estimator = clone(est)
@@ -310,7 +313,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
                 estimator,
                 X,
                 y,
-                cv=self.cv,
+                cv=cv,
                 scoring=self.scoring,
             )
 
diff --git a/tests/test_selection/test_shuffle_features.py b/tests/test_selection/test_shuffle_features.py
@@ -1,8 +1,10 @@
 import numpy as np
 import pandas as pd
 import pytest
+
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import StratifiedKFold
 from sklearn.tree import DecisionTreeRegressor
 
 from feature_engine.selection import SelectByShuffling
@@ -93,6 +95,42 @@ def test_regression_cv_2_and_mse(load_diabetes_dataset):
     pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
 
 
+def test_cv_generator(df_test):
+    """Integer, splitter and generator cv must select the same features."""
+    X, y = df_test
+    cv = StratifiedKFold(n_splits=3)
+
+    sel = SelectByShuffling(
+        RandomForestClassifier(random_state=1),
+        threshold=0.01,
+        random_state=1,
+        cv=3,
+    )
+    sel.fit(X, y)
+
+    # baseline with an integer cv: only var_7 is expected to be selected
+    Xtransformed = pd.DataFrame(X["var_7"].copy())
+    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
+    # cv given as a splitter object
+    sel = SelectByShuffling(
+        RandomForestClassifier(random_state=1),
+        threshold=0.01,
+        random_state=1,
+        cv=cv,
+    )
+    sel.fit(X, y)
+    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
+    # cv given as the generator returned by split()
+    sel = SelectByShuffling(
+        RandomForestClassifier(random_state=1),
+        threshold=0.01,
+        random_state=1,
+        cv=cv.split(X, y),
+    )
+    sel.fit(X, y)
+    pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
+
+
 def test_raises_threshold_error():
     with pytest.raises(ValueError):
         SelectByShuffling(RandomForestClassifier(random_state=1), threshold="hello")
diff --git a/tests/test_selection/test_smart_correlation_selection.py b/tests/test_selection/test_smart_correlation_selection.py
@@ -3,6 +3,7 @@
 import pytest
 from sklearn.datasets import make_classification
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import KFold
 
 from feature_engine.selection import SmartCorrelatedSelection
 from tests.estimator_checks.init_params_allowed_values_checks import (
@@ -213,6 +214,27 @@ def test_model_performance_2_correlated_groups(df_test):
     pd.testing.assert_frame_equal(Xt, df)
 
 
+def test_cv_generator(df_single):
+    """A generator of CV splits works with model-performance selection."""
+    X, y = df_single
+
+    selector = SmartCorrelatedSelection(
+        variables=None,
+        method="pearson",
+        threshold=0.8,
+        missing_values="raise",
+        selection_method="model_performance",
+        estimator=RandomForestClassifier(n_estimators=10, random_state=1),
+        scoring="roc_auc",
+        cv=KFold(3).split(X, y),
+    )
+
+    X_selected = selector.fit_transform(X, y)
+
+    expected = X[["var_0", "var_2", "var_3", "var_4", "var_5"]].copy()
+    pd.testing.assert_frame_equal(X_selected, expected)
+
+
 def test_error_if_select_model_performance_and_y_is_none(df_single):
     X, y = df_single
 
diff --git a/tests/test_selection/test_target_mean_selection.py b/tests/test_selection/test_target_mean_selection.py
@@ -1,6 +1,8 @@
 import pandas as pd
 import pytest
 
+from sklearn.model_selection import StratifiedKFold
+
 from feature_engine.selection import SelectByTargetMeanPerformance
 
 
@@ -161,6 +163,21 @@ def test_regression():
     pd.testing.assert_frame_equal(sel.transform(X), Xtransformed)
 
 
+def test_cv_generator():
+    """A generator of CV splits is accepted as the cv argument."""
+    X, y = df_classification()
+    selector = SelectByTargetMeanPerformance(
+        variables=None,
+        scoring="accuracy",
+        threshold=None,
+        bins=2,
+        strategy="equal_width",
+        cv=StratifiedKFold(n_splits=2).split(X, y),
+    )
+    selector.fit(X, y)
+    pd.testing.assert_frame_equal(selector.transform(X), X[["cat_var_A", "num_var_A"]])
+
+
 def test_error_wrong_params():
     with pytest.raises(ValueError):
         SelectByTargetMeanPerformance(scoring="mean_squared")