Merge pull request #17 from achamma723/main

[FIX]: Fix tests and update bbi
Parietal-INRIA · Jun 2, 2024 · 40ac401 · 40ac401
2 parents d991917 + 33bd71b
commit 40ac401
Show file tree

Hide file tree

Showing 10 changed files with 243 additions and 73 deletions.
diff --git a/doc/conf.py b/doc/conf.py
@@ -220,9 +220,6 @@
     pyvista.OFF_SCREEN = False
 except Exception:
     pass
-else:
-    brain_scraper = mne.viz._brain._BrainScraper()
-    scrapers += (brain_scraper, 'pyvista')
 if any(x in scrapers for x in ('pyvista')):
     from traits.api import push_exception_handler
     push_exception_handler(reraise_exceptions=True)

diff --git a/doc/index.rst b/doc/index.rst
@@ -116,6 +116,18 @@ Application to source localization (MEG/EEG data):
   desparsified multi-task Lasso. In Proceedings of the 34th Conference on
   Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada.
 
+Single/Group statistically validated importance using conditional permutations:
+
+* Chamma, A., Thirion, B., & Engemann, D. (2024). **Variable importance in
+  high-dimensional settings requires grouping**. In Proceedings of the 38th
+  Conference of the Association for the Advancement of Artificial
+  Intelligence (AAAI 2024), Vancouver, Canada.
+
+* Chamma, A., Engemann, D., & Thirion, B. (2023). **Statistically Valid Variable
+  Importance Assessment through Conditional Permutations**. In Proceedings of the
+  37th Conference on Neural Information Processing Systems (NeurIPS 2023), New
+  Orleans, USA.
+
 If you use our packages, we would appreciate citations to the relevant
 aforementioned papers.
 

diff --git a/examples/plot_diabetesFeatures_importance_example.py b/examples/plot_diabetesFeatures_importance_example.py
@@ -0,0 +1,101 @@
+"""
+Variable Importance on diabetes dataset
+=======================================
+
+This example compares the standard permutation approach for variable importance
+and its conditional variant on the diabetes dataset for the single-level case.
+"""
+
+#############################################################################
+# Imports needed for this script
+# ------------------------------
+
+import numpy as np
+from hidimstat.BBI import BlockBasedImportance
+from sklearn.datasets import load_diabetes
+import matplotlib.pyplot as plt
+plt.rcParams.update({'font.size': 14})
+
+# Fixing the random seed: it is passed to both estimators below so that the
+# two runs are reproducible
+seed = 2024
+
+# Loading the data: design matrix X and target y
+diabetes = load_diabetes()
+X, y = diabetes.data, diabetes.target
+n_features = len(diabetes.feature_names)
+
+# Number of cross-validation folds used with the provided learner
+# (0 disables the cross-validation)
+k_fold = 2
+# Identifying the categorical (nominal & ordinal) variables
+list_nominal = {}
+# Offset added to the p-values before taking -log10 so that a p-value of 0
+# stays finite; both methods share it so that the bars remain comparable
+pval_eps = 1e-10
+
+# Settings shared by the two estimators: only `conditional` differs between
+# the standard and the conditional permutation schemes
+common_params = dict(
+    estimator='RF',
+    importance_estimator="Mod_RF",
+    do_hyper=True,
+    dict_hyper=None,
+    group_stacking=False,
+    prob_type="regression",
+    k_fold=k_fold,
+    list_nominal=list_nominal,
+    n_jobs=10,
+    verbose=0,
+    n_perm=100,
+    random_state=seed,
+)
+
+#############################################################################
+# Standard Variable Importance
+# ----------------------------
+
+bbi_perm = BlockBasedImportance(conditional=False, **common_params)
+bbi_perm.fit(X, y)
+print("Computing the importance scores with standard permutation")
+results_perm = bbi_perm.compute_importance()
+pvals_perm = -np.log10(results_perm["pval"] + pval_eps)
+
+#############################################################################
+# Conditional Variable Importance
+# -------------------------------
+
+bbi_cond = BlockBasedImportance(conditional=True, **common_params)
+bbi_cond.fit(X, y)
+print("Computing the importance scores with conditional permutation")
+results_cond = bbi_cond.compute_importance()
+pvals_cond = -np.log10(results_cond["pval"] + pval_eps)
+
+#############################################################################
+# Plotting the comparison
+# -----------------------
+
+# One bar series per method, holding the -log10(p-value) of each feature
+list_res = {
+    'Perm': [pvals_perm[ind_el][0] for ind_el in range(n_features)],
+    'Cond': [pvals_cond[ind_el][0] for ind_el in range(n_features)],
+}
+
+x = np.arange(n_features)
+width = 0.25  # the width of the bars
+fig, ax = plt.subplots(figsize=(5, 5), layout='constrained')
+
+# Shift each method's bars by one bar width so that they sit side by side
+for multiplier, (attribute, measurement) in enumerate(list_res.items()):
+    ax.bar(x + width * multiplier, measurement, width, label=attribute)
+
+ax.set_ylabel(r'$-log_{10}p_{val}$')
+# Centre each tick label between the two bars of a feature
+ax.set_xticks(x + width/2, diabetes.feature_names)
+ax.legend(loc='upper left', ncols=2)
+# Bars taller than 3 are clipped by the axis limit
+ax.set_ylim(0, 3)
+# Reference line at p = 0.05: bars above it have a p-value below 0.05
+ax.axhline(y=-np.log10(0.05), color='r', linestyle='-')
+
+plt.show()
diff --git a/examples/plot_meg_data_example.py → ...es_not_exhibited/plot_meg_data_example.py b/examples/plot_meg_data_example.py → ...es_not_exhibited/plot_meg_data_example.py
@@ -100,7 +100,7 @@ def _load_somato(cond):
     # Get data paths
     data_path = somato.data_path()
     subject = '01'
-    subjects_dir = data_path + '/derivatives/freesurfer/subjects'
+    subjects_dir = data_path / 'derivatives' / 'freesurfer' / 'subjects'
     raw_fname = os.path.join(data_path, f'sub-{subject}', 'meg',
                              f'sub-{subject}_task-{cond}_meg.fif')
     fwd_fname = os.path.join(data_path, 'derivatives', f'sub-{subject}',

diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py
@@ -16,7 +16,7 @@
     roc_auc_score,
     r2_score,
 )
-from sklearn.model_selection import KFold
+from sklearn.model_selection import KFold, GroupKFold
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.utils.validation import check_is_fitted
@@ -79,11 +79,12 @@ class BlockBasedImportance(BaseEstimator, TransformerMixin):
         Fixing the seeds of the random generator.
     com_imp: boolean, default=True
         Compute or not the importance scores.
+    group_fold: list, default=None
+        The list of group labels (one per sample) to perform GroupKFold
     Attributes
     ----------
     ToDO
     """
-
     def __init__(
         self,
         estimator=None,
@@ -102,11 +103,13 @@ def __init__(
         verbose=0,
         groups=None,
         group_stacking=False,
+        sub_groups=None,
         k_fold=2,
         prop_out_subLayers=0,
         index_i=None,
         random_state=2023,
         com_imp=True,
+        group_fold=None,
     ):
         self.estimator = estimator
         self.importance_estimator = importance_estimator
@@ -123,6 +126,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.verbose = verbose
         self.groups = groups
+        self.sub_groups = sub_groups
         self.group_stacking = group_stacking
         self.k_fold = k_fold
         self.prop_out_subLayers = prop_out_subLayers
@@ -139,6 +143,7 @@ def __init__(
         self.scaler_x = [None] * max(self.k_fold, 1)
         self.scaler_y = [None] * max(self.k_fold, 1)
         self.com_imp = com_imp
+        self.group_fold = group_fold
         # Check for applying the stacking approach with the RidgeCV estimator
         self.apply_ridge = False
         # Check for the case of a coffeine transformer with provided groups
@@ -221,14 +226,14 @@ def fit(self, X, y=None):
         # number of variables provided
         list_count = [item for sublist in self.groups for item in sublist]
         if self.coffeine_transformer is None:
-            if len(list_count) != X.shape[1]:
+            if len(set(list_count)) != X.shape[1]:
                 raise Exception("The provided groups are missing some variables!")
         else:
             if self.transformer_grp:
-                if len(list_count) != (X.shape[1] * self.coffeine_transformer[1]):
+                if len(set(list_count)) != (X.shape[1] * self.coffeine_transformer[1]):
                     raise Exception("The provided groups are missing some variables!")
             else:
-                if len(list_count) != X.shape[1]:
+                if len(set(list_count)) != X.shape[1]:
                     raise Exception("The provided groups are missing some variables!")
 
         # Check if categorical variables exist within the columns of the design
@@ -319,6 +324,7 @@ def fit(self, X, y=None):
                         current_grp += self.dict_cont[i]
                 self.list_grps.append(current_grp)
 
+            # To check
             if len(self.coffeine_transformers) == 1:
                 X = self.coffeine_transformers[0].fit_transform(
                     pd.DataFrame(X, columns=self.X_cols), np.ravel(y))
@@ -406,12 +412,18 @@ def fit(self, X, y=None):
 
         if self.k_fold != 0:
             # Implementing k-fold cross validation as the default behavior
-            kf = KFold(
-                n_splits=self.k_fold,
-                random_state=self.random_state,
-                shuffle=True,
-            )
-            for ind_fold, (train_index, test_index) in enumerate(kf.split(X)):
+            if self.group_fold is not None and len(self.group_fold) > 0:
+                kf = GroupKFold(n_splits=self.k_fold)
+                list_splits = kf.split(X, y, self.group_fold)
+            else:
+                kf = KFold(
+                    n_splits=self.k_fold,
+                    random_state=self.random_state,
+                    shuffle=True,
+                )
+                list_splits = kf.split(X)
+
+            for ind_fold, (train_index, test_index) in enumerate(list_splits):
                 print(f"Processing: {ind_fold+1}")
                 X_fold = X.copy()
                 y_fold = y.copy()
@@ -697,11 +709,12 @@ def compute_importance(self, X=None, y=None):
         else:
             if self.coffeine_transformer is not None:
                 X = self.coffeine_transformers[0].transform(pd.DataFrame(X, columns=self.X_cols))
-                # Variables are provided as the third element of the
-                # coffeine transformer parameter
-                if len(self.coffeine_transformer) > 2:
-                    X = X[:, self.coffeine_transformer[2]]
-                    self.list_cont = np.arange(len(self.coffeine_transformer[2]))
+                if not self.transformer_grp:
+                    # Variables are provided as the third element of the
+                    # coffeine transformer parameter
+                    if len(self.coffeine_transformer) > 2:
+                        X = X[:, self.coffeine_transformer[2]]
+                        self.list_cont = np.arange(len(self.coffeine_transformer[2]))
             # Perform stacking if enabled
             if self.apply_ridge:
                 X_prev = X.copy()
@@ -773,6 +786,7 @@ def compute_importance(self, X=None, y=None):
                                     index_i=ind_fold + 1,
                                     group_stacking=self.group_stacking,
                                     random_state=list_seeds_imp[perm],
+                                    verbose=self.verbose,
                                 )
                                 for p_col in range(len(self.list_cols))
                                 for perm in range(self.n_perm)
@@ -812,9 +826,11 @@ def compute_importance(self, X=None, y=None):
                                     proc_col=p_col,
                                     index_i=ind_fold + 1,
                                     group_stacking=self.group_stacking,
+                                    sub_groups=[self.list_cols, self.sub_groups],
                                     list_seeds=list_seeds_imp,
                                     Perm=self.Perm,
                                     output_dim=output_dim,
+                                    verbose=self.verbose,
                                 )
                                 for p_col in range(len(self.list_cols))
                             )