#43 - Reproducibility #45

Draft · wants to merge 24 commits into base: main

Commits (changes shown from 20 of 24 commits)
3ae7f24
Git - added to gitignore folder for testing reproducibility
jtimko16 Jul 22, 2024
3fb56dc
Mod - modified gitignore
jtimko16 Jul 22, 2024
11d388d
Gitignore - added folder autofeat_reproducibility
jtimko16 Jul 22, 2024
24c2c20
Add - Random seeds
jtimko16 Jul 22, 2024
c1821d2
Mod - change list to sorted (avoid randomness)
jtimko16 Jul 23, 2024
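This commit addresses a subtle source of nondeterminism: iterating over a Python set of strings yields a different order in each interpreter session because of hash randomization, so even a seeded np.random.permutation of that iteration order is not reproducible across runs. A minimal sketch of the problem and the fix, with made-up feature names for illustration:

import numpy as np

cols = {"x1*x2", "log(x1)", "exp(x3)"}  # hypothetical engineered feature names

# Not reproducible: list(cols) depends on set iteration order, which varies
# between interpreter sessions, so the seeded shuffle sees different input.
np.random.seed(42)
order_a = list(np.random.permutation(list(cols)))

# Reproducible: sorted() pins the input order before the seeded shuffle.
np.random.seed(42)
order_b = list(np.random.permutation(sorted(cols)))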
ec9457a
Mod - fix the Parallel function
jtimko16 Jul 23, 2024
3cae5d2
Mod - fix reproduciblity when sorting columns
jtimko16 Jul 25, 2024
5727461
Mod - Random seed added to definition of run_select_features
jtimko16 Jul 25, 2024
bbbfa7e
Mod - make consistent another seed
jtimko16 Jul 25, 2024
dcdfec0
Add - added random seed to _noise_fintering
jtimko16 Jul 25, 2024
2a9ea60
Clean - remove extra print statements
jtimko16 Jul 25, 2024
1b9b7da
Merge pull request #1 from jtimko16/43-reprod-issue
jtimko16 Jul 25, 2024
306eacf
Format - run RUFF formatting on featset
jtimko16 Aug 5, 2024
77336c5
Mod - added separate cross validation before fitting models
jtimko16 Aug 5, 2024
1e8e69f
Rem - removed extra random seed
jtimko16 Aug 5, 2024
b2f6c7a
Mod - solve the seed within 1run of select features
jtimko16 Aug 5, 2024
ea1f742
Mod - solved the random seed generator
jtimko16 Aug 5, 2024
73b8381
Typing - fixed typing hint of random_seed
jtimko16 Aug 5, 2024
0a02690
Mod - removed extra randomness in selecting columns
jtimko16 Aug 5, 2024
9812d86
Merge pull request #2 from jtimko16/43-reprod-issue
jtimko16 Aug 5, 2024
eff9428
Mod - using KFold with all CV models; move random_seed_generator to u…
jtimko16 Aug 6, 2024
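Passing an integer like cv=5 delegates fold construction to scikit-learn's defaults; building an explicit KFold with shuffle=True and a fixed random_state ties the fold assignment to the caller's seed instead, so the shuffled folds are identical on every run. A minimal sketch of the pattern this commit applies to the CV models, on synthetic data:

import numpy as np
import sklearn.linear_model as lm
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = X @ rng.normal(size=5) + 0.1 * rng.normal(size=100)

# the same folds are produced on every call, so repeated fits follow the same path
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = lm.LassoLarsCV(cv=kf, eps=1e-8).fit(X, y)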
0c87f7d
Merge pull request #3 from jtimko16/43-reprod-issue
jtimko16 Aug 6, 2024
de21a01
Mod - replaced custom function by np.random.default_rng(); fixed the …
jtimko16 Aug 8, 2024
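The final commits swap the hand-rolled seed generator for NumPy's Generator API. A minimal sketch of deriving independent per-run seeds from one master seed with np.random.default_rng(); the variable names are illustrative, not the exact code in the diff:

import numpy as np

master_seed = 42
rng = np.random.default_rng(master_seed)

# one reproducible seed per feature-selection run
n_runs = 5
run_seeds = rng.integers(0, 2**32 - 1, size=n_runs)

# each run can then seed its own generator, which stays valid
# when joblib dispatches the runs to separate worker processes
run_rngs = [np.random.default_rng(int(s)) for s in run_seeds]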
8d0b566
Merge pull request #4 from jtimko16/43-reprod-issue
jtimko16 Aug 8, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -21,3 +21,4 @@ pyrightconfig.json
poetry.lock
notebooks/profile_autofeat.py
notebooks/newtons_law_of_cooling.ipynb
+notebooks/autofeat_reproducibility/*
75 changes: 56 additions & 19 deletions src/autofeat/featsel.py
@@ -12,6 +12,7 @@
import sklearn.linear_model as lm
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
+from sklearn.model_selection import KFold
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from autofeat.nb_utils import nb_standard_scale
@@ -43,6 +44,7 @@ def _noise_filtering(
    target: np.ndarray,
    good_cols: list | None = None,
    problem_type: str = "regression",
+   random_seed: int | None = None,
) -> list:
    """
    Trains a prediction model with additional noise features and selects only those of the
@@ -62,14 +64,16 @@
        good_cols = list(range(n_feat))
    assert len(good_cols) == n_feat, "fewer column names provided than features in X."
    # perform noise filtering on these features
+   kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    if problem_type == "regression":
-       model = lm.LassoLarsCV(cv=5, eps=1e-8)
+       model = lm.LassoLarsCV(cv=kf, eps=1e-8)
    elif problem_type == "classification":
-       model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced")
+       model = lm.LogisticRegressionCV(cv=kf, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed)
    else:
        logging.warning(f"[featsel] Unknown problem_type {problem_type} - not performing noise filtering.")
        model = None
    if model is not None:
+       np.random.seed(random_seed)  # Set seed for noise feature addition and permutation
        X = _add_noise_features(X)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
@@ -89,7 +93,9 @@
    return good_cols


-def _select_features_1run(df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0) -> list:
+def _select_features_1run(
+   df: pd.DataFrame, target: np.ndarray, problem_type: str = "regression", verbose: int = 0, random_seed: int | None = None
+) -> list:
    """
    One feature selection run.

@@ -105,6 +111,11 @@
    """
    if df.shape[0] <= 1:
        raise ValueError(f"n_samples = {df.shape[0]}")

+   # Set random seed
+   if random_seed is not None:
+       np.random.seed(random_seed)
+
    # initial selection of too few but (hopefully) relevant features
    if problem_type == "regression":
        model = lm.LassoLarsCV(cv=5, eps=1e-8)
@@ -128,15 +139,19 @@
    # weight threshold: select at most 0.2*n_train initial features
    thr = sorted(coefs, reverse=True)[min(df.shape[1] - 1, df.shape[0] // 5)]
    initial_cols = list(df.columns[coefs > thr])

    # noise filter
-   initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type)
+   initial_cols = _noise_filtering(df[initial_cols].to_numpy(), target, initial_cols, problem_type, random_seed=random_seed)
    good_cols_set = set(initial_cols)
    if verbose > 0:
        logging.info(f"[featsel]\t {len(initial_cols)} initial features.")

    # add noise features
    X_w_noise = _add_noise_features(df[initial_cols].to_numpy())

    # go through all remaining features in splits of n_feat <= 0.5*n_train
-   other_cols = list(np.random.permutation(list(set(df.columns).difference(initial_cols))))
+   other_cols = list(np.random.permutation(sorted(set(df.columns).difference(initial_cols))))
    if other_cols:
        n_splits = int(np.ceil(len(other_cols) / max(10, 0.5 * df.shape[0] - len(initial_cols))))
        split_size = int(np.ceil(len(other_cols) / n_splits))
@@ -146,7 +161,9 @@
            if problem_type == "regression":
                model = lm.LassoLarsCV(cv=5, eps=1e-8)
            else:
-               model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced")
+               model = lm.LogisticRegressionCV(
+                   cv=5, penalty="l1", solver="saga", class_weight="balanced", random_state=random_seed
+               )
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # TODO: remove if sklearn least_angle issue is fixed
@@ -160,9 +177,11 @@
            # for classification, model.coef_ is n_classes x n_features, but we need n_features
            coefs = np.abs(model.coef_) if problem_type == "regression" else np.max(np.abs(model.coef_), axis=0)
            weights = dict(zip(current_cols, coefs[: len(current_cols)]))

            # only include features that are more important than our known noise features
            noise_w_thr = np.max(coefs[len(current_cols) :])
            good_cols_set.update([c for c in weights if abs(weights[c]) > noise_w_thr])

            if verbose > 0:
                print(
                    f"[featsel]\t Split {i + 1:2}/{n_splits}: {len(good_cols_set):3} candidate features identified.",
@@ -184,6 +203,7 @@ def select_features(
    problem_type: str = "regression",
    n_jobs: int = 1,
    verbose: int = 0,
+   random_seed: int | None = None,
) -> list:
    """
    Selects predictive features given the data and targets.
@@ -201,6 +221,10 @@
    Returns:
        - good_cols: list of column names for df with which a regression model can be trained
    """
+   # Set random seed
+   if random_seed is not None:
+       np.random.seed(random_seed)
+
    if not (len(df) == len(target)):
        raise ValueError("[featsel] df and target dimension mismatch.")
    if keep is None:
@@ -223,36 +247,53 @@

    # select good features in k runs in parallel
    # by doing sort of a cross-validation (i.e., randomly subsample data points)
-   def run_select_features(i: int):
+   def run_select_features(i: int, random_seed: int | None):
        if verbose > 0:
            logging.info(f"[featsel] Feature selection run {i + 1}/{featsel_runs}")
-       np.random.seed(i)
+       # derive a per-run seed: including i keeps the runs distinct, while a fixed
+       # random_seed keeps the whole sequence globally reproducible
+       np.random.seed(random_seed + i if random_seed is not None else None)
+       loop_seed = np.random.randint(10**6)
+       seed = random_seed + loop_seed if random_seed is not None else loop_seed
        rand_idx = np.random.permutation(df_scaled.index)[: max(10, int(0.85 * len(df_scaled)))]
-       return _select_features_1run(df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1)
+       return _select_features_1run(
+           df_scaled.iloc[rand_idx], target_scaled[rand_idx], problem_type, verbose=verbose - 1, random_seed=seed
+       )

    if featsel_runs >= 1 and problem_type in ("regression", "classification"):
        if n_jobs == 1 or featsel_runs == 1:
            # only use parallelization code if you actually parallelize
            selected_columns = []
            for i in range(featsel_runs):
-               selected_columns.extend(run_select_features(i))
+               selected_columns.extend(run_select_features(i, random_seed))

        else:
+           # generate one seed per run; materialize them into a list so the
+           # generator expression below can index into them
+           def random_seed_generator(low=0, high=2**32 - 1):
+               while True:
+                   yield np.random.randint(low, high)
+
+           seed_gen = random_seed_generator()
+           seeds = [next(seed_gen) for _ in range(featsel_runs)]
+
            def flatten_lists(l: list):
                return [item for sublist in l for item in sublist]

            selected_columns = flatten_lists(
-               Parallel(n_jobs=n_jobs, verbose=100 * verbose)(delayed(run_select_features)(i) for i in range(featsel_runs)),
+               Parallel(n_jobs=n_jobs, verbose=100 * verbose)(
+                   delayed(run_select_features)(i, seeds[i]) for i in range(featsel_runs)
+               )
            )

    if selected_columns:
        selected_columns_counter = Counter(selected_columns)
-       # sort by frequency, but down weight longer formulas to break ties
+       # sort by frequency, but down weight longer formulas to break ties and keep
+       # the resulting order reproducible when frequencies are equal
        selected_columns = sorted(
            selected_columns_counter,
            key=lambda x: selected_columns_counter[x] - 0.000001 * len(str(x)),
            reverse=True,
        )

    if verbose > 0:
        logging.info(f"[featsel] {len(selected_columns)} features after {featsel_runs} feature selection runs")
    # correlation filtering
@@ -294,6 +335,7 @@ def __init__(
        keep: list | None = None,
        n_jobs: int = 1,
        verbose: int = 0,
+       random_seed: int | None = None,
    ):
        """
        multi-step cross-validated feature selection
@@ -316,6 +358,7 @@
        self.keep = keep
        self.n_jobs = n_jobs
        self.verbose = verbose
+       self.random_seed = random_seed

    def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame):
        """
@@ -339,13 +382,7 @@ def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame):
        df = pd.DataFrame(X, columns=cols)
        # do the feature selection
        self.good_cols_ = select_features(
-           df,
-           target,
-           self.featsel_runs,
-           self.keep,
-           self.problem_type,
-           self.n_jobs,
-           self.verbose,
+           df, target, self.featsel_runs, self.keep, self.problem_type, self.n_jobs, self.verbose, self.random_seed
        )
        self.n_features_in_ = X.shape[1]
        return self
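With random_seed threaded through the whole pipeline, reproducibility can be verified end to end by running the selection twice with the same seed. A hypothetical usage sketch against this branch (function and argument names as they appear in the diff above; the data set is made up):

import numpy as np
import pandas as pd
from autofeat.featsel import select_features

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"x{i}" for i in range(10)])
target = 2 * df["x0"].to_numpy() + rng.normal(scale=0.1, size=200)

cols_a = select_features(df, target, featsel_runs=5, problem_type="regression", random_seed=42)
cols_b = select_features(df, target, featsel_runs=5, problem_type="regression", random_seed=42)
assert cols_a == cols_b  # same seed, same selected features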