function for forward elim and backward elim
1 parent bbf5260 · commit a4cfb82 · Showing 2 changed files with 180 additions and 0 deletions.
@@ -0,0 +1,39 @@
import numpy as np
import statsmodels.discrete.discrete_model as sm


def backward_elimination(x, Y, sl, columns):
    """
    :param x: numpy array of training variables
    :param Y: numpy array of the response variable
    :param sl: significance level as a float
    :param columns: list of column names in the same horizontal order as x
    :return: numpy array of selected x, list of selected training variables passing the significance level
    """
    numVars = len(x[0])  # Length of a row = current number of variables
    for i in range(0, numVars):
        # Refit on the current set of variables; x shrinks as columns are
        # deleted, and `columns` is edited in step so the names stay aligned
        regressor_OLS = sm.Logit(Y, x).fit(maxiter=200)
        maxVar = max(regressor_OLS.pvalues).astype(float)
        print('Regression model retrained with ' + str(numVars) + ' variables.')
        print('Max p value for a feature is: ' + str(maxVar))
        # If the max p value exceeds the significance level, delete that column.
        # Because x shrinks as columns are deleted, the only way to be sure the
        # right column is removed is to compare the max p value against the
        # fitted model's pvalues[j]: if they match, the jth column is safe to delete.
        if maxVar > sl:
            print('Max p value > ' + str(sl) + ', feature will be removed.')
            for j in range(0, numVars):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    print(str(j) + 'th column deleted: ' + str(columns[j]))
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
                    # Stop after one deletion: the remaining indices no longer
                    # line up with the fitted model's pvalues
                    break
            numVars -= 1
        else:
            print('All p values are at or below ' + str(sl) + '. Terminating model training')
            print('p values list: ' + str(regressor_OLS.pvalues))
            break

    print(regressor_OLS.summary())
    return x, columns  # Return x data and list of columns
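
A minimal usage sketch follows (not part of the commit): the synthetic arrays, the column names x1/x2/noise, and the 0.05 significance level are illustrative assumptions, not values from this repository.

# Illustrative only: synthetic data, assuming backward_elimination above is in scope
import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=(500, 3))
# The response depends on the first two columns; 'noise' is irrelevant by construction
Y = (x[:, 0] + 2 * x[:, 1] + rng.normal(size=500) > 0).astype(int)

x_selected, cols = backward_elimination(x, Y, sl=0.05, columns=['x1', 'x2', 'noise'])
print(cols)  # 'noise' is typically the column eliminated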
@@ -0,0 +1,141 @@
import numpy as np
import operator
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
import statsmodels.discrete.discrete_model as sm


def forward_selection(df, sig_level, response, removelist, sampling='nil', testratio=0):
    """
    :param df: dataframe with both training and response variables
    :param sig_level: significance level to accept/reject a variable during forward selection
    :param response: name of the response variable in the dataframe
    :param removelist: training variable(s) to remove from the dataframe (str or list of str)
    :param sampling: type of oversampling to use, 'smote', 'naive' or 'nil'; default: no sampling done
    :param testratio: proportion of the dataset to hold out before oversampling, default: 0
    :return: list of selected training variables, actual_vars
    """

    # Accept a single column name as well as a list of names
    if isinstance(removelist, str):
        internallist = [removelist]
    else:
        internallist = removelist
    X = df.drop(internallist, axis=1)
    y = df[response]
    # Get the list of column names
    colnames = list(X.columns.values)
    print(colnames)

    # Start of train-test split and oversampling (if relevant)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
    if sampling.lower() == 'smote':
        print("SMOTE Oversampling selected..")
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        # train_test_split keeps X_test and y_test as pandas objects, while the
        # oversampler converts X_train and y_train to numpy arrays.
        # Convert all to numpy arrays so XGBoost does not hit bugs downstream.
        X_test = X_test.values
        y_test = y_test.values
        print("Number of Xs and Ys for: " + str(sampling.upper()))
        print(sorted(Counter(y_train).items()))
        print("Oversampling is complete!")
    elif sampling.lower() == 'naive':
        print("Naive Oversampling selected..")
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_resample(X_train, y_train)
        # Same conversion as in the SMOTE branch, for consistency
        X_test = X_test.values
        y_test = y_test.values
        print("Number of Xs and Ys for: " + str(sampling.upper()))
        print(sorted(Counter(y_train).items()))
        print("Oversampling is complete!")
    else:
        print("No sampling selected..")
    # The selection loop below indexes X_train as a 2-D numpy array, so convert
    # here regardless of which branch ran (recent imblearn versions can return
    # pandas objects from fit_resample)
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # Total features to select = k
    # In each iteration, the current set of n features is concatenated with one new feature not yet in the set,
    # and the combination is fitted with the logistic regression.
    # Each candidate model (current features + one new feature) is scored by its highest p value (its worst feature),
    # and the highest p values of all k - n candidate additions are collected in a dictionary.
    # Next, the lowest of those p values, belonging to the best candidate for n + 1 features, is evaluated
    # against the significance level: if even the best candidate has a feature with p value > sig level,
    # model training stops, because every other candidate is at least as bad and selection can terminate.
    # Otherwise, the iteration repeats with n + 1 features and k - n - 1 candidates remaining.

    maxcolsnum = X_train.shape[1]
    full_x = np.array(False)  # placeholder; .any() stays False until a column is added
    allowed_nums = {}
    for i in range(maxcolsnum):
        allowed_nums[i] = True
    actual_nums = []
    actual_vars = []
    terminate_early = False
    for i in range(maxcolsnum):
        # Reset the flag and the p-value dictionary
        terminate_early = False
        pval_list = {}
        for j in range(maxcolsnum):
            if allowed_nums[j]:
                # Reshape to a single column instead of a flat array so concatenation works properly
                jth_x = X_train[:, j].reshape(-1, 1)
                if full_x.any():
                    iter_x = np.concatenate((full_x, jth_x), axis=1)
                else:
                    iter_x = jth_x
                regressor_OLS = sm.Logit(y_train, iter_x).fit(disp=0)
                pval_list[j] = max(regressor_OLS.pvalues)
                # Special case: if every feature in the candidate model has a p value of 0,
                # use this variable directly for training
                if max(regressor_OLS.pvalues) == 0:
                    if full_x.any():
                        full_x = np.concatenate((full_x, jth_x), axis=1)
                        allowed_nums[j] = False
                        actual_nums.append(j)
                        print("Features all have p value of 0, using feature: [" + str(colnames[j]) + "]")
                    else:
                        full_x = jth_x
                        allowed_nums[j] = False
                        actual_nums.append(j)
                        print("First model trained using feature: [" + str(colnames[j]) + "] with p value of 0")
                    terminate_early = True
                    break
        if not terminate_early:
            # Pick the candidate whose worst (max) p value is the lowest
            print("Building new model with lowest p-values with " + str(len(actual_nums)) + " variables.")
            best_pval_col = min(pval_list.items(), key=operator.itemgetter(1))[0]
            best_pval = pval_list[best_pval_col]
            # Reshape to a single column instead of a flat array so concatenation works properly
            jth_x = X_train[:, best_pval_col].reshape(-1, 1)
            if best_pval < sig_level:
                if full_x.any():
                    full_x = np.concatenate((full_x, jth_x), axis=1)
                    allowed_nums[best_pval_col] = False
                    actual_nums.append(best_pval_col)
                    print("New model trained using feature: [" + str(
                        colnames[best_pval_col]) + "] with lowest p value of " + str(best_pval))
                else:
                    full_x = jth_x
                    allowed_nums[best_pval_col] = False
                    actual_nums.append(best_pval_col)
                    print("First model trained using feature: [" + str(
                        colnames[best_pval_col]) + "] with lowest p value of " + str(best_pval))
            else:
                print("TERMINATING: best candidate feature [" + str(
                    colnames[best_pval_col]) + "] has p value " + str(
                    best_pval) + " above significance level " + str(sig_level))
                break

    for k in actual_nums:
        actual_vars.append(colnames[k])
    print('Final variables selected:')
    print(actual_vars)

    return actual_vars
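
A matching usage sketch (again, not part of the commit): the toy DataFrame, its column names, and the parameter values below are assumptions for illustration. Note that removelist must also include the response column, since everything left in df after the drop is treated as a training variable.

# Illustrative only: toy imbalanced data, assuming forward_selection above is in scope
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
df = pd.DataFrame({
    'age': rng.normal(40, 10, n),
    'income': rng.normal(50, 15, n),
    'noise': rng.normal(size=n),
    'row_id': np.arange(n),  # bookkeeping column, not a predictor
})
df['default'] = (0.05 * df['age'] + 0.03 * df['income'] + rng.normal(size=n) > 4).astype(int)

# Drop both the bookkeeping column and the response itself from the training variables
selected = forward_selection(df, sig_level=0.05, response='default',
                             removelist=['row_id', 'default'],
                             sampling='smote', testratio=0.2)
print(selected)  # e.g. ['age', 'income'] if 'noise' fails the significance test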