From a4cfb824b6fd71e7fe78eb1a1a0f8f0d8775fc61 Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 19 May 2020 18:08:42 +0800
Subject: [PATCH] Add functions for forward and backward elimination

---
 backward_elim_binary.py |  40 +++++++++++
 forward_elim_binary.py  | 144 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 184 insertions(+)
 create mode 100644 backward_elim_binary.py
 create mode 100644 forward_elim_binary.py

diff --git a/backward_elim_binary.py b/backward_elim_binary.py
new file mode 100644
index 0000000..e03bcd6
--- /dev/null
+++ b/backward_elim_binary.py
@@ -0,0 +1,40 @@
+import numpy as np
+import statsmodels.discrete.discrete_model as sm
+
+
+def backward_elimination(x, Y, sl, columns):
+    """
+    :param x: numpy array of training variables
+    :param Y: numpy array of the response variable
+    :param sl: significance level as a float
+    :param columns: list of column names in the same horizontal order as x
+    :return: numpy array of selected x, list of selected training variables passing the sig level
+    """
+    numVars = len(x[0])  # Length of a row gives the number of variables
+    for i in range(0, numVars):
+        regressor_OLS = sm.Logit(Y, x).fit(maxiter=200)
+        # Fit the current set of variables in x against the response Y
+        # As the loop goes on, x shrinks as columns are deleted, and the columns list tracking names is updated as well
+        maxVar = max(regressor_OLS.pvalues).astype(float)
+        print('Regression model retrained with ' + str(numVars) + ' variables.')
+        print('Max p value for a feature is: ' + str(maxVar))
+        # Get the max p value; if it is above the sig level, start deleting the jth column
+        # Since columns get deleted and x shrinks, the list tracking column names must be updated as well
+        # Hence the only way to ensure the right column is deleted
+        # is to check the max p value against the current pvalues[j] of the regression model;
+        # if they are the same, the jth column is safe to delete
+        if maxVar > sl:
+            print('Max p value > ' + str(sl) + ', feature will be removed.')
+            for j in range(0, numVars):
+                if regressor_OLS.pvalues[j].astype(float) == maxVar:
+                    print(str(j) + 'th column deleted: ' + str(columns[j]))
+                    x = np.delete(x, j, 1)
+                    columns = np.delete(columns, j)
+                    numVars -= 1
+        else:
+            print('All p values are below ' + str(sl) + '. Terminating model training')
+            print('p values list: ' + str(regressor_OLS.pvalues))
+            break
+
+    print(regressor_OLS.summary())
+    return x, columns  # Return x data and list of columns
\ No newline at end of file
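
For reference, a minimal usage sketch for backward_elimination (not part of the patch): the data, coefficients, and column names below are synthetic stand-ins, and it assumes backward_elim_binary.py is importable from the working directory.

import numpy as np
from backward_elim_binary import backward_elimination

rng = np.random.default_rng(42)
n = 500
X = rng.normal(size=(n, 3))                     # three candidate features
logits = 2.0 * X[:, 0] - 1.5 * X[:, 1]          # only the first two features drive the response
y = (rng.uniform(size=n) < 1.0 / (1.0 + np.exp(-logits))).astype(int)

# Hypothetical column names, ordered to match the columns of X
cols = ['feat_a', 'feat_b', 'feat_c']
X_kept, cols_kept = backward_elimination(X, y, sl=0.05, columns=cols)
print(cols_kept)  # likely keeps the two informative features
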
diff --git a/forward_elim_binary.py b/forward_elim_binary.py
new file mode 100644
index 0000000..1535d70
--- /dev/null
+++ b/forward_elim_binary.py
@@ -0,0 +1,144 @@
+import numpy as np
+import pandas as pd
+import operator
+from collections import Counter
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.model_selection import train_test_split
+from imblearn.over_sampling import SMOTE
+from imblearn.over_sampling import ADASYN
+import statsmodels.discrete.discrete_model as sm
+
+
+def forward_selection(df, sig_level, response, removelist, sampling='nil', testratio=0):
+    """
+    :param df: dataframe with both training and response variables
+    :param sig_level: significance level to accept/reject a var during forward selection
+    :param response: name of the response var in the dataframe
+    :param removelist: list of training variables to remove from the dataframe
+    :param sampling: type of oversampling to use, 'smote', 'naive' or 'nil', default: no sampling done
+    :param testratio: proportion of the dataset to hold out before doing oversampling, default: 0
+    :return: list of selected training variables, actual_vars
+    """
+
+    if isinstance(removelist, str):
+        temp_str = removelist
+        internallist = []
+        internallist.append(temp_str)
+    else:
+        internallist = removelist
+    X = df.drop(internallist, axis=1)
+    y = df[response]
+    # Get list of column names
+    colnames = list(X.columns.values)
+    print(colnames)
+
+    # Start of train-test split and oversampling (if relevant)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
+    if sampling.lower() == 'smote':
+        print("SMOTE Oversampling selected..")
+        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
+        # train_test_split keeps X_test and y_test as pandas objects, the oversampler converts X_train, y_train to numpy
+        # Convert all to numpy arrays so XGBoost does not run into bugs
+        X_test = X_test.values
+        y_test = y_test.values
+        print("Number of Xs and Ys for: " + str(sampling.upper()))
+        print(sorted(Counter(y_train).items()))
+        print("Oversampling is complete!")
+    elif sampling.lower() == 'naive':
+        print("Naive Oversampling selected..")
+        ros = RandomOverSampler(random_state=42)
+        X_train, y_train = ros.fit_resample(X_train, y_train)
+        # train_test_split keeps X_test and y_test as pandas objects, the oversampler converts X_train, y_train to numpy
+        # Convert all to numpy arrays so XGBoost does not run into bugs
+        X_test = X_test.values
+        y_test = y_test.values
+        print("Number of Xs and Ys for: " + str(sampling.upper()))
+        print(sorted(Counter(y_train).items()))
+        print("Oversampling is complete!")
+    else:
+        print("No sampling selected..")
+    # Ensure numpy arrays regardless of the branch taken, since the selection loop below uses positional indexing
+    X_train = np.asarray(X_train)
+    y_train = np.asarray(y_train)
+
+    # Total features to select = k
+    # In each iteration, the current set of n features is concatenated with one new feature not yet in the set
+    # The combination is then fitted with the logistic regression
+    # Each candidate model (current features + one new feature) is scored by its highest p value (its worst feature)
+    # The highest p values of all candidate additions to the n features (k-n candidates) are put into a dictionary
+    # Next, the lowest of these p values (the best candidate for feature n+1) is chosen for evaluation
+    # That lowest p value is compared against the significance level
+    # If the best candidate model still has a variable with p value > sig level, then model training stops,
+    # because all the other candidate models are at least as bad as the current best one
+    # Otherwise, repeat with n+1 features and k-n-1 remaining candidates
+
+    maxcolsnum = X_train.shape[1]
+    full_x = np.array(False)
+    allowed_nums = {}
+    for i in range(maxcolsnum):
+        allowed_nums[i] = True
+    actual_nums = []
+    actual_vars = []
+    terminate_early = False
+    y = y_train
+    for i in range(maxcolsnum):
+        # Reset boolean and pval_list
+        terminate_early = False
+        pval_list = {}
+        for j in range(maxcolsnum):
+            if allowed_nums[j]:
+                # Need to reshape to a single column instead of a flat array for concatenating properly
+                jth_x = X_train[:, j].reshape(-1, 1)
+                if full_x.any():
+                    iter_x = np.concatenate((full_x, jth_x), axis=1)
+                else:
+                    iter_x = jth_x
+                regressor_OLS = sm.Logit(y_train, iter_x).fit(disp=0)
+                pval_list[j] = max(regressor_OLS.pvalues)
+                # Special condition where all the features have p values of 0, directly use these variables for training
+                if max(regressor_OLS.pvalues) == 0:
+                    if full_x.any():
+                        full_x = np.concatenate((full_x, jth_x), axis=1)
+                        allowed_nums[j] = False
+                        actual_nums.append(j)
+                        print("Features all have p value of 0, using feature: [" + str(colnames[j]) + "]")
+                    else:
+                        full_x = jth_x
+                        allowed_nums[j] = False
+                        actual_nums.append(j)
+                        print("First model trained using feature: [" + str(colnames[j]) + "] with p value of 0")
+                    terminate_early = True
+                    break
+                else:
+                    continue
+        if not terminate_early:
+            print("Building new model with lowest p-values with " + str(len(actual_nums)) + " variables.")
+            max_pval_col = min(pval_list.items(), key=operator.itemgetter(1))[0]
+            max_pval = pval_list[max_pval_col]
+            # Need to reshape to a single column instead of a flat array for concatenating properly
+            jth_x = X_train[:, max_pval_col].reshape(-1, 1)
+            if max_pval < sig_level:
+                if full_x.any():
+                    full_x = np.concatenate((full_x, jth_x), axis=1)
+                    allowed_nums[max_pval_col] = False
+                    actual_nums.append(max_pval_col)
+                    print("New model trained using feature: [" + str(
+                        colnames[max_pval_col]) + "] with lowest p value of " + str(max_pval))
+                else:
+                    full_x = jth_x
+                    allowed_nums[max_pval_col] = False
+                    actual_nums.append(max_pval_col)
+                    print("First model trained using feature: [" + str(
+                        colnames[max_pval_col]) + "] with lowest p value of " + str(max_pval))
+            else:
+                print("TERMINATING AS best model trained using feature: [" + str(
+                    colnames[max_pval_col]) + "] has a high p value of " + str(
+                    max_pval) + " above significance level: " + str(sig_level))
+                break
+
+    for k in actual_nums:
+        actual_vars.append(colnames[k])
+    print('Final variables selected:')
+    print(actual_vars)
+
+    return actual_vars
\ No newline at end of file
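
Similarly, a minimal usage sketch for forward_selection (also not part of the patch): the dataframe, coefficients, and column names are made up for illustration, and it assumes forward_elim_binary.py is importable from the working directory.

import numpy as np
import pandas as pd
from forward_elim_binary import forward_selection

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({
    'age': rng.normal(40.0, 10.0, n),
    'income': rng.normal(50.0, 15.0, n),
    'noise': rng.normal(0.0, 1.0, n),
})
logits = 0.08 * (df['age'] - 40.0) - 0.05 * (df['income'] - 50.0)
df['default'] = (rng.uniform(size=n) < 1.0 / (1.0 + np.exp(-logits))).astype(int)

# The response column is passed in removelist so it is dropped from the feature matrix
selected = forward_selection(df, sig_level=0.05, response='default', removelist='default',
                             sampling='naive', testratio=0.2)
print(selected)  # list of column names chosen by the forward pass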