
Commit

function for forward elim and backward elim
kohjiaxuan committed May 19, 2020
1 parent bbf5260 commit a4cfb82
Showing 2 changed files with 180 additions and 0 deletions.
39 changes: 39 additions & 0 deletions backward_elim_binary.py
@@ -0,0 +1,39 @@
import numpy as np  # needed for np.delete below
import statsmodels.discrete.discrete_model as sm


def backward_elimination(x, Y, sl, columns):
    """
    Backward elimination for a binary logistic regression (a usage sketch follows the function).
    :param x: numpy array of training variables
    :param Y: numpy array of the response variable
    :param sl: significance level as a float
    :param columns: list of column names in the same horizontal order as x
    :return: numpy array of the selected x, list of selected training variables passing the significance level
    """
    numVars = len(x[0])  # length of a row = number of variables
    for i in range(0, numVars):
        # Refit the logistic regression on the current set of variables in x and response Y.
        # As the loop goes on, x shrinks as columns are deleted, and the list tracking column names is edited in step.
        regressor_OLS = sm.Logit(Y, x).fit(maxiter=200)
        maxVar = max(regressor_OLS.pvalues).astype(float)
        print('Regression model retrained with ' + str(numVars) + ' variables.')
        print('Max p value for a feature is: ' + str(maxVar))
        # If the max p value exceeds the significance level, delete the offending column.
        # Because columns are deleted and x shrinks, the column-name list must be updated the same way.
        # The only reliable way to delete the right column is to compare the max p value with each
        # pvalues[j] of the fitted model: the jth column that matches is the one to delete.
        if maxVar > sl:
            print('Max p value > ' + str(sl) + ', feature will be removed.')
            for j in range(0, numVars):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    print(str(j) + 'th column deleted: ' + str(columns[j]))
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
                    numVars -= 1
                    break  # delete one column per refit so the remaining indices still match pvalues
        else:
            print('All p values are at or below ' + str(sl) + '. Terminating model training')
            print('p values list: ' + str(regressor_OLS.pvalues))
            break

    print(regressor_OLS.summary())
    return x, columns  # Return the selected x data and the list of remaining columns
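
A minimal usage sketch, assuming a numpy feature matrix, a binary response and matching column names (the data, seed and names below are hypothetical, chosen only to illustrate the call):

import numpy as np
from backward_elim_binary import backward_elimination

rng = np.random.default_rng(0)
X_np = rng.normal(size=(200, 4))    # 200 observations, 4 candidate features
y_np = (X_np[:, 0] + 0.5 * X_np[:, 1] + rng.normal(size=200) > 0).astype(int)
col_names = ['f1', 'f2', 'f3', 'f4']
selected_x, selected_cols = backward_elimination(X_np, y_np, 0.05, col_names)
print(selected_cols)    # columns whose p values stayed at or below 0.05
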
141 changes: 141 additions & 0 deletions forward_elim_binary.py
@@ -0,0 +1,141 @@
import numpy as np
import pandas as pd
import operator
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
import statsmodels.discrete.discrete_model as sm


def forward_selection(df, sig_level, response, removelist, sampling='nil', testratio=0):
    """
    Forward selection for a binary logistic regression (a usage sketch follows the function).
    :param df: dataframe containing both the training and response variables
    :param sig_level: significance level used to accept/reject a variable during forward selection
    :param response: name of the response variable in the dataframe
    :param removelist: training variable(s) to remove from the dataframe, as a string or a list of strings
    :param sampling: type of oversampling to use ('smote', 'naive' or 'nil'); default is no sampling
    :param testratio: proportion of the dataset to hold out before any oversampling is done; default 0
    :return: actual_vars, the list of selected training variables
    """

    # Accept a single column name as well as a list of names
    if isinstance(removelist, str):
        internallist = [removelist]
    else:
        internallist = removelist
    X = df.drop(internallist, axis=1)
    y = df[response]
    # Get the list of column names
    colnames = list(X.columns.values)
    print(colnames)

    # Start of train-test split and oversampling (if relevant)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
    if sampling.lower() == 'smote':
        print("SMOTE Oversampling selected..")
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        # train_test_split keeps X_test and y_test as pandas objects, while the oversampler
        # converts X_train and y_train to numpy arrays; convert the rest so everything is numpy
        X_test = X_test.values
        y_test = y_test.values
        print("Number of Xs and Ys for: " + str(sampling.upper()))
        print(sorted(Counter(y_train).items()))
        print("Oversampling is complete!")
    elif sampling.lower() == 'naive':
        print("Naive Oversampling selected..")
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_resample(X_train, y_train)
        # train_test_split keeps X_test and y_test as pandas objects, while the oversampler
        # converts X_train and y_train to numpy arrays; convert the rest so everything is numpy
        X_test = X_test.values
        y_test = y_test.values
        print("Number of Xs and Ys for: " + str(sampling.upper()))
        print(sorted(Counter(y_train).items()))
        print("Oversampling is complete!")
    else:
        print("No sampling selected..")
        # Without oversampling everything is still pandas; convert to numpy arrays so the
        # positional indexing (X_train[:, j]) used below works
        X_train = X_train.values
        y_train = y_train.values
        X_test = X_test.values
        y_test = y_test.values

    # Forward selection: let k be the total number of candidate features.
    # In each iteration, the current set of n selected features is concatenated with one new feature
    # not yet in the set, and the combination is fitted with the logistic regression.
    # Each candidate model is scored by its highest p value (i.e. by its worst feature).
    # The highest p values of all candidate additions (k - n of them) are stored in a dictionary,
    # and the candidate with the lowest such p value is chosen as the best model of this iteration.
    # That lowest p value is then compared against the significance level:
    # if even the best candidate has a p value above the significance level, every other candidate
    # is at least as bad, so the selection process terminates.
    # Otherwise the chosen feature is added and the next iteration runs with n + 1 features and k - n - 1 candidates.
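    # Illustrative sketch of the selection rule (hypothetical numbers, for clarity only):
    # suppose three candidate features remain and their candidate models give
    #     pval_list = {2: 0.031, 5: 0.240, 7: 0.008}
    # where each value is the worst (max) p value of the model that adds that feature;
    # min(pval_list.items(), key=operator.itemgetter(1))[0] picks feature 7,
    # which is kept only if 0.008 < sig_level, otherwise selection stops.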

    maxcolsnum = X_train.shape[1]
    full_x = None          # numpy array of the features selected so far; None until the first feature is added
    allowed_nums = {}      # column index -> True while the column is still a candidate
    for i in range(maxcolsnum):
        allowed_nums[i] = True
    actual_nums = []       # indices of the selected columns
    actual_vars = []       # names of the selected columns
    terminate_early = False
    for i in range(maxcolsnum):
        # Reset the early-termination flag and the dictionary of candidate p values
        terminate_early = False
        pval_list = {}
        for j in range(maxcolsnum):
            if allowed_nums[j]:
                # Reshape to a single column instead of a flat array so the concatenation works properly
                jth_x = X_train[:, j].reshape(-1, 1)
                if full_x is not None:
                    iter_x = np.concatenate((full_x, jth_x), axis=1)
                else:
                    iter_x = jth_x
                regressor_OLS = sm.Logit(y_train, iter_x).fit(disp=0)
                pval_list[j] = max(regressor_OLS.pvalues)
                # Special condition: every feature in the candidate model has a p value of 0,
                # so this feature is used for training straight away
                if max(regressor_OLS.pvalues) == 0:
                    if full_x is not None:
                        full_x = np.concatenate((full_x, jth_x), axis=1)
                        allowed_nums[j] = False
                        actual_nums.append(j)
                        print("Features all have p value of 0, using feature: [" + str(colnames[j]) + "]")
                    else:
                        full_x = jth_x
                        allowed_nums[j] = False
                        actual_nums.append(j)
                        print("First model trained using feature: [" + str(colnames[j]) + "] with p value of 0")
                    terminate_early = True
                    break
        if not terminate_early:
            # Pick the candidate whose model has the lowest worst-case (max) p value
            print("Building new model with lowest p values on top of " + str(len(actual_nums)) + " selected variables.")
            max_pval_col = min(pval_list.items(), key=operator.itemgetter(1))[0]
            max_pval = pval_list[max_pval_col]
            # Reshape to a single column instead of a flat array so the concatenation works properly
            jth_x = X_train[:, max_pval_col].reshape(-1, 1)
            if max_pval < sig_level:
                if full_x is not None:
                    full_x = np.concatenate((full_x, jth_x), axis=1)
                    allowed_nums[max_pval_col] = False
                    actual_nums.append(max_pval_col)
                    print("New model trained using feature: [" + str(colnames[max_pval_col]) +
                          "] with lowest max p value of " + str(max_pval))
                else:
                    full_x = jth_x
                    allowed_nums[max_pval_col] = False
                    actual_nums.append(max_pval_col)
                    print("First model trained using feature: [" + str(colnames[max_pval_col]) +
                          "] with lowest max p value of " + str(max_pval))
            else:
                print("TERMINATING as the best candidate feature: [" + str(colnames[max_pval_col]) +
                      "] has a p value of " + str(max_pval) +
                      " above the significance level: " + str(sig_level))
                break

    for k in actual_nums:
        actual_vars.append(colnames[k])
    print('Final variables selected:')
    print(actual_vars)

    return actual_vars
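
A minimal usage sketch (the file name, column names and settings below are hypothetical, chosen only to illustrate the call):

import pandas as pd
from forward_elim_binary import forward_selection

df = pd.read_csv('customers.csv')   # hypothetical dataset with a binary 'churn' column
chosen = forward_selection(df,
                           sig_level=0.05,
                           response='churn',
                           removelist=['customer_id', 'churn'],  # drop the id and the response itself from the predictors
                           sampling='smote',
                           testratio=0.2)
print(chosen)   # names of the features added before any candidate exceeded the 0.05 level
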
