From 61aaf9287dc2674461d133df500ebdfe35e77c66 Mon Sep 17 00:00:00 2001
From: unknown
Date: Fri, 7 Feb 2020 23:58:05 +0800
Subject: [PATCH] Added PR_AUC graphs and made edits to py files

---
 machinelearningpipelinedraft.py           | 202 ----------------
 sklearn_classification_pipeline.py        |  79 +++++--
 sklearn_classifier_pipeline_optionalCV.py | 266 ++++++++++++++++++++++
 3 files changed, 331 insertions(+), 216 deletions(-)
 delete mode 100644 machinelearningpipelinedraft.py
 create mode 100644 sklearn_classifier_pipeline_optionalCV.py

diff --git a/machinelearningpipelinedraft.py b/machinelearningpipelinedraft.py
deleted file mode 100644
index 85402d0..0000000
--- a/machinelearningpipelinedraft.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# importing packages
-import numpy as np
-import pandas as pd
-import scipy.stats as stats
-import sklearn
-import imblearn
-import matplotlib.pyplot as plt
-import seaborn as sns
-plt.style.use('ggplot')
-
-# reading in CSV
-df = pd.read_csv('creditcard.csv')
-df.sample(5)
-
-df.info()
-
-df.describe()
-
-# taking a closer look at the class variable
-sns.countplot('Class', data = df)
-plt.title('No Fraud (0) vs. Fraud (1)')
-
-from imblearn.over_sampling import RandomOverSampler
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-import xgboost as xgb
-from sklearn import metrics
-from imblearn.over_sampling import SMOTE
-from imblearn.over_sampling import ADASYN
-import math
-from sklearn.model_selection import GridSearchCV
-from sklearn.ensemble import RandomForestClassifier
-
-class modelpipeline:
-    def __init__(self):
-        pass
-
-    def run_model(self, df, testratio, standardize, sampletype, modelname, text):
-        df = df.drop('Time', axis=1)
-        if standardize == True:
-            df = self.standardize(df)
-        if sampletype == 'smote':
-            X_train, X_test, y_train, y_test = sampling.smote_oversample(df, testratio)
-        elif sampletype == 'adasyn':
-            X_train, X_test, y_train, y_test = sampling.adasyn_oversample(df, testratio)
-        else:
-            X_train, X_test, y_train, y_test = sampling.naive_oversample(df, testratio)
-        store = self.build_model(X_train, X_test, y_train, y_test, text, modelname)
-        # test model with all actual fraud results
-        store['actual_accuracy'] = evaluate.actual_acc(df, store['model'])
-        return store
-
-    def build_model(self, X_train, X_test, y_train, y_test, text, modelname):
-        if modelname == 'LogisticRegression':
-            param_grid = dict(C=[0.8,0.9,1,1.1], max_iter=[300], solver='liblinear')
-            LogRegression = LogisticRegression()
-            model = GridSearchCV(LogRegression, param_grid, cv=10, scoring='f1')
-            model.fit(X_train,y_train)
-            print("Best f1 score: " + str(model.best_score_))
-            print("Best parameters: " + str(model.best_params_))
-        elif modelname == 'XGBoost':
-            model = xgb.XGBClassifier(seed=42, nthread=1, max_depth=math.ceil(math.sqrt(X_train.shape[1])),
-                                      n_estimators=100, random_state=42)
-            model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
-        elif modelname == 'RandomForest':
-            start_value = math.ceil(math.sqrt(X_train.shape[1]))
-            end_value = start_value + 5
-            treedepth = list(range(start_value, end_value, 2))
-            param_grid = dict(random_state=[42], max_depth=treedepth, n_estimators=[100,150])
-            RFC = RandomForestClassifier()
-            model = GridSearchCV(RFC, param_grid, cv=10, scoring='f1')
-            model.fit(X_train,y_train)
-            print("Best f1 score: " + str(model.best_score_))
-            print("Best parameters: " + str(model.best_params_))
-        else:
-            model = LogisticRegression()
-            model.fit(X_train,y_train)
-        y_predict = model.predict(X_test)
-        results = evaluate.model_results(y_test, y_predict, text)
-        store = {"model": model, "X_train": X_train, "X_test": X_test, "y_train": y_train,
-                 "y_test": y_test, "results": results}
-        print("Model fitting and results are complete!")
-        return store
-
-    def standardize(self, df):
-        columns = df.columns.values.tolist()
-        columns.remove('Class')
-        for column in columns:
-            df[column] = (df[column] - df[column].mean()) / df[column].std()
-        return df
-
-class sampling:
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def naive_oversample(df, testratio):
-        X = df.drop(['Class'], axis=1)
-        y = df['Class']
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
-        ros = RandomOverSampler(random_state=42)
-        X_train, y_train = ros.fit_resample(X_train, y_train)
-        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
-        # Convert all to numpy array for XGBoost to not have bugs
-        X_test = X_test.values
-        y_test = y_test.values
-        print("Oversampling is complete!")
-        return X_train, X_test, y_train, y_test
-
-    @staticmethod
-    def smote_oversample(df, testratio):
-        X = df.drop(['Class'], axis=1)
-        y = df['Class']
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
-        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
-        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
-        # Convert all to numpy array for XGBoost to not have bugs
-        X_test = X_test.values
-        y_test = y_test.values
-        print("Number of Xs and Ys for SMOTE:")
-        print(sorted(Counter(y_train).items()))
-        print("Oversampling is complete!")
-        return X_train, X_test, y_train, y_test
-
-    @staticmethod
-    def adasyn_oversample(df, testratio):
-        X = df.drop(['Class'], axis=1)
-        y = df['Class']
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=42)
-        X_train, y_train = ADASYN().fit_resample(X_train, y_train)
-        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
-        # Convert all to numpy array for XGBoost to not have bugs
-        X_test = X_test.values
-        y_test = y_test.values
-        print("Number of Xs and Ys for ADASYN:")
-        print(sorted(Counter(y_train).items()))
-        print("Oversampling is complete!")
-        return X_train, X_test, y_train, y_test
-
-
-class evaluate:
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def model_results(y_test, y_predict, text):
-        cm = metrics.confusion_matrix(y_test, y_predict)
-        print(cm)
-        RFC_CM = pd.DataFrame(cm, ['Actual 0', 'Actual 1'], ['Predict 0', 'Predict 1'])
-        sns.heatmap(RFC_CM, annot=True, annot_kws={"size": 16}, cmap='Greens', linewidths=1, fmt='g')  # font size
-        sns.set(font_scale=1.4)  # for label size
-        plt.title("Confusion Matrix for " + text)
-
-        # fix for mpl bug that cuts off top/bottom of seaborn viz
-        b, t = plt.ylim()
-        b += 0.5
-        t -= 0.5
-        plt.ylim(b, t)
-        plt.show()
-
-        accuracy = metrics.accuracy_score(y_test, y_predict)
-        print('Accuracy: ' + str(accuracy))
-        sensitivity = cm[1][1] / (cm[1][1] + cm[1][0])
-        recall = sensitivity
-        print('Sensitivity: ' + str(sensitivity))
-        specificity = cm[0][0] / (cm[0][0] + cm[0][1])
-        print('Specificity: ' + str(specificity))
-        precision = cm[1][1] / (cm[1][1] + cm[0][1])
-        print('Precision: ' + str(precision))
-        f1 = 2 * (recall * precision)/(recall + precision)
-        print('f1 score: ' + str(f1))
-        auc = evaluate.ROC(y_test, y_predict, text)
-        results = {"accuracy": accuracy, "sensitivity": sensitivity, "specificity": specificity,
-                   "precision": precision, "f1": f1, "auc": auc}
-        print("Model classification metrics have finished calculating!")
-        return results
-
-    @staticmethod
-    def ROC(y_test, y_predict, text):
-        # IMPORTANT: first argument is true values, second argument is predicted probabilities
-        auc = metrics.roc_auc_score(y_test, y_predict)
-        print("AUC value is: " + str(auc))
-        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict)
-        plt.plot(fpr, tpr)
-        plt.xlim([0.0, 1.0])
-        plt.ylim([0.0, 1.0])
-        plt.title('ROC curve for ' + text)
-        plt.xlabel('False Positive Rate (1 - Specificity)')
-        plt.ylabel('True Positive Rate (Sensitivity)')
-        plt.grid(True)
-        return auc
-
-    @staticmethod
-    def actual_acc(df, model):
-        allpositive = df[df['Class'] == 1].copy()
-        x_positive = allpositive.drop(['Class'], axis=1)
-        y_positive = allpositive['Class']
-        y_pospredict = model.predict(x_positive)
-        accuracy_positive = metrics.accuracy_score(y_positive, y_pospredict)
-        print("Accuracy with all fraud results is " + str(accuracy_positive * 100) + "%")
-        return accuracy_positive
diff --git a/sklearn_classification_pipeline.py b/sklearn_classification_pipeline.py
index 18929bf..76ae81e 100644
--- a/sklearn_classification_pipeline.py
+++ b/sklearn_classification_pipeline.py
@@ -47,7 +47,7 @@ def run_model(self, df, varlist, response, standardize, sampletype, modelname, t
         if isinstance(n_fold, int) and n_fold > 1:
             # Initialize dictionary to store results
             self.store = {"accuracy": [], "actual_accuracy": [], "sensitivity": [], "specificity": [],
-                          "precision": [], "f1": [], "auc": [], "final": {}}
+                          "precision": [], "f1": [], "auc": [], "pr_auc": [], "final": {}}
 
             # Split dataframes into 2, one for positive response and one for negative
             df_zero = df[df[response] == 0]
@@ -147,6 +147,7 @@ def run_model(self, df, varlist, response, standardize, sampletype, modelname, t
             self.store['final']['precision'] = self.avg(self.store['precision'])
             self.store['final']['f1'] = self.avg(self.store['f1'])
             self.store['final']['auc'] = self.avg(self.store['auc'])
+            self.store['final']['pr_auc'] = self.avg(self.store['pr_auc'])
             self.store['final']['actual_accuracy'] = self.avg(self.store['actual_accuracy'])
 
             print('Final Results of ' + str(n_fold) + ' fold CV:')
@@ -171,10 +172,16 @@ def build_model(self, X_train, X_test, y_train, y_test, text, modelname, i, n_fo
                                       n_estimators=100, random_state=42)
             model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
         elif modelname == 'XGBoostplus1':
-            # XGBoost with one less depth
+            # XGBoost with one more depth
             model = xgb.XGBClassifier(seed=42, nthread=1, max_depth=math.ceil(math.sqrt(X_train.shape[1]))+1,
                                       n_estimators=100, random_state=42)
             model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
+        elif modelname == 'XGBoostplus3':
+            # XGBoost with 3 more depth
+            model = xgb.XGBClassifier(seed=42, nthread=1, max_depth=math.ceil(math.sqrt(X_train.shape[1]))+3,
+                                      n_estimators=100, random_state=42)
+            model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
+
         # Use LinearSVC instead of sklearn svm.SVC, as the former has much faster processing speed
         # However, LinearSVC does not have a .predict_proba function to get the probability of response 1
         # Hence, we need to use CalibratedClassifierCV, which provides .predict_proba functionality
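+        # Sketch of the CalibratedClassifierCV pattern described above (not wired
+        # in here; assumes the LinearSVC/CalibratedClassifierCV imports used
+        # elsewhere in this repo):
+        #     base_svc = LinearSVC(random_state=42, max_iter=2000)
+        #     model = CalibratedClassifierCV(base_svc, cv=5)
+        #     model.fit(X_train, y_train)
+        #     y_predictprob = model.predict_proba(X_test)[:, 1]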
@@ -208,6 +215,14 @@ def build_model(self, X_train, X_test, y_train, y_test, text, modelname, i, n_fo
             treedepth = math.ceil(math.sqrt(X_train.shape[1]))-2
             model = RandomForestClassifier(random_state=42, max_depth=treedepth, n_estimators=100)
             model.fit(X_train,y_train)
+        elif modelname == 'RandomForestplus2':
+            treedepth = math.ceil(math.sqrt(X_train.shape[1]))+2
+            model = RandomForestClassifier(random_state=42, max_depth=treedepth, n_estimators=100)
+            model.fit(X_train,y_train)
+        elif modelname == 'RandomForestplus4':
+            treedepth = math.ceil(math.sqrt(X_train.shape[1]))+4
+            model = RandomForestClassifier(random_state=42, max_depth=treedepth, n_estimators=100)
+            model.fit(X_train,y_train)
         else:
             # Parameters based on GridSearchCV of modelname = logistic regression
             # Leave parameter blank for modelname to run this instance of logistic regression
@@ -216,7 +231,7 @@
 
         y_predict = model.predict(X_test)
         y_predictprob = model.predict_proba(X_test)[:, 1]
-        store = evaluate.model_results(y_test, y_predict, y_predictprob, text, store)
+        store = evaluate.model_results(y_test, y_predict, y_predictprob, text, store, i, n_fold)
 
         # Store model for usage in measuring actual accuracy of fraud cases
         store['model'] = model
@@ -273,7 +288,7 @@ def __init__(self):
         pass
 
     @staticmethod
-    def model_results(y_test, y_predict, y_predictprob, text, store):
+    def model_results(y_test, y_predict, y_predictprob, text, store, i, n_fold):
         cm = metrics.confusion_matrix(y_test, y_predict)
         print(cm)
         RFC_CM = pd.DataFrame(cm, ['Actual 0', 'Actual 1'], ['Predict 0', 'Predict 1'])
@@ -286,6 +301,7 @@ def model_results(y_test, y_predict, y_predictprob, text, store):
         b += 0.5
         t -= 0.5
         plt.ylim(b, t)
+        plt.figure(1,figsize=(4,4))
         plt.show()
 
         accuracy = metrics.accuracy_score(y_test, y_predict)
@@ -299,7 +315,7 @@ def model_results(y_test, y_predict, y_predictprob, text, store):
         # print('Precision: ' + str(precision))
         f1 = 2 * (recall * precision)/(recall + precision)
         # print('f1 score: ' + str(f1))
-        auc = evaluate.ROC(y_test, y_predictprob, text)
+        auc, pr_auc = evaluate.ROC(y_test, y_predictprob, text, i, n_fold)
 
         store['accuracy'].append(accuracy)
         store['sensitivity'].append(sensitivity)
@@ -307,24 +323,59 @@ def model_results(y_test, y_predict, y_predictprob, text, store):
         store['precision'].append(precision)
         store['f1'].append(f1)
         store['auc'].append(auc)
+        store['pr_auc'].append(pr_auc)
 
         return store
 
+#     @staticmethod
+#     def ROC(y_test, y_predictprob, text):
+#         # IMPORTANT: first argument is true values, second argument is predicted probabilities
+#         auc = metrics.roc_auc_score(y_test, y_predictprob)
+#         # print("AUC value is: " + str(auc))
+#         fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predictprob)
+#         # print("AUC value is also: " + str(metrics.auc(fpr, tpr)))
+#         plt.plot(fpr, tpr)
+#         plt.xlim([0.0, 1.0])
+#         plt.ylim([0.0, 1.0])
+#         plt.title('ROC curve for ' + text)
+#         plt.xlabel('False Positive Rate (1 - Specificity)')
+#         plt.ylabel('True Positive Rate (Sensitivity)')
+#         plt.grid(True)
+#         return auc
+
     @staticmethod
-    def ROC(y_test, y_predictprob, text):
+    def ROC(y_test, y_predictprob, text, i, n_fold):
         # IMPORTANT: first argument is true values, second argument is predicted probabilities
         auc = metrics.roc_auc_score(y_test, y_predictprob)
         # print("AUC value is: " + str(auc))
+        print("AUC value is: " + str(auc))
         fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predictprob)
         # print("AUC value is also: " + str(metrics.auc(fpr, tpr)))
-        plt.plot(fpr, tpr)
-        plt.xlim([0.0, 1.0])
-        plt.ylim([0.0, 1.0])
-        plt.title('ROC curve for ' + text)
-        plt.xlabel('False Positive Rate (1 - Specificity)')
-        plt.ylabel('True Positive Rate (Sensitivity)')
-        plt.grid(True)
-        return auc
+        # Calculate precision and recall for each threshold
+        precision, recall, _ = metrics.precision_recall_curve(y_test, y_predictprob)
+        pr_auc = metrics.auc(recall, precision)
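+        # The PR curve is generally more informative than ROC under heavy class
+        # imbalance such as fraud detection. As a cross-check (suggestion only,
+        # not wired in), sklearn's average precision gives a closely related
+        # summary of the same curve:
+        #     ap = metrics.average_precision_score(y_test, y_predictprob)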
+        # Only show the ROC-AUC and PR-AUC graphs on the last iteration, as they
+        # look very similar across folds; the full results can be obtained in the
+        # results section
+        if n_fold == i:
+            fullgraph = plt.figure(1,figsize=(10,20))
+            plt.style.use('ggplot')
+            ROCAUC_plot = fullgraph.add_subplot(211)
+            ROCAUC_plot.plot(fpr, tpr, color='blue')
+            ROCAUC_plot.set_title('ROC curve for ' + text)
+            ROCAUC_plot.set_xlabel('False Positive Rate (1 - Specificity)')
+            ROCAUC_plot.set_ylabel('True Positive Rate (Sensitivity)')
+            ROCAUC_plot.set_xlim([0.0, 1.0])
+            ROCAUC_plot.set_ylim([0.0, 1.0])
+            ROCAUC_plot.grid(True)
+            PRAUC_plot = fullgraph.add_subplot(212)
+            PRAUC_plot.plot(recall, precision, color='purple')
+            PRAUC_plot.set_title('Precision-Recall curve for ' + text)
+            PRAUC_plot.set_xlabel('Recall')
+            PRAUC_plot.set_ylabel('Precision')
+            PRAUC_plot.set_xlim([0.0, 1.0])
+            PRAUC_plot.set_ylim([0.0, 1.0])
+            PRAUC_plot.grid(True)
+        return auc, pr_auc
 
     @staticmethod
     def actual_acc(df, model, response):
diff --git a/sklearn_classifier_pipeline_optionalCV.py b/sklearn_classifier_pipeline_optionalCV.py
new file mode 100644
index 0000000..fe3be22
--- /dev/null
+++ b/sklearn_classifier_pipeline_optionalCV.py
@@ -0,0 +1,266 @@
+# importing packages
+import math
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
+import sklearn
+import imblearn
+import matplotlib.pyplot as plt
+import seaborn as sns
+plt.style.use('ggplot')
+
+from imblearn.over_sampling import RandomOverSampler
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import LinearSVC
+import xgboost as xgb
+from sklearn import metrics
+from imblearn.over_sampling import SMOTE
+from imblearn.over_sampling import ADASYN
+from sklearn.model_selection import GridSearchCV
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.model_selection import KFold
+from sklearn.model_selection import cross_val_score
+from sklearn.utils import shuffle
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.calibration import CalibratedClassifierCV
+from collections import Counter
+
+class modelpipeline:
+    def __init__(self):
+        pass
+
+    def run_model(self, df, varlist, response, testratio, standardize, sampletype, modelname, text, CV):
+        # Align field orders in df (including response)
+        df = df[varlist]
+
+        if sampletype == 'smote':
+            X_train, X_test, y_train, y_test = sampling.smote_oversample(df, testratio, response)
+        elif sampletype == 'adasyn':
+            X_train, X_test, y_train, y_test = sampling.adasyn_oversample(df, testratio, response)
+        else:
+            X_train, X_test, y_train, y_test = sampling.naive_oversample(df, testratio, response)
+
+        if standardize == True:
+            scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
+            X_train = scaling.transform(X_train)
+            X_test = scaling.transform(X_test)
+
+        store = self.build_model(X_train, X_test, y_train, y_test, text, modelname, CV)
+        # test model with all actual fraud results
+        store['actual_accuracy'] = evaluate.actual_acc(df, store['model'], response)
+        return store
+
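+    # NOTE: when standardize is True, the model above is fit on scaled features,
+    # but evaluate.actual_acc receives the unscaled df. A possible fix (sketch
+    # only, not wired in) is to keep the fitted scaler alongside the model:
+    #     store['scaler'] = scaling                              # in run_model
+    #     x_positive = store['scaler'].transform(x_positive)    # in actual_acc
+    # so that prediction-time features match training-time scaling.
+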
+    def build_model(self, X_train, X_test, y_train, y_test, text, modelname, CV):
+        if modelname == 'LogisticRegression':
+            if CV == True:
+                param_grid = dict(C=[0.8,1,1.2], max_iter=[300], solver=['liblinear'])
+                LogRegression = LogisticRegression()
+                model = GridSearchCV(LogRegression, param_grid, cv=5, scoring='f1', verbose=10)
+                model.fit(X_train,y_train)
+                print("Best f1 score: " + str(model.best_score_))
+                print("Best parameters: " + str(model.best_params_))
+            else:
+                model = LogisticRegression(max_iter=300, C=0.8, solver='liblinear')
+                model.fit(X_train,y_train)
+        elif modelname == 'XGBoost':
+            if CV == True:
+                end_value = math.ceil(math.sqrt(X_train.shape[1]))
+                start_value = end_value - 2
+                # treedepth = list(range(start_value, end_value+1, 2))
+                param_grid = dict(n_estimators=[100], max_depth=[end_value])
+                GradientBoost = GradientBoostingClassifier()
+                model = GridSearchCV(GradientBoost, param_grid, cv=5, scoring='f1', verbose=10)
+                model.fit(X_train,y_train)
+                print("Best f1 score: " + str(model.best_score_))
+                print("Best parameters: " + str(model.best_params_))
+
+                # Testing out xgb.cv (incomplete)
+                # model = xgb.XGBClassifier(seed=42, nthread=1, max_depth=start_value, n_estimators=100, random_state=42)
+                # xgb_param = dict(n_estimators=100, max_depth=end_value)
+                # xgtrain = xgb.DMatrix(X_train, label=y_train)
+                # model = xgb.cv(params=xgb_param, dtrain=xgtrain, nfold=5, metrics='auc')
+                # model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
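+                # One way the xgb.cv attempt above could be completed (sketch;
+                # parameter choices here are assumptions, and note xgb.cv returns
+                # a results DataFrame rather than a fitted model):
+                #     xgtrain = xgb.DMatrix(X_train, label=y_train)
+                #     cv_results = xgb.cv(params={'max_depth': end_value, 'seed': 42},
+                #                         dtrain=xgtrain, num_boost_round=100,
+                #                         nfold=5, metrics='auc')
+                #     print(cv_results.tail())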
+
+                # USING kfold library to do kfold testing on XGBoost:
+                # cross_val_score using kfold does not fit the model, so nothing can be predicted;
+                # it's just to see the results, and the model has to be fitted later on
+                # kfold = KFold(n_splits=3, random_state=42)
+                # print(kfold)
+                # scores = cross_val_score(model, X_train, y_train, cv=kfold)
+                # print("CV Accuracy: %.2f%% (%.2f%%)" % (scores.mean()*100, scores.std()*100))
+            else:
+                model = xgb.XGBClassifier(seed=42, nthread=1, max_depth=math.ceil(math.sqrt(X_train.shape[1])),
+                                          n_estimators=100, random_state=42)
+                model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=5)
+        elif modelname == 'RandomForest':
+            if CV == True:
+                start_value = math.ceil(math.sqrt(X_train.shape[1]))
+                end_value = start_value + 11
+                treedepth = list(range(start_value, end_value, 5))
+                param_grid = dict(random_state=[42], max_depth=treedepth, n_estimators=[100,150])
+                RFC = RandomForestClassifier()
+                model = GridSearchCV(RFC, param_grid, cv=5, scoring='f1', verbose=10)
+                model.fit(X_train,y_train)
+                print("Best f1 score: " + str(model.best_score_))
+                print("Best parameters: " + str(model.best_params_))
+            else:
+                treedepth = math.ceil(math.sqrt(X_train.shape[1]))
+                model = RandomForestClassifier(random_state=42, max_depth=treedepth, n_estimators=150)
+                model.fit(X_train,y_train)
+        else:
+            # Parameters based on GridSearchCV of modelname = logistic regression
+            # Leave parameter blank for modelname to run this instance of logistic regression
+            model = LogisticRegression(C=0.8, max_iter=300, solver='liblinear')
+            model.fit(X_train,y_train)
+
+        y_predict = model.predict(X_test)
+        y_predictprob = model.predict_proba(X_test)[:, 1]
+        results = evaluate.model_results(y_test, y_predict, y_predictprob, text)
+        store = {"model": model, "X_train": X_train, "X_test": X_test, "y_train": y_train,
+                 "y_test": y_test, "results": results}
+        print("Model fitting and results are complete!")
+        return store
+
+    def standardize(self, df):
+        # Variables are already standardized except for Amount
+        # columns = df.columns.values.tolist()
+        # columns.remove(response)
+        for column in ['Amount']:
+            df[column] = (df[column] - df[column].mean()) / df[column].std()
+        return df
+
+class sampling:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def naive_oversample(df, testratio, response):
+        X = df.drop([response], axis=1)
+        y = df[response]
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=41)
+        ros = RandomOverSampler(random_state=42)
+        X_train, y_train = ros.fit_resample(X_train, y_train)
+        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
+        # Convert all to numpy array for XGBoost to not have bugs
+        X_test = X_test.values
+        y_test = y_test.values
+        print("Oversampling is complete!")
+        return X_train, X_test, y_train, y_test
+
+    @staticmethod
+    def smote_oversample(df, testratio, response):
+        X = df.drop([response], axis=1)
+        y = df[response]
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=41)
+        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
+        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
+        # Convert all to numpy array for XGBoost to not have bugs
+        X_test = X_test.values
+        y_test = y_test.values
+        print("Number of Xs and Ys for SMOTE:")
+        print(sorted(Counter(y_train).items()))
+        print("Oversampling is complete!")
+        return X_train, X_test, y_train, y_test
+
+    @staticmethod
+    def adasyn_oversample(df, testratio, response):
+        X = df.drop([response], axis=1)
+        y = df[response]
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testratio, random_state=41)
+        X_train, y_train = ADASYN().fit_resample(X_train, y_train)
+        # train test split keeps X_test and y_test as pd series, oversampler converts X_train, y_train to numpy
+        # Convert all to numpy array for XGBoost to not have bugs
+        X_test = X_test.values
+        y_test = y_test.values
+        print("Number of Xs and Ys for ADASYN:")
+        print(sorted(Counter(y_train).items()))
+        print("Oversampling is complete!")
+        return X_train, X_test, y_train, y_test
+
+
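+# Worked example (illustrative numbers only) of the metrics computed below,
+# given a confusion matrix cm = [[9900, 50], [10, 40]] (rows = actual, cols = predicted):
+#     sensitivity = 40 / (40 + 10) = 0.800
+#     specificity = 9900 / (9900 + 50) ≈ 0.995
+#     precision   = 40 / (40 + 50) ≈ 0.444
+#     f1 = 2 * (0.800 * 0.444) / (0.800 + 0.444) ≈ 0.571
+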
+class evaluate:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def model_results(y_test, y_predict, y_predictprob, text):
+        cm = metrics.confusion_matrix(y_test, y_predict)
+        print(cm)
+        RFC_CM = pd.DataFrame(cm, ['Actual 0', 'Actual 1'], ['Predict 0', 'Predict 1'])
+        sns.heatmap(RFC_CM, annot=True, annot_kws={"size": 16}, cmap='Greens', linewidths=1, fmt='g')  # font size
+        sns.set(font_scale=1.4)  # for label size
+        plt.title("Confusion Matrix for " + text)
+
+        # fix for mpl bug that cuts off top/bottom of seaborn viz
+        b, t = plt.ylim()
+        b += 0.5
+        t -= 0.5
+        plt.ylim(b, t)
+        plt.show()
+
+        accuracy = metrics.accuracy_score(y_test, y_predict)
+        print('Accuracy: ' + str(accuracy))
+        sensitivity = cm[1][1] / (cm[1][1] + cm[1][0])
+        recall = sensitivity
+        print('Sensitivity: ' + str(sensitivity))
+        specificity = cm[0][0] / (cm[0][0] + cm[0][1])
+        print('Specificity: ' + str(specificity))
+        precision = cm[1][1] / (cm[1][1] + cm[0][1])
+        print('Precision: ' + str(precision))
+        f1 = 2 * (recall * precision)/(recall + precision)
+        print('f1 score: ' + str(f1))
+        auc, pr_auc = evaluate.ROC(y_test, y_predictprob, text)
+        results = {"accuracy": accuracy, "sensitivity": sensitivity, "specificity": specificity,
+                   "precision": precision, "f1": f1, "auc": auc, "pr_auc": pr_auc}
+        print("Model classification metrics have finished calculating!")
+        print(results)
+        return results
+
+    @staticmethod
+    def ROC(y_test, y_predictprob, text):
+        # IMPORTANT: first argument is true values, second argument is predicted probabilities
+        auc = metrics.roc_auc_score(y_test, y_predictprob)
+        print("AUC value is: " + str(auc))
+        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predictprob)
+        # print("AUC value is also: " + str(metrics.auc(fpr, tpr)))
+        # Calculate precision and recall for each threshold
+        precision, recall, _ = metrics.precision_recall_curve(y_test, y_predictprob)
+        pr_auc = metrics.auc(recall, precision)
+        fullgraph = plt.figure(1,figsize=(10,20))
+        plt.style.use('ggplot')
+
+        ROCAUC_plot = fullgraph.add_subplot(211)
+        ROCAUC_plot.plot(fpr, tpr, color='blue')
+        ROCAUC_plot.set_title('ROC curve for ' + text)
+        ROCAUC_plot.set_xlabel('False Positive Rate (1 - Specificity)')
+        ROCAUC_plot.set_ylabel('True Positive Rate (Sensitivity)')
+        ROCAUC_plot.set_xlim([0.0, 1.0])
+        ROCAUC_plot.set_ylim([0.0, 1.0])
+        ROCAUC_plot.grid(True)
+        PRAUC_plot = fullgraph.add_subplot(212)
+        PRAUC_plot.plot(recall, precision, color='purple')
+        PRAUC_plot.set_title('Precision-Recall curve for ' + text)
+        PRAUC_plot.set_xlabel('Recall')
+        PRAUC_plot.set_ylabel('Precision')
+        PRAUC_plot.set_xlim([0.0, 1.0])
+        PRAUC_plot.set_ylim([0.0, 1.0])
+        PRAUC_plot.grid(True)
+        return auc, pr_auc
+
+    @staticmethod
+    def actual_acc(df, model, response):
+        allpositive = df[df[response] == 1].copy()
+        x_positive = allpositive.drop([response], axis=1)
+        y_positive = allpositive[response]
+        # Convert to numpy array due to XGBoost model.predict not working well for pandas
+        x_positive = x_positive.values
+        y_positive = y_positive.values
+        y_pospredict = model.predict(x_positive)
+        accuracy_positive = metrics.accuracy_score(y_positive, y_pospredict)
+        print("Accuracy with all fraud results is " + str(accuracy_positive * 100) + "%")
+        return accuracy_positive
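+
+# Illustrative driver (sketch; 'creditcard.csv' and its 'Class'/'Time' columns
+# follow the deleted draft script and are assumptions, not a tested entry point)
+if __name__ == '__main__':
+    df = pd.read_csv('creditcard.csv')
+    df = df.drop('Time', axis=1)
+    # varlist must include the response column, per run_model's field alignment
+    varlist = [c for c in df.columns if c != 'Class'] + ['Class']
+    pipe = modelpipeline()
+    store = pipe.run_model(df, varlist, 'Class', 0.2, True, 'smote',
+                           'RandomForest', 'Random Forest with SMOTE', CV=False)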