forked from Imacharia/Default_predictor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmyfunctions.py
More file actions
157 lines (132 loc) · 5.04 KB
/
myfunctions.py
File metadata and controls
157 lines (132 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Location of the raw workbook on the original author's machine; kept as the
# default so existing zero-argument calls behave exactly as before.
_DEFAULT_DATA_PATH = 'C:/Users/rianm/Documents/data science/Default_predictor/data/default of credit card clients.xls'


def read_data(path: str = _DEFAULT_DATA_PATH) -> pd.DataFrame:
    """Load the credit-card default workbook into a DataFrame.

    Args:
        path: Location of the Excel file to read. Defaults to the path the
            function was originally hard-coded to, so callers on other
            machines can pass their own location instead of editing the
            module.

    Returns:
        The sheet contents, with the first spreadsheet column used as the
        index.
    """
    return pd.read_excel(path, index_col=0)
# Repayment-status columns (as named after header capitalisation) whose
# numeric codes are bucketed into descriptive groups.
_REPAYMENT_COLUMNS = ('Pay_0', 'Pay_2', 'Pay_3', 'Pay_4', 'Pay_5', 'Pay_6')

# Descriptive names for the monthly columns. The mixed "status"/"Status"
# casing is kept exactly as-is because callers may index by these names.
# NOTE(review): the public data dictionary for this dataset appears to list
# the *_0/*1 columns as the most recent month (September) and *_6/*6 as
# April, i.e. the reverse of the labels below - confirm before relying on
# the month suffixes.
_COLUMN_RENAMES = {
    'Pay_0': 'Pay_status_Apr',
    'Pay_2': 'Pay_status_May',
    'Pay_3': 'Pay_Status_Jun',
    'Pay_4': 'Pay_Status_Jul',
    'Pay_5': 'Pay_Status_Aug',
    'Pay_6': 'Pay_Status_Sept',
    'Bill_amt1': 'Bill_amt_Apr',
    'Bill_amt2': 'Bill_amt_May',
    'Bill_amt3': 'Bill_amt_Jun',
    'Bill_amt4': 'Bill_amt_Jul',
    'Bill_amt5': 'Bill_amt_Aug',
    'Bill_amt6': 'Bill_amt_Sept',
    'Pay_amt1': 'Paid_amt_Apr',
    'Pay_amt2': 'Paid_amt_May',
    'Pay_amt3': 'Paid_amt_Jun',
    'Pay_amt4': 'Paid_amt_Jul',
    'Pay_amt5': 'Paid_amt_Aug',
    'Pay_amt6': 'Paid_amt_Sept',
}

# Code -> label tables; any code without an entry is reported as "Other".
_EDUCATION_LABELS = {1: "Graduate School", 2: "University", 3: "High School"}
_MARRIAGE_LABELS = {1: "Married", 2: "Single"}


def _gender_label(code):
    """Return 'Male' for code 1 and 'Female' for every other code."""
    return 'Male' if code == 1 else 'Female'


def _education_label(code):
    """Return the education label for a code, or "Other" when unlisted."""
    return _EDUCATION_LABELS.get(code, "Other")


def _marital_label(code):
    """Return the marital-status label for a code, or "Other" when unlisted."""
    return _MARRIAGE_LABELS.get(code, "Other")


def _repayment_group(status):
    """Bucket a numeric repayment-status code into a descriptive group."""
    # -2 is grouped with the non-delinquent codes: it previously fell through
    # to "Defaulter", mislabelling accounts that are not behind on payments
    # (-2 is widely reported as "no consumption" for this dataset - confirm).
    if status in (-2, -1, 0):
        return "Performing"
    if status in (1, 2, 3):
        return "Watch"
    if status in (4, 5, 6):
        return "Substandard"
    if status in (7, 8, 9):
        return "Debt Collection"
    return "Defaulter"


def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw credit-card sheet into an analysis-ready DataFrame.

    Steps, in order:
      1. Promote the first row to column names and drop it from the data.
      2. Capitalise the column names and rename
         'Default payment next month' to 'Target'.
      3. Drop duplicate rows and let pandas infer better dtypes for the
         remaining object columns.
      4. Replace the numeric Sex / Education / Marriage codes with labels.
      5. Bucket the six repayment-status columns into named groups.
      6. Rename the monthly pay/bill/paid columns to month-suffixed names.

    Args:
        data: Raw frame whose first row holds the real column names, e.g.
            the result of ``read_data()``.

    Returns:
        A new, cleaned DataFrame; ``data`` itself is not modified.

    Raises:
        KeyError: If 'Sex', 'Education', 'Marriage' or any of the six
            'Pay_*' status columns is missing after the header is applied.
    """
    header = data.iloc[0]
    frame = data.iloc[1:]
    # Assigning labels to the slice only relabels this new object; the
    # caller's frame keeps its original columns.
    frame.columns = header

    frame = (
        frame.rename(columns=str.capitalize)
        .rename(columns={'Default payment next month': 'Target'})
        .drop_duplicates()
        .infer_objects()
    )

    # Cast to category first, then relabel, mirroring the original sequence
    # so the resulting column dtypes stay what downstream code already sees.
    for column, to_label in (
        ('Sex', _gender_label),
        ('Education', _education_label),
        ('Marriage', _marital_label),
    ):
        frame[column] = frame[column].astype('category').apply(to_label)

    for column in _REPAYMENT_COLUMNS:
        frame[column] = frame[column].map(_repayment_group)

    return frame.rename(columns=_COLUMN_RENAMES)
class ClassificationEvaluator:
    """Score one set of predictions against the true labels.

    Holds ``y_true`` and ``y_pred`` and exposes each metric imported from
    ``sklearn.metrics`` as a method, plus ``evaluate()`` which collects all
    of them into a single dictionary.
    """

    def __init__(self, y_true, y_pred):
        self.y_true, self.y_pred = y_true, y_pred

    def _score(self, metric):
        """Call ``metric`` with the stored true and predicted labels."""
        return metric(self.y_true, self.y_pred)

    def accuracy(self):
        """Return ``accuracy_score`` for the stored labels."""
        return self._score(accuracy_score)

    def precision(self):
        """Return ``precision_score`` for the stored labels."""
        return self._score(precision_score)

    def recall(self):
        """Return ``recall_score`` for the stored labels."""
        return self._score(recall_score)

    def f1_score(self):
        """Return the F1 score for the stored labels."""
        # Inside a method body the bare name resolves to the module-level
        # function imported from sklearn, not to this method.
        return self._score(f1_score)

    def roc_auc(self):
        """Return ``roc_auc_score`` computed from the stored predictions."""
        return self._score(roc_auc_score)

    def evaluate(self):
        """Return every metric in a dict keyed by its display name."""
        scorers = (
            ('Accuracy', self.accuracy),
            ('Precision', self.precision),
            ('Recall', self.recall),
            ('F1-Score', self.f1_score),
            ('ROC AUC', self.roc_auc),
        )
        return {label: scorer() for label, scorer in scorers}