#Z0096

import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
from sklearn.metrics import confusion_matrix, classification_report


#################### Measure telco_churn Data ####################
def two_sample_ttest(a, b, alpha=0.05, equal_var=True,
                     alternative='two-sided'):
    '''
    Performs a two-sample t-test using scipy.stats.ttest_ind, then
    prints alpha, the p-value, and whether to reject or fail to
    reject the null hypothesis
    '''

    # run the t-test and unpack the t-statistic and p-value
    t, p = ttest_ind(a, b, equal_var=equal_var, alternative=alternative)
    null_hyp = f'there is no difference in {a.name} between the two populations'
    # print alpha and p-value
    print(f'''
alpha: {alpha}
p-value: {p:.1g}''')
    # reject the null hypothesis when p is below the significance level
    if p < alpha:
        print(f'''
Due to our p-value of {p:.1g} being less than our significance level of {alpha}, we must reject the null hypothesis
that {null_hyp}.
''')
    # otherwise fail to reject the null hypothesis
    else:
        print(f'''
Due to our p-value of {p:.1g} being greater than or equal to our significance level of {alpha}, we fail to reject the null hypothesis
that {null_hyp}.
''')
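

# Example usage for two_sample_ttest (a sketch, kept as comments so it does
# not run on import; the `train` DataFrame and its `internet_service_type`
# and `monthly_charges` columns are assumptions for illustration):
#
#   fiber = train[train.internet_service_type == 'Fiber optic'].monthly_charges
#   no_fiber = train[train.internet_service_type != 'Fiber optic'].monthly_charges
#   two_sample_ttest(fiber, no_fiber, alpha=0.05, equal_var=False)

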
def chi_test(cat, target, alpha=0.05):
    '''
    Takes in a category for comparison with the passed target and
    default alpha=0.05, then creates a crosstab DataFrame containing
    the observed values

    Performs a chi2_contingency to find the p-value and builds a
    DataFrame of expected values, then prints alpha, the p-value,
    whether to reject or fail to reject the null hypothesis, and the
    observed and expected values side by side

    More robust version of chi_test_lite; the printed table assumes a
    2x2 crosstab of fiber-optic service against churn
    '''

    # set observed DataFrame with crosstab
    observed = pd.crosstab(cat, target)
    a = observed.iloc[0, 0]
    b = observed.iloc[0, 1]
    c = observed.iloc[1, 0]
    d = observed.iloc[1, 1]
    # assign returned values from chi2_contingency
    chi2, p, degf, expected = chi2_contingency(observed)
    # set expected DataFrame from returned array
    expected = pd.DataFrame(expected)
    a2 = expected.iloc[0, 0]
    b2 = expected.iloc[0, 1]
    c2 = expected.iloc[1, 0]
    d2 = expected.iloc[1, 1]
    # set null hypothesis
    null_hyp = f'{target.name} is independent of {cat.name}'
    # print alpha and p-value
    print(f'''
alpha: {alpha}
p-value: {p:.1g}''')
    # reject the null hypothesis when p is below the significance level
    if p < alpha:
        print(f'''
Due to our p-value of {p:.1g} being less than our significance level of {alpha}, we must reject the null hypothesis
that {null_hyp}.''')
    # otherwise fail to reject the null hypothesis
    else:
        print(f'''
Due to our p-value of {p:.1g} being greater than or equal to our significance level of {alpha}, we fail to reject the null hypothesis
that {null_hyp}.''')
    # print observed and expected DataFrames side by side
    print(f'''
             ** Observed **            |             ** Expected **
--------------------------------------|--------------------------------------
          No Churn   Churn            |           No Churn   Churn
                                      |
No Fiber  {a:<10.0f} {b:<10.0f}       | No Fiber  {a2:<10.0f} {b2:<10.0f}
                                      |
Fiber     {c:<10.0f} {d:<10.0f}       | Fiber     {c2:<10.0f} {d2:<10.0f}
''')
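

# Example usage for chi_test (a sketch; `train` is the same assumed
# DataFrame as above, and the hard-coded row labels fit a fiber-optic split):
#
#   has_fiber = (train.internet_service_type == 'Fiber optic').rename('fiber_optic')
#   chi_test(has_fiber, train.churn)

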
def chi_test_lite(cat, target, alpha=0.05):
    '''
    Takes in a category for comparison with the passed target and
    default alpha=0.05, then creates a crosstab DataFrame containing
    the observed values

    Performs a chi2_contingency to find the p-value, then prints
    alpha, the p-value, and whether to reject or fail to reject the
    null hypothesis

    Simpler output version of chi_test
    '''

    # set observed DataFrame with crosstab
    observed = pd.crosstab(cat, target)
    # assign returned values from chi2_contingency
    chi2, p, degf, expected = chi2_contingency(observed)
    # set null hypothesis
    null_hyp = f'{target.name} is independent of {cat.name}'
    # print alpha and p-value
    print(f'''
alpha: {alpha}
p-value: {p:.1g}''')
    # reject the null hypothesis when p is below the significance level
    if p < alpha:
        print(f'''
Due to our p-value of {p:.1g} being less than our significance level of {alpha}, we must reject the null hypothesis
that {null_hyp}.
''')
    # otherwise fail to reject the null hypothesis
    else:
        print(f'''
Due to our p-value of {p:.1g} being greater than or equal to our significance level of {alpha}, we fail to reject the null hypothesis
that {null_hyp}.
''')
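

# Example usage for chi_test_lite (same assumed `train` DataFrame; unlike
# chi_test, no table is printed, so crosstabs of any shape work):
#
#   chi_test_lite(train.contract_type, train.churn)

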
def cmatrix(y_true, y_pred):
    '''
    Takes in true and predicted values to create a confusion matrix,
    then outputs a dictionary holding the true positive, true
    negative, false positive, and false negative rates discerned from
    the matrix

    Used in conjunction with model_report
    '''

    # unpack TN, FP, FN, TP from the flattened confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # calculate rates from the counts
    tpr = tp / (tp + fn)
    tnr = tn / (tn + fp)
    fpr = 1 - tnr
    fnr = 1 - tpr
    cmatrix_dict = {'tpr': tpr, 'tnr': tnr, 'fpr': fpr, 'fnr': fnr}
    return cmatrix_dict
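

# Example usage for cmatrix (a minimal sketch with hard-coded binary labels):
#
#   cmatrix(y_true=[0, 1, 1, 0], y_pred=[0, 1, 0, 0])
#   # -> {'tpr': 0.5, 'tnr': 1.0, 'fpr': 0.0, 'fnr': 0.5}

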
def model_report(y_true, y_pred, lite=False):
    '''
    Takes in true and predicted values to create a classification
    report dictionary and uses the cmatrix function to obtain positive
    and negative prediction rates, then prints a table containing all
    metrics for the positive class of the target

    Pass lite=True for a shorter report
    '''

    # create dictionaries for classification report and confusion matrix
    report_dict = classification_report(y_true, y_pred, output_dict=True)
    cmatrix_dict = cmatrix(y_true, y_pred)
    # print formatted table with desired information for model report
    if not lite:
        print(f'''
          *** Model Report ***
          ---------------------
 _____________________________________________
|          Positive Case: churn==1            |
|          Negative Case: churn==0            |
|---------------------------------------------|
| Accuracy:            {report_dict['accuracy']:>8.2%}               |
| True Positive Rate:  {cmatrix_dict['tpr']:>8.2%}               |
| False Positive Rate: {cmatrix_dict['fpr']:>8.2%}               |
| True Negative Rate:  {cmatrix_dict['tnr']:>8.2%}               |
| False Negative Rate: {cmatrix_dict['fnr']:>8.2%}               |
| Precision:           {report_dict['1']['precision']:>8.2%}               |
| Recall:              {report_dict['1']['recall']:>8.2%}               |
| F1-Score:            {report_dict['1']['f1-score']:>8.2%}               |
|                                             |
| Positive Support:    {report_dict['1']['support']:>8}               |
| Negative Support:    {report_dict['0']['support']:>8}               |
| Total Support:       {report_dict['macro avg']['support']:>8}               |
|_____________________________________________|
''')
    else:
        print(f'''
          *** Model Report ***
          ---------------------
 _____________________________________________
|          Positive Case: churn==1            |
|          Negative Case: churn==0            |
|---------------------------------------------|
| Accuracy:            {report_dict['accuracy']:>8.2%}               |
| Precision:           {report_dict['1']['precision']:>8.2%}               |
| Recall:              {report_dict['1']['recall']:>8.2%}               |
| Total Support:       {report_dict['macro avg']['support']:>8}               |
|_____________________________________________|
''')
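

# Example usage for model_report (a sketch; `y_validate`, `X_validate`, and
# `model` are assumed to exist, with 0/1 churn labels):
#
#   model_report(y_validate, model.predict(X_validate), lite=True)

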
def validate(X, y, model, lite=False):
    '''
    Takes in a feature DataFrame, the true target, and a fitted model,
    then prints the model_report for the model's predictions on the
    validation dataset
    '''

    # assign model predictions on validate data
    y_pred = model.predict(X)
    # print model metrics on validate data
    model_report(y, y_pred, lite=lite)
    return y_pred


def final_test(X, y, model):
    '''
    Takes in a feature DataFrame, the true target, and a fitted model,
    then prints the full model_report for the model's predictions on
    the test dataset

    Same as validate, but always prints the full report
    '''

    # assign model predictions on test data
    y_pred = model.predict(X)
    # print model metrics on test data
    model_report(y, y_pred, lite=False)
    return y_pred
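

# Example usage for validate and final_test (a sketch, assuming a fitted
# sklearn classifier `clf` and prepared validate/test splits -- the names
# are illustrative, not part of this module):
#
#   y_pred_val = validate(X_validate, y_validate, clf, lite=True)
#   y_pred_test = final_test(X_test, y_test, clf)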