This repository has been archived by the owner on Oct 30, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathshow_diff.py
159 lines (127 loc) · 5.93 KB
/
show_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from data_io import DataIO
import numpy as np
import matplotlib.pyplot as plt
# Data accessor from the project's data_io module, built from Settings.json.
dio = DataIO("Settings.json")

# Names of the models whose stored predictions are fetched (through
# dio.get_prediction) and plotted.  Commented-out entries are alternative
# runs kept so they can be switched back on quickly.
model_names = [
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log",
    "vowpall",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_14split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_rf10_4split_new1_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_rf10_4split_newOKsalPredictValid_log",
    "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_new_faked_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newOKsalPredictValid_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid_log",
    "Ridge_tfidf_05d_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid1_log",
]

# Salaries of the "valid" split, requested with log=False
# (presumably the un-logged values -- confirm against DataIO.get_salaries).
valid_salaries = dio.get_salaries("valid", log=False)

# Axis limits handed to the histogram plot of prediction errors.
ylim = (0, 8000)
xlim = (-50000, 50000)

# Module-level grid flag; the plotting calls in this file pass grid=True
# literally rather than reading this name.
grid = True
def encode_salaries(salaries, bins, low=11500.0, high=100000):
    """Map every salary to the index of an equal-width salary bin.

    ``bins + 1`` evenly spaced edges are laid out from ``low`` to ``high``
    (both included) and each salary is located among them with
    ``np.searchsorted(..., side="right")``.  Consequently a salary below
    ``low`` gets index 0, a salary inside the k-th interval (1-based) gets
    index k, and a salary at or above ``high`` gets index ``bins + 1``.

    Args:
        salaries: Array-like of salary values.
        bins: Number of equal-width intervals between ``low`` and ``high``.
        low: Lowest bin edge.  Defaults to 11500.0, the value that used to
            be hard-coded here.
        high: Highest bin edge.  Defaults to 100000, the value that used to
            be hard-coded here.

    Returns:
        Tuple ``(idxs, bin_edges)``: the bin index of every salary and the
        array of ``bins + 1`` edges.
    """
    bin_edges = np.linspace(low, high, bins + 1, endpoint=True)
    # Diagnostic output kept from the original script: the width of each bin.
    # The single-argument parenthesised form prints the same text under
    # Python 2 and Python 3.
    print(np.diff(bin_edges))
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs, bin_edges
def _report_prediction_stats(model_name, abs_diff, prediction_salaries):
    """Print the absolute-error summary and prediction quartiles of a model."""
    print(model_name)
    print("min diff: {:6,.4f}".format(abs_diff.min()))
    print("max diff: {:6,.4f}".format(abs_diff.max()))
    print("std diff: {:6,.4f}".format(abs_diff.std()))
    print("mean diff: {:6,.4f}".format(abs_diff.mean()))
    print("median diff: {:6,.4f}".format(np.median(abs_diff)))
    # np.percentile takes percentages in [0, 100].  The fractions
    # 0.25 / 0.5 / 0.75 / 1 that used to be passed here all landed right next
    # to the minimum instead of giving min, quartiles and max.
    quantile = np.percentile(prediction_salaries, [0, 25, 50, 75, 100])
    # Single-argument print keeps the output identical on Python 2 and 3
    # (the original two-item print statement emitted two spaces here).
    print("quantile predictions:  {0}".format(quantile))


def my_plot(plot_smt, filename=None, transform_prediction=np.exp, type_n="valid", ylim=None, xlim=None, grid=False, xlabel='Difference between salary and prediction', ylabel='Number of diferences'):
    """Draw one subplot per model in ``model_names`` and show the figure.

    For every model the stored predictions are fetched through
    ``dio.get_prediction``, passed through ``transform_prediction``, compared
    with the module-level ``valid_salaries``, summarised on stdout and handed
    to ``plot_smt`` for drawing.  The module-level names ``model_names``,
    ``dio`` and ``valid_salaries`` are looked up when the function is called.

    Args:
        plot_smt: Callable ``(axis, diff, abs_diff, predictions)`` that draws
            onto the given subplot axis.
        filename: When not None, the figure is saved there before it is shown.
        transform_prediction: Function applied to the fetched predictions.
            Defaults to ``np.exp`` (presumably because the predictions are
            stored in log space -- confirm against DataIO).
        type_n: Forwarded to ``dio.get_prediction``.  Note that the
            difference is always taken against ``valid_salaries``, whatever
            value is given here.
        ylim: Y-axis limits applied to every subplot when not None.
        xlim: X-axis limits applied to every subplot when not None.
        grid: Passed to ``ax.grid`` for every subplot.
        xlabel: X-axis label, set on the last subplot only.
        ylabel: Y-axis label, set on every subplot.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file.
    # The nesting below (x label only on the last subplot, save/show once
    # after the loop) is the reading that matches the single shared figure --
    # confirm against the original layout.
    fig = plt.figure()
    num_models = len(model_names)
    for idx, model_name in enumerate(model_names):
        raw_prediction = dio.get_prediction(model_name=model_name, type_n=type_n)
        prediction_salaries = transform_prediction(raw_prediction)
        diff = prediction_salaries - valid_salaries
        abs_diff = np.abs(diff)
        _report_prediction_stats(model_name, abs_diff, prediction_salaries)

        # Subplots are stacked vertically, one row per model.
        ax = fig.add_subplot(num_models, 1, idx + 1)
        plot_smt(ax, diff, abs_diff, prediction_salaries)
        ax.grid(grid)
        if ylim is not None:
            ax.set_ylim(ylim)
        if xlim is not None:
            ax.set_xlim(xlim)
        if idx == num_models - 1:
            ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(model_name)
        # Progress output kept from the original script.
        print(idx)
    if filename is not None:
        plt.savefig(filename)
    plt.show()
def plot_hist(axis, diff, abs_diff, cur_pred):
    """Draw a histogram of ``diff`` with 200 bins on ``axis``.

    ``abs_diff`` and ``cur_pred`` are not used; they are accepted so that the
    signature matches the other plotting callbacks handed to ``my_plot``.
    """
    n_bins = 200
    axis.hist(diff, bins=n_bins)
def plot_hist2d(axis, diff, abs_diff, cur_pred):
    """Draw a 2D histogram (``bins=200``) of sample position versus ``diff``.

    ``abs_diff`` and ``cur_pred`` are not used; they are accepted so that the
    signature matches the other plotting callbacks handed to ``my_plot``.
    """
    # The x coordinate is simply the position of each value inside diff.
    positions = range(len(diff))
    axis.hist2d(positions, diff, bins=200)
def plot_hist2d1(axis, diff, abs_diff, cur_pred):
    """Draw a 2D histogram of actual versus predicted salary plus a y = x line.

    The module-level ``valid_salaries`` supplies the x values and ``cur_pred``
    the y values (``bins=200``).  ``valid_salaries`` plotted against itself in
    red marks where a perfect prediction would lie.  ``diff`` and
    ``abs_diff`` are not used.
    """
    axis.hist2d(valid_salaries, cur_pred, bins=200)
    # Axes.hold was deprecated in matplotlib 2.0 and removed in 3.0, where
    # drawing on top of existing artists is always on.  Call it only on
    # versions that still provide it, so newer ones do not raise
    # AttributeError.
    if hasattr(axis, "hold"):
        axis.hold(True)
    axis.plot(valid_salaries, valid_salaries, color='r')
def plot(axis, diff, abs_diff, cur_pred):
    """Plot ``diff`` reordered by ascending value of ``valid_salaries``.

    ``abs_diff`` and ``cur_pred`` are not used; they are accepted so that the
    signature matches the other plotting callbacks handed to ``my_plot``.
    """
    order = np.argsort(valid_salaries)
    diff_by_salary = diff[order]
    axis.plot(diff_by_salary)
def plot_sorted(axis, diff, abs_diff, cur_pred):
    """Plot ``diff`` and the actual salaries, both ordered by actual salary.

    ``diff`` is drawn in blue and the module-level ``valid_salaries`` in
    green, each reordered by ascending ``valid_salaries``.  ``abs_diff`` and
    ``cur_pred`` are not used.
    """
    sort_indices = np.argsort(valid_salaries)
    axis.plot(diff[sort_indices], color='b')
    # Axes.hold was deprecated in matplotlib 2.0 and removed in 3.0, where
    # drawing on top of existing artists is always on.  Call it only on
    # versions that still provide it, so newer ones do not raise
    # AttributeError.
    if hasattr(axis, "hold"):
        axis.hold(True)
    axis.plot(valid_salaries[sort_indices], color='g')
def plot_valid_pred_sorted(axis, diff, abs_diff, cur_pred):
    """Plot predictions and actual salaries, both ordered by actual salary.

    ``cur_pred`` is drawn in red and the module-level ``valid_salaries`` in
    green, each reordered by ascending ``valid_salaries``.  ``diff`` and
    ``abs_diff`` are not used.
    """
    sort_indices = np.argsort(valid_salaries)
    axis.plot(cur_pred[sort_indices], color='r')
    # Axes.hold was deprecated in matplotlib 2.0 and removed in 3.0, where
    # drawing on top of existing artists is always on.  Call it only on
    # versions that still provide it, so newer ones do not raise
    # AttributeError.
    if hasattr(axis, "hold"):
        axis.hold(True)
    axis.plot(valid_salaries[sort_indices], color='g')
    # Disabled overlays kept from the original script:
    #axis.plot(valid_salaries_enc[sort_indices] * 1000, color='y')
    #axis.plot(upper_limits[sort_indices], color='b')
# Encode the validation salaries into 4 bins.  The results are referenced
# only by commented-out code, but the call also prints the bin widths.
valid_salaries_enc, bin_edges = encode_salaries(valid_salaries, 4)
#bin_edges = list(bin_edges)
#bin_edges.insert(0, 0)
#bin_edges.append(valid_salaries.max() + 1)
#upper_limits = np.array(map(lambda x: bin_edges[x], valid_salaries_enc))

# One figure per plot type, each with one subplot per entry in model_names.
my_plot(plot_hist, ylim=ylim, xlim=xlim, grid=True)
my_plot(plot_hist2d)
my_plot(plot_hist2d1, xlabel="salarie", ylabel="predicted salarie")
my_plot(plot, xlabel="salarie", ylabel="predicted salarie")
my_plot(plot_sorted, xlabel="Ad", ylabel="diff from valid salarie")
my_plot(plot_valid_pred_sorted, xlabel="Ad", ylabel="valid salarie predicted salarie")

# Stop after the regression plots.  This used to be `os.exit()`, which only
# halted the script by crashing with NameError: `os` is never imported and the
# os module has no exit() function.  SystemExit ends the run cleanly and needs
# no import.
raise SystemExit(0)

# NOTE: everything below is unreachable while the exit above is in place.  It
# is the classifier comparison, kept so it can be re-enabled by removing the
# exit.  It replaces valid_salaries with the bin indices and model_names with
# the classifier runs before plotting.
valid_salaries, bin_edges = encode_salaries(valid_salaries, 4)
model_names = [
    #"sgd_class_tfidf_titleFullLoc_bin4",
    #"multinomialnb_tfidf_titleFullLoc_bin4",
    #"randomForest_tfidf_titleFullLoc_bin4",
    "multinomialnb_tfidf_titleFullLoc_f1_bin4",
    "sgd_class_tfidf_titleFullLoc_f1_bin4",
    "randomForest_tfidf_titleFullLoc_f1_bin4",
    "extraTree_tfidf_titleFullLoc_f1_bin4",
]
# Predictions are used as fetched here, hence the identity transform.
my_plot(plot_valid_pred_sorted, transform_prediction=lambda x: x, xlabel="Ad", ylabel="salarie class", type_n="valid_classes")
my_plot(plot_hist, transform_prediction=lambda x: x, xlabel="Ad", ylabel="salarie class", type_n="valid_classes")
#sorted_diff_indices = np.argsort(diff)
#print diff[sorted_diff_indices[500]]
#print diff.shape
#diffBiger = diff > 40000
#bigIds = valid_salaries_ids[diffBiger]
#bigDiffs = diff[diffBiger]
#bigSalares = valid_salaries[diffBiger]
#bigPredictions = valid_predictions[diffBiger]
#print "Vecjih od 40000", bigPredictions.shape[0]
#print bigSalares
#print bigIds
#n = 0
#for bigId, bigSalarie, bigPrediction, bigDiff in zip(bigIds, bigSalares, bigPredictions, bigDiffs):
#print "ID: %i Salarie: %0.2f Prediction: %0.2f Difference: %0.2f" % (bigId, bigSalarie, bigPrediction, bigDiff)
#n = n + 1
#if n > 100:
#break