-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
150 lines (109 loc) · 6.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Created by Antonio Di Mariano ([email protected]) at 22/11/2019
from classes.EDA import EDA
from classes.LinearAlgorithms import LineaRegression_Prediction
_std_label_to_study = 'SalePrice'
if __name__ == "__main__":
# first load test and train data
eda = EDA(test_data_csv='test.csv', train_data_csv='train.csv')
# features's categories. I divide the categories in quantitatives and qualitatives
train_categorized_variables = eda.extract_quantitative_and_qualitative_variables(data_set=eda.train_data)
test_categorized_variables = eda.extract_quantitative_and_qualitative_variables(data_set=eda.test_data)
# HEre is you want to have a nice output :)
# print("Quantitative's features:",categorized_variables.get('quantitative'))
# print("Qualitative's features:", categorized_variables.get('qualitative'))
# see the correlation with quantitative features
# print("---------------------------------")
eda.print_bivariants_correlation_with_quantitative_features(label=_std_label_to_study, data_set=eda.train_data,
quantitative_list=train_categorized_variables.get(
'quantitative'))
# Plot the SalePrice #histogram and normal probability plot
eda.histogram_and_normal_probability(data_set=eda.train_data, label=_std_label_to_study)
# Correlation matrix
correlation_mat_df = eda.correlation_matrix(data_set=eda.train_data)
print(correlation_mat_df)
# plot a detailed correlation matrix
eda.detailed_correlation_matrix(data_set=eda.train_data,matrix=correlation_mat_df,feature_to_zoom=_std_label_to_study,show=True)
#
# Here for each of the qualitative and quantitative features a plot is created.
#
"""
for feature in categorized_variables['qualitative']:
eda.plot_qualitative_relationship_against_lable(data_set=eda.train_data, feature=str(feature), label=_std_label_to_study)
for feature in categorized_variables['quantitative']:
eda.plot_quantitative_relationship_against_lable(data_set=eda.train_data, feature=str(feature),label=_std_label_to_study)
"""
# Value Missing and Imputing
eda.get_total_and_percentage_of_missing_data(title='Quantitative features',
dataset=eda.train_data[train_categorized_variables.get('quantitative')])
eda.get_total_and_percentage_of_missing_data(title='Qualitative features',
dataset=eda.train_data[train_categorized_variables.get('qualitative')])
# now imputing missing values with scalar and values and then check again
## For train data
for quantitative in train_categorized_variables['quantitative']:
eda.imputing_missing_value_with_value(dataset=eda.train_data, feature_with_missing_value=quantitative)
for qualitative in train_categorized_variables['qualitative']:
eda.imputing_missing_value_with_value(dataset=eda.train_data, feature_with_missing_value=qualitative,
value_to_fill='NA')
## For test data
eda.get_total_and_percentage_of_missing_data(title='Quantitative features',
dataset=eda.test_data[train_categorized_variables.get('quantitative')])
eda.get_total_and_percentage_of_missing_data(title='Qualitative features',
dataset=eda.test_data[train_categorized_variables.get('qualitative')])
## here we can plot a grid to visualize how each qualitative features is distributed
eda.visualize_the_distribution_grid_of_numerical_features(dataset=eda.train_data,label=_std_label_to_study,quantitative_list=train_categorized_variables.get('quantitative'))
## here we can plot a grid to visualize how each quantitative features is distributed
eda.visualize_the_scatter_plot_grid_of_numerical_features(dataset=eda.train_data, label=_std_label_to_study,quantitative_list=train_categorized_variables.get('quantitative'))
"""
here we can plot a scatter plot with automatic regression btw SalePrice and one of the feature with the highest correlation
We see how data are distributed against a linear regression
"""
eda.plot_scatter_plot_with_automatic_regression(label_a='SalePrice', label_b='GrLivArea', data=eda.train_data)
eda.histogram(data=eda.train_data[_std_label_to_study])
"""
we see that the distribution is skewed.
To make the distribution more symmetric, we can try taking its logarithm
"""
eda.histogram(data=eda.train_data[_std_label_to_study],log_data=True)
"""
Dummification.
I selected all the qualitative features
"""
eda.dummification(data=eda.train_data,qualitative_list=train_categorized_variables.get('qualitative'))
list_of_added_features = eda.features_engineering_against_a_dataset(data=eda.train_data)
print("Added features:",list_of_added_features)
for feature in list_of_added_features:
print(eda.train_data[feature].head())
# This dict has the ratio I want to add
ratio_to_add_dict = {
'Total_num_of_bathrooms': {
'against':'TotalArea',
'col_name':'Total_num_of_bathrooms_vs_TotalArea'
},
'Total_num_of_halfbathrooms': {
'against': 'TotalArea',
'col_name': 'Total_num_of_halfbathrooms_vs_TotalArea'
},
'GarageCars': {
'against': 'TotalArea',
'col_name': 'GarageCars_vs_TotalArea'
}
}
for ratio in ratio_to_add_dict:
eda.adding_ratio_between_columns(data=eda.train_data, feature_a=ratio,
feature_b=ratio_to_add_dict[ratio].get('against'),
column_to_add=ratio_to_add_dict[ratio].get('col_name'))
features_to_model = list(ratio_to_add_dict.keys()) +list_of_added_features
print("Features:",features_to_model)
## Model
linear_reg_with_train_data = LineaRegression_Prediction(dataset=eda.train_data, features_list=features_to_model, label=_std_label_to_study)
linear_reg_with_train_data.build_training_set()
linear_reg_with_train_data.fit_model(set_x=linear_reg_with_train_data.training_set_x, set_y=linear_reg_with_train_data.training_set_y)
linear_reg_with_train_data.make_prediction(validation_data=linear_reg_with_train_data.validation_data_x)
# now with all dataset
X = eda.train_data[features_to_model]
y = eda.train_data[_std_label_to_study]
linear_reg_with_train_all_data = LineaRegression_Prediction(dataset=eda.train_data, features_list=features_to_model, label=_std_label_to_study)
linear_reg_with_train_all_data.fit_model(set_x=X,set_y=y)
linear_reg_with_train_data.make_prediction(validation_data=linear_reg_with_train_data.validation_data_x)
linear_reg_with_train_data.print_out(test_data=eda.test_data)
#Make prediciton on test data