-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathtrain.py
98 lines (82 loc) · 4.19 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""Main module."""
import pandas as pd
import os
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
if __name__ == "__main__":
    # Train a logistic-regression classifier for the 'default' column of
    # data.csv, print its accuracy on a held-out split, then save the fitted
    # pipeline and the test split under trained_model/<timestamp>/.

    # Timestamp that names this run's output directory.
    run_started = datetime.now()
    dt_string = run_started.strftime("%d-%m-%Y_%H.%M.%S")

    # To download the data go to
    # https://github.com/Giskard-AI/giskard-client/tree/main/sample_data/classification
    data_file = "data.csv"
    credit = pd.read_csv(data_file, sep=',', engine="python")

    # Declare the type of each column in the dataset
    # (example: category, numeric, text).
    column_types = {
        'default': "category",
        'account_check_status': "category",
        'duration_in_month': "numeric",
        'credit_history': "category",
        'purpose': "category",
        'credit_amount': "numeric",
        'savings': "category",
        'present_employment_since': "category",
        'installment_as_income_perc': "numeric",
        'sex': "category",
        'personal_status': "category",
        'other_debtors': "category",
        'present_residence_since': "numeric",
        'property': "category",
        'age': "numeric",
        'other_installment_plans': "category",
        'housing': "category",
        'credits_this_bank': "numeric",
        'job': "category",
        'people_under_maintenance': "numeric",
        'telephone': "category",
        'foreign_worker': "category",
    }

    # feature_types declares the features the model is trained on: every
    # declared column except the 'default' target.
    feature_types = {
        name: kind for name, kind in column_types.items() if name != 'default'
    }

    # Pipeline to fill missing values (median) and scale the numeric columns.
    columns_to_scale = [
        name for name, kind in feature_types.items() if kind == "numeric"
    ]
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Pipeline to fill missing values (constant 'missing') and one-hot encode
    # the categorical columns.
    columns_to_encode = [
        name for name, kind in feature_types.items() if kind == "category"
    ]
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
    # `sparse_output` and later releases dropped the old name; try the new
    # keyword first and fall back so the script runs on either side.
    try:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore',
                                        sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', one_hot_encoder),
    ])

    # Perform preprocessing of the columns with the above pipelines.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, columns_to_scale),
            ('cat', categorical_transformer, columns_to_encode),
        ]
    )

    # Pipeline for the model: preprocessing followed by Logistic Regression.
    clf_logistic_regression = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    # Split the data into train and test (80/20, stratified on the target,
    # fixed seed so the split is reproducible).
    Y = credit['default']
    X = credit.drop(columns="default")
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=0.20, random_state=30, stratify=Y
    )

    # Fit and score your model.
    clf_logistic_regression.fit(X_train, Y_train)
    print("model score: ", clf_logistic_regression.score(X_test, Y_test))

    # Prepare data to upload on Giskard: test features with the target column
    # appended.
    test_data = pd.concat([X_test, Y_test], axis=1)

    # Dumping.
    trained_model_path = 'trained_model'
    output_dir = os.path.join(trained_model_path, dt_string)
    # makedirs creates both directory levels, is a no-op when they already
    # exist, and raises on failure instead of silently ignoring a failed
    # shell `mkdir`.
    os.makedirs(output_dir, exist_ok=True)

    model_filename = os.path.join(output_dir, 'logistic_regression_model.pkl')
    test_data_filename = os.path.join(output_dir, 'test_data.zip')

    # Context manager guarantees the model file is flushed and closed.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(clf_logistic_regression, model_file)
    # The test split is written as a zip-compressed pickle.
    test_data.to_pickle(test_data_filename, compression='zip')