# knn-classifier.py
"""
Created on Sun Nov 5 12:29:10 2023
@author: iasonas kakandris
@author: kostantina Gatzi
Course : Pattern Recognition
"""
#%% import libraries
import pandas as pd
import matplotlib.pyplot as plt # for data visualization purposes
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix # To check for TP , TN , FP , FN
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import os
#%% Import data
# The dataset is looked up in the directory the script is launched from.
file_name = 'dataset.csv'  # Adjust the file name as needed
current_dir = os.getcwd()
data = os.path.join(current_dir, file_name)
# header=None: the first line of the CSV is read as data, not as column names.
df = pd.read_csv(data, header=None)
df.shape  # bare expression; has no effect when the file is run as a script
#%% change to nd array for easiness
# From here on `df` is a plain NumPy array rather than a DataFrame.
df = df.to_numpy()
#%% load the X and y data
# Every column but the last goes to X; the last column becomes y
# (the target later passed to the classifiers).
X, y = df[:, :-1], df[:, -1]
#%% training and test data
# Half of the samples are held out for testing; the fixed seed makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)
#%% Scaling
# The scaler is fitted on the training half only and the same transform is
# then applied to both halves.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#%% knn classifiers initialization
# KNeighborsClassifier is already imported at the top of the file.
classifiers = 10  # number of models; the k-th model uses k neighbours
# One unfitted classifier per neighbourhood size 1..classifiers.
knn_classifier = [
    KNeighborsClassifier(n_neighbors=k) for k in range(1, classifiers + 1)
]
y_prediction = []  # test-set predictions, one entry per classifier
#%% model fit
# Train each classifier on the training set and keep its predictions for the
# test set, in the same order as `knn_classifier`.
for model in knn_classifier[:classifiers]:
    model.fit(X_train, y_train)
    y_prediction.append(model.predict(X_test))
#%% accuracy score
# Report the test-set accuracy of every model; k is the number of neighbours.
for k, predicted in enumerate(y_prediction[:classifiers], start=1):
    test_accuracy = accuracy_score(y_test, predicted)
    print('Model accuracy score for ', k, ' neighboor(s): {0:0.4f}'.format(test_accuracy))
#%% training set score
# Predict the training samples with every fitted model, then report how well
# each model reproduces the labels it was trained on.
y_prediction_train = [
    model.predict(X_train) for model in knn_classifier[:classifiers]
]
for k, predicted in enumerate(y_prediction_train, start=1):
    train_accuracy = accuracy_score(y_train, predicted)
    print('Training-set accuracy score for', k, ' neighboor(s): {0:0.4f}'.format(train_accuracy))
#%% Confusion Matrix
class_names = ['1', '2', '3']  # tick labels used on both heatmap axes
# For every model: draw the confusion matrix of its test-set predictions as a
# heatmap and print the per-class classification report.
for k, predicted in enumerate(y_prediction[:classifiers], start=1):
    print("Number of neighbors is {}".format(k))
    cm = confusion_matrix(y_test, predicted)
    heatmap = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='YlGnBu',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    heatmap.set_title("Confusion matrix for {} neighbor(s)".format(k))
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    print(classification_report(y_test, predicted))
    plt.show()
    # Blank lines to separate the output of consecutive models.
    for _ in range(3):
        print("\n")
#%% Data presentation
def _scatter_by_class(position, features, labels, title):
    """Draw one panel of the 2x2 figure.

    Plots columns 0 and 1 of ``features`` as one scatter series per class
    (1, 2 and 3, in that order), picking the rows whose entry in ``labels``
    equals the class.

    Args:
        position: 1-based index of the panel inside the 2x2 subplot grid.
        features: 2-D array of samples; only the first two columns are drawn.
        labels: one class label per row of ``features`` (true or predicted).
        title: text shown above the panel.
    """
    plt.subplot(2, 2, position)
    for class_label in (1, 2, 3):
        members = features[np.isin(labels, class_label).flatten()]
        plt.scatter(members[:, 0], members[:, 1])
    # Legend entries follow the order in which the series were drawn.
    plt.legend(['Class 1', 'Class 2', 'Class 3'])
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')


# One figure per model: true labels on the left, that model's predictions on
# the right; training data on the top row, test data on the bottom row.
for i in range(classifiers):
    _scatter_by_class(1, X_train, y_train, 'Data of train set')
    _scatter_by_class(2, X_train, y_prediction_train[i], 'Data of predicted train set')
    _scatter_by_class(3, X_test, y_test, 'Data of test set')
    _scatter_by_class(4, X_test, y_prediction[i], 'Data of predicted test set')
    plt.suptitle("Number of neighbors is {}".format(i+1))
    plt.show()
# =============================================================================
# We can see that the best model is the one with 5 neighbors:
# Model accuracy score for 5 neighboor(s): 0.8929
# Training-set accuracy score for 5 neighboor(s): 0.9000
# The training and test accuracies are close, so we also see that we don't
# have overfitting.
# =============================================================================