-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkmeansppclustering.py
91 lines (84 loc) · 2.63 KB
/
kmeansppclustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import sys
import numpy as np
import scipy.io as sio
#import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.svm import SVC
filename = sys.argv[1]
datafile = sio.loadmat(filename)
data = datafile['variable_name']
sizedata=[len(data), len(data[0])]
disp = []
optimal_ks = []
#Determining the optimal number of k with gap statistic method
def gap_statistic(data):
sizedata = [len(data),len(data[0])]
SD = []
gap = []
for knum in xrange(0,20):
print knum
#Clustering original Data
kmeanspp = KMeans(n_clusters=knum,init = 'k-means++',max_iter = 100,n_jobs = 1)
kmeanspp.fit(data)
dispersion = kmeanspp.inertia_
#Clustering Reference Data
nrefs = 10
refDisp = np.zeros(nrefs)
for nref in xrange(nrefs):
refdata = np.random.random_sample(tuple(sizedata))
refkmeans = KMeans(n_clusters=knum,init='k-means++',max_iter=100,n_jobs=1)
refkmeans.fit(refdata)
refdisp = refkmeans.inertia_
refDisp[nref]=np.log(refdisp)
mean_log_refdisp = np.mean(refDisp)
gap.append(mean_log_refdisp-np.log(dispersion))
sd = (sum([(r-m)**2 for r,m in zip(refDisp,[mean_log_refdisp]*nrefs)])/nrefs)**0.5
SD.append(sd)
SD = [sd*((1+(1/nrefs))**0.5) for sd in SD]
opt_k = None
diff = []
for i in xrange(len(gap)-1):
diff = (SD[i+1]-(gap[i+1]-gap[i]))
if diff>0:
opt_k = i+10
break
if opt_k < 20:
print opt_k
return opt_k
else:
return 20
#Here maximum limit is used 20 because the data used while testing this code was not supposed to have more than 20 clusters
#However it may be changed according to need. Hence the last if-else statement may not be reuired in that case.
ntrials = 50
for ntrial in xrange(ntrials):
print 'ntrial: ',ntrial
optimal_ks.append(gap_statistic(data))
#Plotting the Gap curve(Values to be provided accordingly)
#plt.plot(np.linspace(10,19,10,True),gap)
#plt.show()
#The following lines of code were written to test if there are more than one clusters for which optimal number of K is obtained
#Though that should not happen
unique_opt_k = list(set(optimal_ks))
k_count = {}
count_opt_k = 0
second_opt_k = 0
opt_k = 0
for u_o_k in unique_opt_k:
count = optimal_ks.count(u_o_k)
k_count[u_o_k]=count
if count>count_opt_k:
count_opt_k = count
opt_k = u_o_k
elif count==count_opt_k:
second_opt_k = u_o_k
print opt_k
#Clusterin with optimal number of k
kmeanspp = KMeans(n_clusters = opt_k,init='k-means++',max_iter=100,n_jobs=1)
kmeanspp.fit(data)
centers = kmeanspp.cluster_centers_
clusterlabels = kmeanspp.labels_
print clusterlabels
mdict = {}
mdict['clusterlabels'] = clusterlabels
sio.savemat('clusterlabels.mat',mdict,format = '4',oned_as = 'column')
print 'dan dana dan done...'