-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathkmeans.py
More file actions
64 lines (52 loc) · 2.11 KB
/
kmeans.py
File metadata and controls
64 lines (52 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import matplotlib.pylab as plt
from numpy import *
from random import *
import distance
def kmeans(X, k, dist=distance.euclid):
m, n = shape(X)
centroids = mat([[randrange(min(X[:, i]), max(X[:, i])) for i in xrange(n)]
for x in xrange(k)])
dist_label = zeros((m, 2))
cluster_changed = True
while cluster_changed:
cluster_changed = False
for i, x in enumerate(X):
d, l = min([(dist(x, c) ** 2, j) for j, c in enumerate(centroids)])
cluster_changed = cluster_changed or l != dist_label[i, 1]
dist_label[i] = d, l
for i in xrange(k):
sub_X = X[nonzero(dist_label[:, 1] == i)[0]]
if len(sub_X) != 0:
centroids[i, :] = mean(sub_X, axis=0)
return dist_label, centroids
def bi_kmeans(X, k, dist=distance.euclid):
m, n = shape(X)
centroids = [[mean(X[:, i]) for i in xrange(n)]]
dist_label = zeros((m, 2))
for i, x in enumerate(X):
dist_label[i] = dist(x, centroids[0]) ** 2, 0
best_label = -1
best_sse = inf
best_sub_centroids = None
while len(centroids) < k:
for i, c in enumerate(centroids):
sub_X = X[nonzero(dist_label[:, 1] == i)[0]]
sub_dist_label, sub_centroids = kmeans(sub_X, 2)
sse = sum(dist_label[nonzero(dist_label[:, 1] != i)[0]]) + sum(
sub_dist_label)
# find the best cluster
if best_sse > sse:
best_sse = sse
best_label = i
best_dist_label = sub_dist_label
best_sub_centroids = sub_centroids
# update centroid by the best cluster
centroids[best_label] = best_sub_centroids[0]
centroids.append(best_sub_centroids[1])
# update the label of X
for i, j in zip(nonzero(dist_label[:, 1] == best_label)[0], xrange(len(best_dist_label))):
if best_dist_label[j][1] == 1:
dist_label[i] = best_dist_label[j][0], len(centroids) - 1
else:
dist_label[i, 0] = best_dist_label[j][0]
return dist_label, centroids