diff --git a/KMeans/kmeans.py b/KMeans/kmeans.py
index 464c57c..dab9f58 100644
--- a/KMeans/kmeans.py
+++ b/KMeans/kmeans.py
@@ -1,170 +1,144 @@
-#coding=utf-8
-
-'''
-@author: wepon, http://2hwp.com
-Reference:
-    Book: <<Machine Learning in Action>>
-    Software: sklearn.cluster.KMeans
-
-'''
 import numpy as np
 
-class KMeans(object):
-    """
-
-    参数
-    n_clusters:
-        聚类个数,即k
-    initCent:
-        质心初始化方式,可选"random"或指定一个具体的array,默认random,即随机初始化
-    max_iter:
-        最大迭代次数
-    """
-    def __init__(self,n_clusters=5,initCent='random',max_iter=300):
-        if hasattr(initCent, '__array__'):
-            n_clusters = initCent.shape[0]
-            self.centroids = np.asarray(initCent, dtype=np.float)
-        else:
-            self.centroids = None
-
+class KMeans:
+    """K-means clustering.
+
+    n_clusters: number of clusters (k).
+    initCent:   centroid initialisation; 'random' draws centroids uniformly within the data bounds.
+    max_iter:   maximum number of iterations.
+    """
+    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
         self.n_clusters = n_clusters
         self.max_iter = max_iter
         self.initCent = initCent
-        self.clusterAssment = None
-        self.labels = None
-        self.sse = None
-
-    #计算两点的欧式距离
-    def _distEclud(self, vecA, vecB):
+        self.centroids = None
+        self.cluster_assessment = None
+        self.labels = None
+        self.sse = None
+
+    def _dist_euclidean(self, vecA, vecB):
         return np.linalg.norm(vecA - vecB)
-
-    #随机选取k个质心,必须在数据集的边界内
-    def _randCent(self, X, k):
-        n = X.shape[1]  #特征维数
-        centroids = np.empty((k,n))  #k*n的矩阵,用于存储质心
-        for j in range(n):  #产生k个质心,一维一维地随机初始化
-            minJ = min(X[:,j])
-            rangeJ = float(max(X[:,j]) - minJ)
-            centroids[:,j] = (minJ + rangeJ * np.random.rand(k,1)).flatten()
+
+    def _rand_centroids(self, X):
+        # pick k random centroids, one feature dimension at a time, within the data bounds
+        n = X.shape[1]
+        centroids = np.empty((self.n_clusters, n))
+        for j in range(n):
+            min_j = min(X[:, j])
+            range_j = float(max(X[:, j]) - min_j)
+            centroids[:, j] = min_j + range_j * np.random.rand(self.n_clusters)
         return centroids
-
+
     def fit(self, X):
-        #类型检查
-        if not isinstance(X,np.ndarray):
-            try:
+        try:
+            if not isinstance(X, np.ndarray):
                 X = np.asarray(X)
-            except:
-                raise TypeError("numpy.ndarray required for X")
-
-        m = X.shape[0]#m代表样本数量
-        self.clusterAssment = np.empty((m,2))#m*2的矩阵,第一列存储样本点所属的族的索引值,
-                                             #第二列存储该点与所属族的质心的平方误差
+        except:
+            raise TypeError("numpy.ndarray required for X")
+
+        m = X.shape[0]  # number of samples
+        # column 0: index of the assigned cluster; column 1: squared error to that cluster's centroid
+        self.cluster_assessment = np.empty((m, 2))
         if self.initCent == 'random':
-            self.centroids = self._randCent(X, self.n_clusters)
-
-        clusterChanged = True
+            self.centroids = self._rand_centroids(X)
+
         for _ in range(self.max_iter):
-            clusterChanged = False
-            for i in range(m):#将每个样本点分配到离它最近的质心所属的族
-                minDist = np.inf; minIndex = -1
+            cluster_changed = False
+            for i in range(m):  # assign each sample to its nearest centroid
+                min_dist = np.inf
+                min_index = -1
                 for j in range(self.n_clusters):
-                    distJI = self._distEclud(self.centroids[j,:],X[i,:])
-                    if distJI < minDist:
-                        minDist = distJI; minIndex = j
-                if self.clusterAssment[i,0] != minIndex:
-                    clusterChanged = True
-                    self.clusterAssment[i,:] = minIndex,minDist**2
-
-            if not clusterChanged:#若所有样本点所属的族都不改变,则已收敛,结束迭代
-                break
-            for i in range(self.n_clusters):#更新质心,即将每个族中的点的均值作为质心
-                ptsInClust = X[np.nonzero(self.clusterAssment[:,0]==i)[0]]#取出属于第i个族的所有点
-                self.centroids[i,:] = np.mean(ptsInClust, axis=0)
-
-        self.labels = self.clusterAssment[:,0]
-        self.sse = sum(self.clusterAssment[:,1])
-
-
-    def predict(self,X):#根据聚类结果,预测新输入数据所属的族
-        #类型检查
-        if not isinstance(X,np.ndarray):
-            try:
+                    dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
+                    if dist_ji < min_dist:
+                        min_dist = dist_ji
+                        min_index = j
+                if self.cluster_assessment[i, 0] != min_index:
+                    cluster_changed = True
+                    self.cluster_assessment[i, :] = min_index, min_dist ** 2
+
+            if not cluster_changed:  # converged: no assignment changed on this pass
+                break
+            for i in range(self.n_clusters):  # move each centroid to the mean of its cluster
+                pts_in_cluster = X[np.where(self.cluster_assessment[:, 0] == i)[0]]
+                self.centroids[i, :] = np.mean(pts_in_cluster, axis=0)
+
+        self.labels = self.cluster_assessment[:, 0]
+        self.sse = sum(self.cluster_assessment[:, 1])
+
+    def predict(self, X):
+        try:
+            if not isinstance(X, np.ndarray):
                 X = np.asarray(X)
-            except:
-                raise TypeError("numpy.ndarray required for X")
-
-        m = X.shape[0]#m代表样本数量
+        except:
+            raise TypeError("numpy.ndarray required for X")
+
+        m = X.shape[0]
         preds = np.empty((m,))
-        for i in range(m):#将每个样本点分配到离它最近的质心所属的族
-            minDist = np.inf
+        for i in range(m):  # assign each sample to the nearest learned centroid
+            min_dist = np.inf
             for j in range(self.n_clusters):
-                distJI = self._distEclud(self.centroids[j,:],X[i,:])
-                if distJI < minDist:
-                    minDist = distJI
+                dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
+                if dist_ji < min_dist:
+                    min_dist = dist_ji
                     preds[i] = j
         return preds
-
-
-class biKMeans(object):
-    def __init__(self,n_clusters=5):
+
+class BiKMeans:
+    def __init__(self, n_clusters=5):
         self.n_clusters = n_clusters
         self.centroids = None
-        self.clusterAssment = None
+        self.cluster_assessment = None
         self.labels = None
         self.sse = None
-
-
-    #计算两点的欧式距离
-    def _distEclud(self, vecA, vecB):
+
+    def _dist_euclidean(self, vecA, vecB):
         return np.linalg.norm(vecA - vecB)
-
-    def fit(self,X):
-        m = X.shape[0]
-        self.clusterAssment = np.zeros((m,2))
+
+    def fit(self, X):
+        try:
+            if not isinstance(X, np.ndarray):
+                X = np.asarray(X)
+        except:
+            raise TypeError("numpy.ndarray required for X")
+
+        m = X.shape[0]
+        self.cluster_assessment = np.zeros((m, 2))
         centroid0 = np.mean(X, axis=0).tolist()
-        centList =[centroid0]
-        for j in range(m):#计算每个样本点与质心之间初始的平方误差
-            self.clusterAssment[j,1] = self._distEclud(np.asarray(centroid0), X[j,:])**2
-
-        while (len(centList) < self.n_clusters):
-            lowestSSE = np.inf
-            for i in range(len(centList)):#尝试划分每一族,选取使得误差最小的那个族进行划分
-                ptsInCurrCluster = X[np.nonzero(self.clusterAssment[:,0]==i)[0],:]
+        cent_list = [centroid0]
+
+        for j in range(m):  # initial squared error of every point to the single global centroid
+            self.cluster_assessment[j, 1] = self._dist_euclidean(np.asarray(centroid0), X[j, :]) ** 2
+
+        while len(cent_list) < self.n_clusters:
+            lowest_sse = np.inf
+            for i in range(len(cent_list)):  # try splitting each cluster; keep the split with the lowest total SSE
+                pts_in_curr_cluster = X[np.where(self.cluster_assessment[:, 0] == i)[0], :]
                 clf = KMeans(n_clusters=2)
-                clf.fit(ptsInCurrCluster)
-                centroidMat, splitClustAss = clf.centroids, clf.clusterAssment#划分该族后,所得到的质心、分配结果及误差矩阵
-                sseSplit = sum(splitClustAss[:,1])
-                sseNotSplit = sum(self.clusterAssment[np.nonzero(self.clusterAssment[:,0]!=i)[0],1])
-                if (sseSplit + sseNotSplit) < lowestSSE:
-                    bestCentToSplit = i
-                    bestNewCents = centroidMat
-                    bestClustAss = splitClustAss.copy()
-                    lowestSSE = sseSplit + sseNotSplit
-            #该族被划分成两个子族后,其中一个子族的索引变为原族的索引,另一个子族的索引变为len(centList),然后存入centList
-            bestClustAss[np.nonzero(bestClustAss[:,0] == 1)[0],0] = len(centList)
-            bestClustAss[np.nonzero(bestClustAss[:,0] == 0)[0],0] = bestCentToSplit
-            centList[bestCentToSplit] = bestNewCents[0,:].tolist()
-            centList.append(bestNewCents[1,:].tolist())
-            self.clusterAssment[np.nonzero(self.clusterAssment[:,0] == bestCentToSplit)[0],:]= bestClustAss
-
-        self.labels = self.clusterAssment[:,0]
-        self.sse = sum(self.clusterAssment[:,1])
-        self.centroids = np.asarray(centList)
-
-    def predict(self,X):#根据聚类结果,预测新输入数据所属的族
-        #类型检查
-        if not isinstance(X,np.ndarray):
-            try:
+                clf.fit(pts_in_curr_cluster)
+                centroid_mat, split_cluster_assessment = clf.centroids, clf.cluster_assessment
+                sse_split = sum(split_cluster_assessment[:, 1])
+                sse_not_split = sum(self.cluster_assessment[np.where(self.cluster_assessment[:, 0] != i)[0], 1])
+                if (sse_split + sse_not_split) < lowest_sse:
+                    best_cent_to_split = i
+                    best_new_cents = centroid_mat
+                    best_cluster_assessment = split_cluster_assessment.copy()
+                    lowest_sse = sse_split + sse_not_split
+            # re-index the two sub-clusters: one keeps the parent's index, the other gets index len(cent_list)
+            best_cluster_assessment[np.where(best_cluster_assessment[:, 0] == 1)[0], 0] = len(cent_list)
+            best_cluster_assessment[np.where(best_cluster_assessment[:, 0] == 0)[0], 0] = best_cent_to_split
+            cent_list[best_cent_to_split] = best_new_cents[0, :].tolist()
+            cent_list.append(best_new_cents[1, :].tolist())
+            self.cluster_assessment[np.where(self.cluster_assessment[:, 0] == best_cent_to_split)[0], :] = best_cluster_assessment
+
+        self.labels = self.cluster_assessment[:, 0]
+        self.sse = sum(self.cluster_assessment[:, 1])
+        self.centroids = np.asarray(cent_list)
+
+    def predict(self, X):
+        try:
+            if not isinstance(X, np.ndarray):
                 X = np.asarray(X)
-            except:
-                raise TypeError("numpy.ndarray required for X")
-
-        m = X.shape[0]#m代表样本数量
+        except:
+            raise TypeError("numpy.ndarray required for X")
+
+        m = X.shape[0]
         preds = np.empty((m,))
-        for i in range(m):#将每个样本点分配到离它最近的质心所属的族
-            minDist = np.inf
+        for i in range(m):  # assign each sample to the nearest learned centroid
+            min_dist = np.inf
             for j in range(self.n_clusters):
-                distJI = self._distEclud(self.centroids[j,:],X[i,:])
-                if distJI < minDist:
-                    minDist = distJI
+                dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
+                if dist_ji < min_dist:
+                    min_dist = dist_ji
                     preds[i] = j
         return preds
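One way the renamed classes could be exercised after this change (a minimal sketch; the import path, toy data, and parameter values are assumptions for illustration, not part of the patch):

    import numpy as np
    from kmeans import KMeans, BiKMeans  # assumes KMeans/kmeans.py is on the path

    # two well-separated 2-D blobs as hypothetical example data
    rng = np.random.RandomState(0)
    X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5.0])

    km = KMeans(n_clusters=2, max_iter=100)
    km.fit(X)
    print(km.centroids)        # 2 x 2 array of cluster centres
    print(km.sse)              # sum of squared errors of the final assignment
    print(km.predict(X[:5]))   # cluster index for the first five samples

    bkm = BiKMeans(n_clusters=2)
    bkm.fit(X)
    print(bkm.labels[:10])     # bisecting k-means assignments

Both classes keep the original attribute-based interface (centroids, labels, sse), so existing callers only need the new snake_case name cluster_assessment in place of clusterAssment.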