Custom Mean Shift

7AM7 · 7AM7 · commit 03a9308ffe6f · 2018-05-07T00:17:41.000+02:00
diff --git a/Clustering/Mean-Shift/Custom_Mean-Shift.py b/Clustering/Mean-Shift/Custom_Mean-Shift.py
@@ -3,94 +3,88 @@
 style.use('ggplot')
 import numpy as np
 
-class K_Means:
-    def __init__(self, k=2, tol=0.001, max_iter=300):
-        self.k = k
-        self.tol = tol
-        self.max_iter = max_iter
+class Mean_Shift:
+    def __init__(self, radius=4):
+        self.radius = radius
 
-    def fit(self,data):
+    def fit(self, data):
+        centroids = {}
+        ## give every data point an integer id; each point starts as its own centroid
+        for i in range(len(data)):
+            centroids[i] = data[i]
+        #print(centroids)
 
-        self.centroids = {} # self.centroids this mean center get best center point
+            
+        # Step 1: every data point starts out as a centroid (done in the loop above).
+        # Step 2: replace each centroid with the mean of all featuresets within its radius.
+        # Step 3: repeat step 2 until the centroids stop changing (convergence).
+        while True:
+            new_centroids = []
+            for i in centroids:
+                in_bandwidth = []
+                centroid = centroids[i]
+                for featureset in data:
+                    ## if the distance between the featureset and the centroid is less than the radius,
+                    ## add the featureset to the in_bandwidth list
+                    if np.linalg.norm(featureset-centroid) < self.radius:
+                        in_bandwidth.append(featureset)
         
-        # add point from 0 to k in the dictionary ... just start  k point 
-        for i in range(self.k):
-            self.centroids[i] = data[i]
-
+                # the new centroid is the mean of all featuresets in the in_bandwidth list
+                new_centroid = np.average(in_bandwidth,axis=0)
+                new_centroids.append(tuple(new_centroid))
 
-        for i in range(self.max_iter):
-            self.classifications = {}
-            ## create  classification dictionary and set it empty list form 0 to k
-            for i in range(self.k):
-                self.classifications[i] = []
-            
-            for featureset in data:
-                ## get distances between featureset and center point 
-                distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
-                ## get  index of min value in distances is go to group 0 or 1 or ...  to k
-                classification = distances.index(min(distances))
-                ## add featureset to the the gourp of k
-                # exmp: if you have k = 2 you have 2 group
-                # so add the distances min value to her group 1 or 0 
-                self.classifications[classification].append(featureset)
-                
-            ## last centroids point before get average
-            prev_centroids = dict(self.centroids)
-            for classification in self.classifications:
-                ## get average to the point of the gourp 0 or 1 or ... in range k and add her to centroids dictionary
-                self.centroids[classification] = np.average(self.classifications[classification],axis=0)
+            uniques = sorted(list(set(new_centroids))) # remove duplicate centroids, then sort them
+            print(new_centroids)
+            prev_centroids = dict(centroids)
 
+            centroids = {}
+            for i in range(len(uniques)):
+                centroids[i] = np.array(uniques[i]) ## store each unique centroid as an array, keyed by its index
 
-            ## compare between original_centroid and current_centroid
             optimized = True
-            for c in self.centroids:
-                original_centroid = prev_centroids[c]
-                current_centroid = self.centroids[c]
-                ## get all current_centroid and original_centroid if they are within our required tolerance, this is good
-                ## else the optimized = False and stop fist loop for i in range(self.max_iter):
-                if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol:
-                    #print(c, np.sum((current_centroid-original_centroid)/original_centroid*100.0))
+            '''
+            Here we note the previous centroids, before we begin to reset "current" or "new" centroids
+            by setting them as the uniques. Finally, we compare the previous centroids to the new ones, and measure movement.
+            If any of the centroids have moved, then we're not content that we've got full convergence
+            and optimization, and we want to go ahead and run another cycle.
+            If we are optimized, great, we break, and then finally set the centroids attribute to the final centroids we came up with.
+            '''
+
+            for i in centroids:
+                if not np.array_equal(centroids[i], prev_centroids[i]):
                     optimized = False
-            
+                if not optimized:
+                    break
+                
             if optimized:
                 break
+
+        self.centroids = centroids
+
     def predict(self,data):
-        ## get distances between featureset and center point (centroid)
-        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
-        ## get  index of min value in distances is go to group 0 or 1 or ...  to k
-        classification = distances.index(min(distances))
-        return classification
+        pass
 
 
 X = np.array([[1, 2],
               [1.5, 1.8],
               [5, 8 ],
               [8, 8],
               [1, 0.6],
-              [9,11]])
+              [9,11],
+              [8,2],
+              [10,2],
+              [9,3],])
+
 colors = 10*["g","r","c","b","k"]
-clf = K_Means(k=2)
+
+clf = Mean_Shift()
 clf.fit(X)
 
-##  scatter center point of ths groups
-for centroid in clf.centroids:
-    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
-                marker="o", color="k", s=150, linewidths=5)
-    
-##  scatter classification point of ths group
-for classification in clf.classifications:
-    color = colors[classification]
-    for featureset in clf.classifications[classification]:
-        plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5)
-
-
-## predict new features
-new_features = np.array([[1, 3],
-              [8, 9],
-              [0, 3 ],
-              [5, 4],
-              [6, 4],])
-for feature in new_features:
-    classification = clf.predict(feature)
-    plt.scatter(feature[0], feature[1], marker="*", color=colors[classification], s=150, linewidths=5)
+centroids = clf.centroids
+
+plt.scatter(X[:,0], X[:,1], s=150)
+
+for c in centroids:
+    plt.scatter(centroids[c][0], centroids[c][1], color='k', marker='*', s=150)
+
 plt.show()