-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclustering_util.py
470 lines (347 loc) · 15.3 KB
/
clustering_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
__author__ = "Michael Kern"
__email__ = '[email protected]'
########################################################################################################################
import random
import numpy as np
# use scipy to compute different distance matrices
from scipy.spatial.distance import pdist, squareform
import scipy.stats as stats
"""
http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python
--> good explanation to create weighted choices / random numbers
"""
def weightedChoice(weights):
# compute sum of all weights
sumTotal = sum(weights)
# compute a random with range[0, sumTotal]
rnd = random.random() * sumTotal
for index, weight in enumerate(weights):
# subtract current weight from random to find current index
rnd -= weight
if rnd < 0:
return index
# 20% faster if weights are sorted in descending order
########################################################################################################################
"""
Implementation of an binary tree for hierarchical clustering
"""
"""
Node of the tree containing information about it's id in data, children and the value
"""
class BinaryNode:
def __init__(self, value, id, size, leftChild, rightChild):
self.value = value
self.left = leftChild
self.right = rightChild
self.size = size
self.id = id
self.parent = None
self.indices = [id]
# create json info on the fly
self.json = {"id": self.id, "size": self.size, "value": self.value, "indices": [id]}
if leftChild is not None and rightChild is not None:
# self.json["value"] = np.mean(self.value)
self.json["children"] = [rightChild.json, leftChild.json]
self.indices = [] + rightChild.indices + leftChild.indices
self.json["indices"] = self.indices
def isLeave(self):
return self.left is None and self.right is None
########################################################################################################################
"""
Implementation of an hierarchical binary tree
"""
class BinaryTree:
# this tree must not be empty and must have at least two children (leaves)
def __init__(self, leftNode, rightNode, newID, newValue):
self.__createNewRoot(leftNode, rightNode, newID, newValue)
# ------------------------------------------------------------------------------------------------------------------
def addNode(self, newNode, newID, newValue):
self.__createNewRoot(self.root, newNode, newID, newValue)
return self
# ------------------------------------------------------------------------------------------------------------------
def merge(self, tree, newID, newValue):
self.__createNewRoot(self.root, tree.root, newID, newValue)
return self
# ------------------------------------------------------------------------------------------------------------------
def jsonify(self):
import json
return json.dumps(self.root.json)
# return self.root.json
# return self.__traverseJson(self.root)
# ------------------------------------------------------------------------------------------------------------------
def json(self):
return self.root.json
# ------------------------------------------------------------------------------------------------------------------
def cutTreeByClusters(self, k):
queue = [self.root]
while len(queue) < k:
node = queue.pop(0)
queue.append(node.left)
queue.append(node.right)
def keyFunc(x):
if x.isLeave():
return 0
else:
return -x.value
queue.sort(key=keyFunc)
clusters = []
for node in queue:
clusters.append(node.indices)
return clusters
# ------------------------------------------------------------------------------------------------------------------
def __traverseJson(self, node):
json = {"id": node.id, "size": node.size, "value": node.value}
if node.left is None and node.right is None:
return json
else:
json["children"] = [] + [self.__traverseJson(node.left)] + [self.__traverseJson(node.right)]
return json
# ------------------------------------------------------------------------------------------------------------------
def getLeaves(self):
return self.root.indices
# return self.__traverseIDs(self.root)
# ------------------------------------------------------------------------------------------------------------------
def __traverseIDs(self, node):
if node.left is None and node.right is None:
return [node.id]
else:
return [] + self.__traverseIDs(node.right) + self.__traverseIDs(node.left)
# ------------------------------------------------------------------------------------------------------------------
def __createNewRoot(self, leftNode, rightNode, newID, newValue):
newSize = leftNode.size + rightNode.size
self.root = BinaryNode(newValue, newID, newSize, leftNode, rightNode)
leftNode.parent = rightNode.parent = self.root
# ------------------------------------------------------------------------------------------------------------------
def cutJsonTreeByClusters(jsonData, k):
# import json
# tree = json.loads(jsonData)
queue = [jsonData]
while len(queue) < k:
node = queue.pop(0)
queue.append(node['children'][0])
queue.append(node['children'][1])
def keyFunc(x):
if 'children' not in x:
return 0
else:
return -x['value']
queue.sort(key=keyFunc)
clusters = []
for node in queue:
clusters.append(node['indices'])
return clusters
########################################################################################################################
def euclideanDistance(matrix, vector, squared=False):
"""
Computes the euclidean distance between a vector and the rows of a matrix in parallel.
:param matrix: array of observations or clusters
:param vector: cluster centroid or observation
:return:
"""
# compute distance between values in matrix and the vector
distMat = matrix - vector
numValues = len(matrix)
distances = np.array([0.0 for _ in range(numValues)], dtype=np.float)
for ii in range(numValues):
distance = distMat[ii]
# always try to use np.dot when computing euclidean distance
# it's way faster than ** 2 and sum(..., axis=1)
distances[ii] = np.dot(distance, distance)
if squared:
return distances
else:
return np.sqrt(distances)
# ----------------------------------------------------------------------------------------------------------------------
def correlationDistance(matrix, vector, method):
"""
:param matrix:
:param vector:
:return:
"""
numValues = len(matrix)
distances = np.array([0.0 for _ in range(numValues)], dtype=np.float)
for ii in range(numValues):
value = matrix[ii]
if method == 'pearson':
distances[ii], _ = stats.pearsonr(value, vector)
elif method == 'spearman':
distances[ii], _ = stats.spearmanr(value, vector)
elif method == 'kendall':
distances[ii], _ = stats.kendalltau(value, vector)
else:
raise AttributeError
return distances
# ----------------------------------------------------------------------------------------------------------------------
from scipy.spatial.distance import cdist
def similarityMeasurement(matrix, vector, method='euclidean'):
if method == 'euclidean':
return euclideanDistance(matrix, vector)
if method == 'sqeuclidean':
return euclideanDistance(matrix, vector, True)
spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis',]
if method in spatialMethods:
return np.nan_to_num(cdist(matrix, np.atleast_2d(vector), method).flatten())
corrMethods = ['spearman', 'pearson', 'kendall']
if method in corrMethods:
return correlationDistance(matrix, vector, method)
raise AttributeError
# ----------------------------------------------------------------------------------------------------------------------
def euclideanDistanceMatrix(matrix, squared=False):
"""
Compute the euclidean distance matrix required for the algorithm
:param matrix:
:param n:
:return:
"""
n = np.shape(matrix)[0]
distMat = np.zeros((n, n))
# use Gram matrix and compute distances without inner products | FASTER than row-by-row method
"Gramiam matrix to compute dot products of each pair of elements: "
"<https://en.wikipedia.org/wiki/Gramian_matrix>"
gramMat = np.zeros((n, n))
for ii in range(n):
for jj in range(ii, n):
gramMat[ii, jj] = np.dot(matrix[ii], matrix[jj])
# # ! This is slower than computing dot products of rows manually in python
# # ! And we only require the upper triangle matrix of the Gram matrix
# gramMat = np.dot(self.__obs, self.__obs.T)
# make use of formula |a - b|^2 = a^2 - 2ab + b^2
for ii in range(n):
# self.__d[ii, ii] = self.__maxValue
jj = np.arange(ii + 1, n)
distMat[ii, jj] = gramMat[ii, ii] - 2 * gramMat[ii, jj] + gramMat[jj, jj]
distMat[jj, ii] = distMat[ii, jj]
# # take square root of distances to compute real euclidean distance
# distMat = np.sqrt(distMat)
"alternative version --> use scipy's fast euclidean distance implementation: FASTEST"
# distMat = spt.distance.pdist(self.__obs, 'euclidean')
# self.__d = spt.distance.squareform(distMat)
# print(distMat)
if squared:
return distMat
else:
return np.sqrt(distMat)
# ----------------------------------------------------------------------------------------------------------------------
def norm1Distance(matrix, vector):
"""
Computes the norm-1 distance between a vector and the rows of a matrix in parallel.
:param matrix: array of observations or clusters
:param vector: cluster centroid or observation
:return:
"""
distMat = np.abs(matrix - vector)
numValues = len(vector)
distances = np.sum(distMat, axis=1) / numValues
return distances
# ----------------------------------------------------------------------------------------------------------------------
def pearsonCorrelationMatrix(matrix):
"""
:param matrix:
:param n:
:return:
"""
# TODO! other possibilites like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
distMat = 1 - np.corrcoef(matrix)
return distMat
# ----------------------------------------------------------------------------------------------------------------------
def statsCorrelationMatrix(matrix, method):
if method == 'pearson':
return pearsonCorrelationMatrix(matrix)
n = np.shape(matrix)[0]
distMat = np.zeros((n, n))
for ii in range(n):
rowI = matrix[ii]
for jj in range(ii + 1, n):
rowJ = matrix[jj]
corr = 0
if method == 'spearman':
corr, _ = stats.spearmanr(rowI, rowJ)
if method == 'kendall':
corr, _ = stats.kendalltau(rowI, rowJ)
# TODO! other possibilites like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
corr = 1 - corr
distMat[ii, jj] = corr
distMat[jj, ii] = corr
return distMat
# ----------------------------------------------------------------------------------------------------------------------
def similarityMeasurementMatrix(matrix, method):
"""
Generic function to determine the similarity measurement for clustering
:param matrix:
:param method:
:return:
"""
if method == 'euclidean':
return euclideanDistanceMatrix(matrix)
# return squareform(pdist(matrix, method))
if method == 'sqeuclidean':
return euclideanDistanceMatrix(matrix, True)
# return squareform(pdist(matrix, method))
spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis']
if method in spatialMethods:
return squareform(np.nan_to_num(pdist(matrix, method)))
corrMethods = ['spearman', 'pearson', 'kendall']
if method in corrMethods:
return statsCorrelationMatrix(matrix, method)
raise AttributeError
########################################################################################################################
# utility functions to compute distances between rows and cluster centroids
def computeClusterInternDistances(matrix, labels, sorted=True, metric='euclidean'):
"""
Computes the distances of each element in one cluster to the cluster's centroid. Returns distance values and labels
sorted in ascending order.
:param matrix:
:param labels:
:return: labels / indices of elements corresponding to distance array, distance values of cluster
"""
clusterLabels = np.array(labels)
if len(clusterLabels) == 0:
return [], []
subMatrix = matrix[clusterLabels]
# compute centroid of cluster along column (as we want to average each gene separately)
centroid = np.mean(subMatrix, axis=0)
# compute distances to centroid
dists = similarityMeasurement(subMatrix, centroid, metric)
if sorted == 'true':
# sort values
indices = range(len(dists))
indices.sort(key=dists.__getitem__)
dists.sort()
# reverse order if correlation coefficient is used
# (1 means perfect correlation while -1 denotes opposite correlation)
corrMetrics = ['pearson', 'spearman', 'kendall']
if metric in corrMetrics:
indices.reverse()
dists = dists[::-1]
# write back to our arrays
distLabels = clusterLabels[indices].tolist()
distValues = dists.tolist()
else:
distLabels = clusterLabels.tolist()
distValues = dists.tolist()
return distLabels, distValues
# ----------------------------------------------------------------------------------------------------------------------
def computeClusterExternDistances(matrix, labels, outerLabels, metric='euclidean'):
"""
Compute the distances of patients in one cluster to the centroids of all other clusters.
:param matrix:
:param labels:
:param outerLabels:
:return:
"""
externDists = []
internSubMatrix = matrix[labels]
for externLabels in outerLabels:
if len(externLabels) == 0:
externDists.append([])
# compute centroid of external cluster
subMatrix = matrix[externLabels]
centroid = np.mean(subMatrix, axis=0)
dists = similarityMeasurement(internSubMatrix, centroid, metric)
externDists.append(dists.tolist())
return externDists
########################################################################################################################
if __name__ == '__main__':
print(cdist([[1,1,1],[3,3,3],[5,5,5]],np.atleast_2d([2,2,2]), 'sqeuclidean').flatten())
from scipy.stats import spearmanr
print(spearmanr([1,2,3],[2,4,1]))