diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..c2cdfb8
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,21 @@
+# EditorConfig helps developers define and maintain consistent
+# coding styles between different editors and IDEs
+# editorconfig.org
+
+root = true
+
+
+[*]
+
+# Change these settings to your own preference
+indent_style = space
+indent_size = 2
+
+# We recommend you keep these unchanged
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.md]
+trim_trailing_whitespace = false
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..833eaab
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,126 @@
+# These settings are for any web project
+
+# Handle line endings automatically for files detected as text
+# and leave all files detected as binary untouched.
+* text=auto
+
+#
+# The above will handle all files NOT found below
+#
+
+#
+## These files are text and should be normalized (Convert crlf => lf)
+#
+
+# source code
+*.php text
+*.css text
+*.sass text
+*.scss text
+*.less text
+*.styl text
+*.js text
+*.ts text
+*.coffee text
+*.json text
+*.htm text
+*.html text
+*.xml text
+*.txt text
+*.ini text
+*.inc text
+*.pl text
+*.rb text
+*.py text
+*.scm text
+*.sql text
+*.sh text eol=lf
+*.bat text
+
+# templates
+*.hbt text
+*.jade text
+*.haml text
+*.hbs text
+*.dot text
+*.tmpl text
+*.phtml text
+
+# server config
+.htaccess text
+
+# git config
+.gitattributes text
+.gitignore text
+
+# code analysis config
+.jshintrc text
+.jscsrc text
+.jshintignore text
+.csslintrc text
+
+# misc config
+*.yaml text
+*.yml text
+.editorconfig text
+
+# build config
+*.npmignore text
+*.bowerrc text
+
+# Heroku
+Procfile text
+.slugignore text
+
+# Documentation
+*.md text
+LICENSE text
+AUTHORS text
+
+
+#
+## These files are binary and should be left untouched
+#
+
+# (binary is a macro for -text -diff)
+*.png binary
+*.jpg binary
+*.jpeg binary
+*.gif binary
+*.ico binary
+*.mov binary
+*.mp4 binary
+*.mp3 binary
+*.flv binary
+*.fla binary
+*.swf binary
+*.gz binary
+*.zip binary
+*.7z binary
+*.ttf binary
+*.pyc binary
+*.pdf binary
+
+# Source files
+# ============
+*.pxd text
+*.py text
+*.py3 text
+*.pyw text
+*.pyx text
+*.sh text eol=lf
+*.json text
+
+# Binary files
+# ============
+*.db binary
+*.p binary
+*.pkl binary
+*.pyc binary
+*.pyd binary
+*.pyo binary
+
+# Note: .db, .p, and .pkl files are associated
+# with the python modules ``pickle``, ``dbm.*``,
+# ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
+# (among others).
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0d20b64..a00b0c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,19 @@
+# general things to ignore
+/.tscache
+/.idea
+/build/
+/dist/
+*.egg-info/
+*.egg
+*.py[cod]
+__pycache__/
+*.so
+*~
+*.log
+*.pot
*.pyc
+*.swp
+*.lock
+# due to using tox and pytest
+.tox
+.cache
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..80a94cb
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,27 @@
+language: python
+sudo: required
+env:
+ - TOXENV=py27
+ - TOXENV=py34
+
+install:
+ - (!(test -f docker_packages.txt) || (cat docker_packages.txt | xargs sudo apt-get install -y))
+ - pip install -r requirements_dev.txt
+ - pip install -r requirements.txt
+
+script: npm run dist
+
+deploy:
+ provider: releases
+ api_key:
+ secure: TK9/P34Bi3WuppiDrBCwVcn41yCBwmILaU8hXTBzUPbT7TbeFIwsC6/4CtH85Z+ZrUve4S5pTmWRNf2dQDxWw3uYu7+bJuemV2J1LHG76mognj+TNEiYxfLQUt3Gql4W7C7FcI4Rlx5/uMN9wY1wro8TWUBMwT6jjSrUWIvK3GXoojd5bHvJx07XpjWl9wCon4D0ruZiFoM2mdeP23lbc2GckETi32oEKswnQXxkMACmxbPzoWbvkxH4aK8Bt2Rj2sl2TbPhVkN6DAkHGkGAvLI+2/aRfG27+oo3OKsaDjbuGABct8TfZccJ970CbQ8kbnCjYxstvqkg1JWjF0W67sX/flBZZOEUA5l0OLWo6HqMGMxm7/lEQhIdPMsRmvXL+HVOxkMrB2dda58QzxVwiZp+rRqUaeabPZp8Kl5xodGrVxsBvxe6zAbJ5jCtCSumG6+kLyKI00/kYlghqQNrgUw0ZsYJlQ34h3lo/24QpaeyDpQoCkGWQgtgqiXGpeKSu7bCnOqIqAy3nbT9Utwj7K8gIasTG5idosEAz/THMampNbGDuyxxc340sYGNMg9Bhm1g2ILWRdtV470p5hwBtIDTKi3/PAizEO26+Wh0zI47Sg3ao57avcbCsTmzbZUeA5J4bojmchhJCHX8su9cSCGh/2fJA/1eBIgEvOQ8LNE=
+ file_glob: true
+ file: dist/phovea_clustering*.egg
+ on:
+ tags: true
+
+notifications:
+ slack:
+ secure: E8/1UIdHSczUbN+6i6gd1d5LM4vmLdwLQ30tpyjvnM0wvfDce76oPxLJAy240WJ5ybXRZUtNrttpVpt4tEXCy8aLFCmxD7s77rVloH+q1J8R/ptTFWZGhFGEujk1awEmVbzcWxJkV9/JENQaeGBKxwv8/EQwWwEkAb7p/+AJb9owmH88b3wUZUGHBWtbMiyyaF4Rm1Wg1stJB8Z1Ga7PRF4cqufTgcDdsCPVv9gAY+VxOIGqX/Vfuc9UWpUH8vq8lHUE7Inn5QS78kuFfSgLWga3H6Mu/Gko1XNlWk0QWWQBUvEZ6ZC6Wuo68KzvUjJHDTnx8WyfHue2JNHIslcX+eJq2WHLeEgM24VeNkILCGo/H/60NGHiSjrIv/Y9h6bQ9FDjo6TUyE4nbdPYN1RN9FQ5UbI9Y4Gi753H9mqnHWlEywBOzHxdZCAuz9Wh03CCF/blsvJ+Obbyo6Jrfe+g44jyi9kQdBNQ78qG6v4EXws8FiYao6x3PpgIwFix42Cpr+soAh5FpA3C1zHSAyZZpXF65/lrDl5yPNofK7Wy0B9bw+0I6Z/u7ZKFNVZXvYPGYvtUVcsALGBdmYc61+LCta36Po0KZseWVAlJj6QnOJDYzv0wvV/zsuf9A5KpYFGiqV9Q7zmtiO5FYF5sBy+lE7O9tHVO4O18IRndhRQgxhs=
+ on_success: change
+ on_failure: always
diff --git a/.yo-rc.json b/.yo-rc.json
new file mode 100644
index 0000000..308c8c3
--- /dev/null
+++ b/.yo-rc.json
@@ -0,0 +1,52 @@
+{
+ "generator-phovea": {
+ "type": "slib",
+ "name": "phovea_clustering",
+ "author": "The Caleydo Team",
+ "githubAccount": "phovea",
+ "modules": [
+ "phovea_server"
+ ],
+ "extensions": [],
+ "sextensions": [
+ {
+ "type": "clustering",
+ "id": "caleydo-clustering-kmeans",
+ "module": "clustering_kmeans",
+ "extras": {}
+ },
+ {
+ "type": "clustering",
+ "id": "caleydo-clustering-hierarchical",
+ "module": "clustering_hierarchical",
+ "extras": {}
+ },
+ {
+ "type": "clustering",
+ "id": "caleydo-clustering-affinity",
+ "module": "clustering_affinity",
+ "extras": {}
+ },
+ {
+ "type": "clustering",
+ "id": "caleydo-clustering-fuzzy",
+ "module": "clustering_fuzzy",
+ "extras": {}
+ },
+ {
+ "type": "namespace",
+ "id": "caleydo-clustering",
+ "module": "clustering_api",
+ "extras": {
+ "namespace": "/api/clustering"
+ }
+ }
+ ],
+ "libraries": [],
+ "unknown": {
+ "requirements": [],
+ "dockerPackages": []
+ },
+ "today": "Fri, 23 Dec 2016 03:02:24 GMT"
+ }
+}
\ No newline at end of file
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000..bdb156f
--- /dev/null
+++ b/ISSUE_TEMPLATE.md
@@ -0,0 +1,17 @@
+* Release number or git hash:
+* Web browser version and OS:
+* Environment (local or deployed):
+
+### Steps to reproduce
+
+1.
+2.
+
+### Observed behavior
+* Any unexpected output or action (or lack of expected output or action)
+* Web browser console errors (including tracebacks)
+* Server errors (relevant messages and tracebacks)
+* Static or animated images showing the UI behavior
+
+### Expected behavior
+*
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..fd6f461
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2016, The Caleydo Team
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+ Neither the name of the Caleydo Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 1a3242f..97caba2 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,44 @@
-Caleydo Clustering
-==================
-
-This repository is a server-side plugin for Caleydo to apply clustering algorithms on arbitrary matrices.
-
-Supported Algorithms:
---------------------
-- K-Means (Init methods: Forgy, Uniform, Random, Kmeans++)
-- Hierarchical Clustering (Single, Complete, Weighted, Median)
-- Affinity Propagation
-- Fuzzy Clustering
-- Various Distance Measurements (Euclidean, Chebyshev, Manhattan, Pearson, Spearman, ...)
-
-General Information:
--------------------
-- All these algorithms expect the input as a matrix (or 2D numpy array)
-- It is assumed that the matrix is dense (no sparse matrix support right now)
-- NaN values will be converted to zero
-
-Future Work:
+phovea_clustering [![Phovea][phovea-image]][phovea-url] [![NPM version][npm-image]][npm-url] [![Build Status][travis-image]][travis-url] [![Dependency Status][daviddm-image]][daviddm-url]
+=====================
+
+
+
+Installation
------------
-- Improvement of algorithms
-- Combination of several clustering results
+
+```
+git clone https://github.com/phovea/phovea_clustering.git
+cd phovea_clustering
+npm install
+```
+
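+Depending on your setup you may also want to resolve the server-side Python dependencies, as the CI build does:
+
+```
+pip install -r requirements.txt
+```
+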
+Testing
+-------
+
+```
+npm test
+```
+
+Building
+--------
+
+```
+npm run build
+```
+
+
+
+***
+
+
+This repository is part of **[Phovea](http://phovea.caleydo.org/)**, a platform for developing web-based visualization applications. For tutorials, API docs, and more information about the build and deployment process, see the [documentation page](http://caleydo.org/documentation/).
+
+
+[phovea-image]: https://img.shields.io/badge/Phovea-Server%20Plugin-10ACDF.svg
+[phovea-url]: https://phovea.caleydo.org
+[npm-image]: https://badge.fury.io/js/phovea_clustering.svg
+[npm-url]: https://npmjs.org/package/phovea_clustering
+[travis-image]: https://travis-ci.org/phovea/phovea_clustering.svg?branch=master
+[travis-url]: https://travis-ci.org/phovea/phovea_clustering
+[daviddm-image]: https://david-dm.org/phovea/phovea_clustering.svg?theme=shields.io
+[daviddm-url]: https://david-dm.org/phovea/phovea_clustering
diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index f425e42..0000000
--- a/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '1.0.0'
-__email__ = 'kernm@in.tum.de'
diff --git a/clustering_affinity.py b/clustering_affinity.py
deleted file mode 100644
index 80a4963..0000000
--- a/clustering_affinity.py
+++ /dev/null
@@ -1,304 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.1'
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-# libraries
-
-# module to load own configurations
-import caleydo_server.config
-# request config if needed for the future
-config = caleydo_server.config.view('caleydo-clustering')
-
-import numpy as np
-from clustering_util import similarityMeasurementMatrix
-from timeit import default_timer as timer
-
-########################################################################################################################
-
-class AffinityPropagation:
- """
- This is an implementation of the affinity propagation algorithm to cluster genomic data / matrices.
- Implementation details: .
- Matlab implementation:
- Returns the centroids and labels / stratification of each row belonging to one cluster.
- """
-
- def __init__(self, obs, damping=0.5, factor=1.0, prefMethod='minimum', distance='euclidean'):
- """
- Initializes the algorithm.
- :param obs: genomic data / matrix
- :param damping: controls update process to dampen oscillations
- :param factor: controls the preference value (influences number of clusters)
- :param prefMethod: all points are chosen equally with a given preference (median or minimum of similarity matrix)
- :return:
- """
- self.__n = np.shape(obs)[0]
- # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
- # remove all NaNs in data
- self.__obs = np.nan_to_num(obs)
- # variables influencing output of clustering algorithm
- self.__damping = damping
- self.__factor = factor
- self.__prefMethod = prefMethod
-
- # similarity matrix
- self.__S = np.zeros((self.__n, self.__n))
- # availability matrix
- self.__A = np.zeros((self.__n, self.__n))
- # responsibility matrix
- self.__R = np.zeros((self.__n, self.__n))
-
- self.minValue = np.finfo(np.float).min
-
- # self.__mx1 = np.full(self.__n, self.minValue)
- # self.__mx2 = np.full(self.__n, self.minValue)
-
- self.__idx = np.zeros(self.__n)
-
- # set similarity computation
- self.__distance = distance
-
- self.__computeSimilarity()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __call__(self):
- """
- Caller function for server API.
- """
- return self.run()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __computeSimilarity(self):
- """
- Compute the similarity matrix from the original observation matrix and set preference of each element.
- :return: Similarity matrix
- """
- # compute the similarity matrix as negative distances, e.g. -|| xi - xj ||**2 for squared euclidean
- self.__S = -similarityMeasurementMatrix(self.__obs, self.__distance)
-
- # determine the preferences S(k,k) to control the output of clusters
- pref = 0
- # could be median or minimum
- if self.__prefMethod == 'median':
- pref = float(np.median(self.__S)) * self.__factor
- elif self.__prefMethod == 'minimum':
- pref = np.min(self.__S) * self.__factor
- else:
- raise AttributeError
-
- np.fill_diagonal(self.__S, pref)
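-
- # note: the shared preference controls the number of clusters; smaller
- # (more negative) values yield fewer clusters, larger values yield more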
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def run(self):
- """
- Runs the algorithm of affinity propagation. Conducts at least 100 iterations and checks if the outcome of
- current exemplars/clusters has converged. If not, the algorithm will continue until convergence is found
- or the maximum number of iterations (200) is reached.
- :return:
- """
- maxIter = 200
- maxConvIter = 100
-
- # sum all decisions for exemplars per round
- decisionSum = np.zeros(self.__n)
- # collect decisions for one exemplar per iteration round
- decisionIter = np.zeros((maxConvIter, self.__n))
- # counter for decisions (= consider data element as exemplar in each algorithm iteration)
- decisionCounter = maxConvIter
- # indicates if algorithm has converged
- isConverged = False
-
- centroids = []
- it = 0
- clusterI = []
-
- # helpful variables (that do not need recomputation)
- indexDiag = np.arange(self.__n)
- indicesDiag = np.diag_indices_from(self.__R)
- newA = np.zeros((self.__n, self.__n))
- newR = np.zeros((self.__n, self.__n))
-
- for it in range(1, maxIter + 1):
-
- # ----------------------------------------------------------------------------------------------------------
-
- # compute responsibility matrix
- AS = self.__A + self.__S
-
- maxY = np.max(AS, axis=1)
- indexY = np.argmax(AS, axis=1)
-
- # mask the row maxima in the AS matrix (set them to the smallest float)
- AS[indexDiag, indexY] = self.minValue
-
- # look for second maxima
- maxY2 = np.max(AS, axis=1)
-
- # perform responsibility update
- for ii in range(self.__n):
- # s(i, k) - max({ a(i, k') + s(i, k') })
- newR[ii] = self.__S[ii] - maxY[ii]
-
- # subtract second maximum from row -> column entry with maximum value
- newR[indexDiag, indexY] = self.__S[indexDiag, indexY] - maxY2[indexDiag]
-
- # dampen values
- # self.__R = self.__damping * self.__R + (1 - self.__damping) * newR
- self.__R *= self.__damping
- self.__R += (1 - self.__damping) * newR
-
- # ----------------------------------------------------------------------------------------------------------
-
- # compute availability matrix
- # cut out negative elements
- # TODO! slow because of copy operation
- Rp = np.maximum(self.__R, 0)
-
- # write back all diagonal elements as self-representatives
- Rp[indicesDiag] = self.__R[indicesDiag]
- sumCols = np.sum(Rp, axis=0)
-
- # apply availability update
- newA[:,] = sumCols
- newA -= Rp
- # for ii in range(self.__n):
- # # r(k, k) + sum(max(0, r(i',k))
- # newA[:, ii] = sumCols[ii] - Rp[:, ii]
-
- diagA = np.diag(newA)
- # take minimum of all the values in A, cut out all values above zero
- # newA = np.minimum(newA, 0)
- newA[newA > 0] = 0
- newA[indicesDiag] = diagA[indexDiag]
-
- # dampen values
- # self.__A = self.__damping * self.__A + (1 - self.__damping) * newA
- self.__A *= self.__damping
- self.__A += (1 - self.__damping) * newA
-
- # ----------------------------------------------------------------------------------------------------------
-
- # find exemplars for new clusters
- # old version which is slower
- # E = self.__R + self.__A
- # diagE = np.diag(E)
-
- # take the diagonal elements of the matrix E = R + A
- diagE = np.diag(self.__R) + np.diag(self.__A)
-
- # all elements > 0 are considered appropriate exemplars for the dataset
- clusterI = np.argwhere(diagE > 0).flatten()
-
- # count the number of clusters
- numClusters = len(clusterI)
-
- # ----------------------------------------------------------------------------------------------------------
-
- decisionCounter += 1
- if decisionCounter >= maxConvIter:
- decisionCounter = 0
-
- # subtract outcome of previous iteration (< 100) from the total sum of the decisions
- decisionSum -= decisionIter[decisionCounter]
-
- decisionIter[decisionCounter].fill(0)
- decisionIter[decisionCounter][clusterI] = 1
-
- # compute sum of decisions for each element being an exemplar
- decisionSum += decisionIter[decisionCounter]
-
- # check for convergence
- if it >= maxConvIter or it >= maxIter:
- isConverged = True
-
- for ii in range(self.__n):
- # if the element is considered to be an exemplar in at least one iteration
- # and the total of decisions in the last 100 iterations is not 100 --> no convergence
- if decisionSum[ii] != 0 and decisionSum[ii] != maxConvIter:
- isConverged = False
- break
-
- if isConverged and numClusters > 0:
- break
-
- # --------------------------------------------------------------------------------------------------------------
-
- # obtain centroids
- centroids = self.__obs[clusterI]
-
- # find maximum columns in AS matrix to assign elements to clusters / exemplars
- # fill A with negative values
- self.__A.fill(self.minValue)
- # set values of clusters to zero (as we only want to regard these values)
- self.__A[:, clusterI] = 0.0
- # set the diagonal of the similarity matrix to zero (remove preferences)
- np.fill_diagonal(self.__S, 0.0)
-
- # compute AS matrix
- AS = self.__A + self.__S
- # since values are < 0, look for the maximum number in each row and return its column index
- self.__idx = np.argmax(AS, axis=1)
-
- clusterI = clusterI.tolist()
- clusterLabels = [[] for _ in range(numClusters)]
-
- # create labels per cluster
- for ii in range(self.__n):
- index = clusterI.index(self.__idx[ii])
- self.__idx[ii] = index
- clusterLabels[index].append(ii)
-
- # return sorted cluster labels (that's why we call compute cluster distances, might be redundant)
- # for ii in range(numClusters):
- # clusterLabels[ii], _ = computeClusterInternDistances(self.__obs, clusterLabels[ii])
-
- # if isConverged:
- # print('Algorithm has converged after {} iterations'.format(it))
- # else:
- # print('Algorithm has not converged after 200 iterations')
- #
- # print('Number of detected clusters {}'.format(numClusters))
- # print('Centroids: {}'.format(centroids))
-
- return centroids.tolist(), self.__idx.tolist(), clusterLabels
-
-########################################################################################################################
-
-def _plugin_initialize():
- """
- optional initialization method of this module, will be called once
- :return:
- """
- pass
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def create(data, damping, factor, preference, distance):
- """
- by convention, plugins contain a factory called create returning the extension implementation
- :return:
- """
- return AffinityPropagation(data, damping, factor, preference, distance)
-
-########################################################################################################################
-
-# from timeit import default_timer as timer
-
-if __name__ == '__main__':
- np.random.seed(200)
- # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
- # data = np.array([np.random.rand(8000) * 4 - 2 for _ in range(500)])
- # data = np.array([[0.9],[1],[1.1],[10],[11],[12],[20],[21],[22]])
- data = np.array([1,1.1,5,8,5.2,8.3])
-
- s = timer()
- aff = AffinityPropagation(data, 0.9, 1.0, 'median', 'euclidean')
- result = aff.run()
- e = timer()
- print(result)
- print('time elapsed: {}'.format(e - s))
-
diff --git a/clustering_api.py b/clustering_api.py
deleted file mode 100644
index 3d9941c..0000000
--- a/clustering_api.py
+++ /dev/null
@@ -1,148 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.1'
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-# libraries
-
-# use flask library for server activities
-import flask
-# load services (that are executed by the server when certain website is called)
-from clustering_service import *
-
-# create new flask application for hosting namespace
-app = flask.Flask(__name__)
-
-########################################################################################################################
-
-@app.route('/kmeans/<k>/<initMethod>/<distance>/<datasetID>')
-def kmeansClustering(k, initMethod, distance, datasetID):
- """
- Access k-means clustering plugin.
- :param k: number of clusters
- :param initMethod: initialization method for initial clusters
- :param distance: distance measurement
- :param datasetID: identifier of data set
- :return: jsonified output
- """
- try:
- data = loadData(datasetID)
- response = runKMeans(data, int(k), initMethod, distance)
- return flask.jsonify(response)
- except:
- return flask.jsonify({})
-
-########################################################################################################################
-
-@app.route('/hierarchical/<k>/<method>/<distance>/<datasetID>')
-def hierarchicalClustering(k, method, distance, datasetID):
- """
- Access hierarchical clustering plugin.
- :param k: number of desired clusters
- :param method: type of single linkage
- :param distance: distance measurement
- :param datasetID: identifier of data set
- :return: jsonified output
- """
- try:
- data = loadData(datasetID)
- response = runHierarchical(data, int(k), method, distance)
- return flask.jsonify(response)
- except:
- return flask.jsonify({})
-
-########################################################################################################################
-
-@app.route('/affinity/<damping>/<factor>/<preference>/<distance>/<datasetID>')
-def affinityPropagationClustering(damping, factor, preference, distance, datasetID):
- """
- Access affinity propagation clustering plugin.
- :param damping:
- :param factor:
- :param preference:
- :param distance: distance measurement
- :param datasetID:
- :return:
- """
- try:
- data = loadData(datasetID)
- response = runAffinityPropagation(data, float(damping), float(factor), preference, distance)
- return flask.jsonify(response)
- except:
- return flask.jsonify({})
-
-########################################################################################################################
-
-@app.route('/fuzzy/<numClusters>/<m>/<threshold>/<distance>/<datasetID>')
-def fuzzyClustering(numClusters, m, threshold, distance, datasetID):
- """
- :param numClusters:
- :param m:
- :param threshold:
- :param distance:
- :param datasetID:
- :return:
- """
- try:
- data = loadData(datasetID)
- response = runFuzzy(data, int(numClusters), float(m), float(threshold), distance)
- return flask.jsonify(response)
- except:
- return flask.jsonify({})
-
-########################################################################################################################
-
-def loadAttribute(jsonData, attr):
- import json
- data = json.loads(jsonData)
- if attr in data:
- return data[attr]
- else:
- return None
-
-########################################################################################################################
-
-@app.route('/distances/<metric>/<datasetID>/<sorted>', methods=['POST'])
-def getDistances(metric, datasetID, sorted):
- """
- Compute the distances of the current stratification values to its centroid.
- :param metric:
- :param datasetID:
- :return: distances and labels sorted in ascending order
- """
- data = loadData(datasetID)
- labels = []
- externLabels = None
-
- if 'group' in flask.request.values:
- labels = loadAttribute(flask.request.values['group'], 'labels')
- externLabels = loadAttribute(flask.request.values['group'], 'externLabels')
- else:
- return ''
-
- response = getClusterDistances(data, labels, metric, externLabels, sorted)
- return flask.jsonify(response)
-
-########################################################################################################################
-
-@app.route('/dendrogram/<numClusters>/<datasetID>', methods=['POST'])
-def dendrogramClusters(numClusters, datasetID):
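- """
- Cut the dendrogram of a previous hierarchical clustering (passed as the
- 'group' attribute of the POST body) into numClusters clusters.
- :param numClusters: number of desired clusters
- :param datasetID: identifier of data set
- :return: jsonified output
- """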
- data = loadData(datasetID)
-
- if 'group' in flask.request.values:
- dendrogram = loadAttribute(flask.request.values['group'], 'dendrogram')
- else:
- return ''
-
- response = getClustersFromDendrogram(data, dendrogram, int(numClusters))
- return flask.jsonify(response)
-
-
-########################################################################################################################
-
-def create():
- """
- Standard Caleydo convention for creating the service when server is initialized.
- :return: Returns implementation of this plugin with given name
- """
- return app
diff --git a/clustering_fuzzy.py b/clustering_fuzzy.py
deleted file mode 100644
index ff4a607..0000000
--- a/clustering_fuzzy.py
+++ /dev/null
@@ -1,223 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.3'
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-# libraries
-
-# module to load own configurations
-import caleydo_server.config
-# request config if needed for the future
-config = caleydo_server.config.view('caleydo-clustering')
-
-# library to conduct matrix/vector calculus
-import numpy as np
-
-from clustering_util import similarityMeasurement
-
-########################################################################################################################
-# class definition
-
-class Fuzzy(object):
- """
- Formulas: https://en.wikipedia.org/wiki/Fuzzy_clustering
- """
-
- def __init__(self, obs, numClusters, m=2.0, threshold=-1, distance='euclidean', init=None, error=0.0001):
- """
- Initializes algorithm.
- :param obs: observation matrix / genomic data
- :param numClusters: number of clusters
- :param m: fuzzifier, controls degree of fuzziness, from [1; inf]
- :return:
- """
- # observation
- self.__obs = np.nan_to_num(obs)
-
- self.__n = obs.shape[0]
-
- # fuzzifier value
- self.__m = np.float(m)
- # number of clusters
- self.__c = numClusters
-
- # matrix u containing all the weights describing the degree of membership of each patient to the centroid
- if init is None:
- init = np.random.rand(self.__c, self.__n)
-
- self.__u = np.copy(init)
-
- # TODO! scikit normalizes the values at the beginning and at each step to [0; 1]
- self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))).astype(np.float64)
- # remove all zero values and set them to smallest possible value
- self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
- # centroids
- self.__centroids = np.zeros(self.__c)
- # threshold for stopping criterion
- self.__error = error
- # distance function
- self.__distance = distance
-
- # threshold or minimum probability used for cluster assignments
- if threshold == -1:
- self.__threshold = 1.0 / numClusters
- else:
- self.__threshold = threshold
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __call__(self):
- """
- Caller function for server API
- :return:
- """
- return self.run()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def computeCentroid(self):
- """
- Compute the new centroids using the computed partition matrix.
- :return:
- """
- uM = self.__u ** self.__m
-
- sumDataWeights = np.dot(uM, self.__obs)
- if self.__obs.ndim == 1:
- m = 1
- else:
- m = self.__obs.shape[1]
-
- sumWeights = np.sum(uM, axis=1)
- # tile array (sum of weights repeated in every row)
- sumWeights = np.ones((m, 1)).dot(np.atleast_2d(sumWeights)).T
-
- if self.__obs.ndim == 1:
- sumWeights = sumWeights.flatten()
-
- # divide by total sum to get new centroids
- self.__centroids = sumDataWeights / sumWeights
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def computeCoefficients(self):
- """
- Compute new partition matrix / weights describing the degree of membership of each patient to all clusters.
- :return:
- """
-
- # TODO you can also use cdist of scipy.spatial.distance module
- distMat = np.zeros((self.__c, self.__n))
-
- for ii in range(self.__c):
- distMat[ii] = similarityMeasurement(self.__obs, self.__centroids[ii], self.__distance)
-
- # set zero values to smallest values to prevent inf results
- distMat = np.fmax(distMat, np.finfo(np.float64).eps)
-
- # apply coefficient formula
- denom = np.float(self.__m - 1.0)
- self.__u = distMat ** (-2.0 / denom)
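-
- # raising the distances to the power -2/(m-1) and then normalizing each
- # column (below) is algebraically equivalent to the membership formula u_ij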
-
- sumCoeffs = np.sum(self.__u, axis=0)
-
- self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(sumCoeffs))
- self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def run(self):
- """
- Perform the c-means fuzzy clustering.
- :return:
- """
- MAX_ITER = 100
- iter = 0
-
- while iter < MAX_ITER:
- # save last partition matrix
- uOld = np.copy(self.__u)
- # compute centroids with given weights
- self.computeCentroid()
- # compute new coefficient matrix
- self.computeCoefficients()
-
- # normalize weight / partition matrix u
- self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0)))
- self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
-
- # compute the difference between the old and new matrix
- epsilon = np.linalg.norm(self.__u - uOld)
-
- # stop if difference (epsilon) is smaller than the user-defined threshold
- if epsilon < self.__error:
- break
-
- iter += 1
-
- self.__end()
-
- u = self.__u.T
- # print(self.__u.T)
-
- return self.__centroids.tolist(), self.__clusterLabels, u.tolist(), self.__threshold
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __end(self):
- """
- Conducts the cluster assignments and creates the cluster label array.
- :return:
- """
- # assign patient to clusters
- # transpose to get a (n, c) matrix
- u = self.__u.T
-
- self.__labels = np.zeros(self.__n, dtype=np.int)
- self.__clusterLabels = [[] for _ in range(self.__c)]
- # gather all probabilities / degree of memberships of each patient to the clusters
- # self.__clusterProbs = [[] for _ in range(self.__c)]
- # probability that the patients belongs to each cluster
- maxProb = np.float64(self.__threshold)
-
- for ii in range(self.__n):
- # clusterID = np.argmax(u[ii])
- # self.__labels = clusterID
- # self.__clusterLabels[clusterID].append(ii)
-
- for jj in range(self.__c):
- if u[ii][jj] >= maxProb:
- clusterID = jj
- self.__labels[ii] = clusterID
- self.__clusterLabels[clusterID].append(int(ii))
-
- # for ii in range(self.__c):
- # self.__clusterLabels[ii], _ = computeClusterInternDistances(self.__obs, self.__clusterLabels[ii])
-
-########################################################################################################################
-
-def _plugin_initialize():
- """
- optional initialization method of this module, will be called once
- :return:
- """
- pass
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def create(data, numCluster, m, threshold, distance):
- """
- by convention, plugins contain a factory called create returning the extension implementation
- :return:
- """
- return Fuzzy(data, numCluster, m, threshold, distance)
-
-########################################################################################################################
-
-if __name__ == '__main__':
-
- data = np.array([[1,1,2],[5,4,5],[3,2,2],[8,8,7],[9,8,9],[2,2,2]])
- # data = np.array([1,1.1,5,8,5.2,8.3])
-
- fuz = Fuzzy(data, 3, 1.5)
- print(fuz.run())
diff --git a/clustering_hierarchical.py b/clustering_hierarchical.py
deleted file mode 100644
index b432f40..0000000
--- a/clustering_hierarchical.py
+++ /dev/null
@@ -1,462 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.3'
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-# libraries
-
-# module to load own configurations
-import caleydo_server.config
-# request config if needed for the future
-config = caleydo_server.config.view('caleydo-clustering')
-
-# library to conduct matrix/vector calculus
-import numpy as np
-# fastest distance computation by scipy
-import scipy.spatial as spt
-
-# utility functions for clustering and creating the dendrogram trees
-from clustering_util import BinaryNode, BinaryTree
-from clustering_util import similarityMeasurementMatrix
-from clustering_util import computeClusterInternDistances
-
-########################################################################################################################
-
-class Hierarchical(object):
- """
- This is an implementation of hierarchical clustering on genomic data using the Lance-Williams dissimilarity update
- to compute different distance metrics (single linkage, complete linkage, ...).
- Lance-Williams explained in: http://arxiv.org/pdf/1105.0121.pdf
- """
-
- def __init__(self, obs, method='single', distance='euclidean'):
- """
- Initializes the algorithm
- :param obs: genomic data / matrix
- :param method: linkage method
- :return:
- """
- # genomic data / matrix
- # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
- # remove all NaNs in data
- self.__obs = np.nan_to_num(obs)
-
- numGenes = np.shape(self.__obs)[0]
- self.__n = numGenes
-
- # check if dimension is 2D
- # if self.__obs.ndim == 2:
- # # obtain number of observations (rows)
- # numGenes, _ = np.shape(self.__obs)
- # self.__n = numGenes
-
- # else:
- # print("[Error]:\tdata / observations must be 2D. 1D observation arrays are not supported")
- # raise AttributeError
-
- # distance measurement
- self.__distance = distance
-
- # distance / proximity matrix
- self.__d = []
- self.__computeProximityMatrix()
- # dictionary mapping the string id (i,j,k,...) of clusters to corresponding index in matrix
- self.__idMap = {}
- # inverse mapping of idMap --> returns the string id given a certain index
- self.__keyMap = {}
- # contains actual index of all clusters, old clusters are from [0, n - 1], new clusters have indices in range
- # [n, 2n - 1]
- self.__clusterMap = {}
- for ii in range(self.__n):
- self.__idMap[str(ii)] = ii
- self.__keyMap[ii] = str(ii)
- self.__clusterMap[str(ii)] = ii
-
- # linkage method for hierarchical clustering
- self.__method = method
-
- # internal dendrogram tree
- self.__tree = None
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __call__(self):
- """
- Caller function for server API
- :return:
- """
- return self.run()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- @property
- def tree(self):
- return self.__tree
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __getCoefficients(self, clusterI, clusterJ):
- """
- Compute the coefficients for the Lance-Williams algorithm
- :param clusterI:
- :param clusterJ:
- :return:
- """
- # TODO! use hash map for storing numbers instead of computing them every time
- if self.__method == 'single':
- return 0.5, 0.5, 0, -0.5
- elif self.__method == 'complete':
- return 0.5, 0.5, 0, 0.5
- elif self.__method == 'weighted':
- return 0.5, 0.5, 0, 0
- elif self.__method == 'median':
- return 0.5, 0.5, -0.25, 0
-
- # TODO! ATTENTION! average method should compute the cluster centroids using the average
- # TODO! || clusterI - clusterJ || ** 2
- elif self.__method == 'average':
- nI = np.float(clusterI.count(',') + 1)
- nJ = np.float(clusterJ.count(',') + 1)
- sumN = nI + nJ
- return (nI / sumN), (nJ / sumN), 0, 0
-
- # TODO! ATTENTION! centroid method should compute the cluster centroids using the mean
- # TODO! || clusterI - clusterJ || ** 2
- elif self.__method == 'centroid':
- nI = np.float(clusterI.count(',') + 1)
- nJ = np.float(clusterJ.count(',') + 1)
- sumN = nI + nJ
- return (nI / sumN), (nJ / sumN), -((nI * nJ) / (sumN ** 2)), 0
-
- # TODO! Support ward method
- # TODO! (|clusterI| * |clusterJ|) / (|clusterI| + |clusterJ) * || clusterI - clusterJ || ** 2
- # elif self.__method == 'ward':
- # nI = np.float(clusterI.count(',') + 1)
- # nJ = np.float(clusterJ.count(',') + 1)
- # nK = np.float(clusterK.count(',') + 1)
- # sumN = nI + nJ + nK
- # return (nI + nK) / sumN, (nJ + nK) / sumN, -nK / sumN, 0
- else:
- raise AttributeError
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __computeProximityMatrix(self):
- """
- Compute the proximity of each observation and store the results in a nxn matrix
- :return:
- """
-
- # create distance matrix of size n x n
- self.__d = np.zeros((self.__n, self.__n))
-
- # compute euclidean distance
- # TODO! implement generic distance functions
- # TODO! look for an alternative proximity analysis without computing all distances
- self.__d = similarityMeasurementMatrix(self.__obs, self.__distance)
-
- # sentinel value larger than any distance in the matrix
- self.__maxValue = self.__d.max() + 1
-
- # fill diagonals with max value to exclude them from min dist process
- # TODO! operate only on upper triangle matrix of distance matrix
- np.fill_diagonal(self.__d, self.__maxValue)
-
- # print('\t-> finished.')
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __getMatrixMinimumIndices(self):
- """
- Searches for the minimum distance in the distance matrix
- :return: indices of both clusters having the smallest distance
- """
- minDist = self.__d.min()
- minList = np.argwhere(self.__d == minDist)
-
- minI, minJ = 0, 0
-
- # look for indices, where i < j
- # TODO! for the future --> use upper triangle matrix
- for ii in range(len(minList)):
- minI, minJ = minList[ii]
- if minI < minJ:
- break
-
- if minI == minJ:
- print("ERROR")
-
- return self.__keyMap[minI], self.__keyMap[minJ], minDist
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __deleteClusters(self, i, j):
- """
- Reorders and reduces the matrix to insert the new cluster formed of cluster i and j
- and its distance values, and removes the old clusters by cutting the last row.
- :param i: cluster index i
- :param j: cluster index j
- :return:
- """
- idI = self.__idMap[str(i)]
- idJ = self.__idMap[str(j)]
-
- minID = min(idI, idJ)
- maxID = max(idI, idJ)
-
- # now set column max ID to last column -> swap last and i column
- lastRow = self.__d[self.__n - 1]
- self.__d[maxID] = lastRow
- self.__d[:, maxID] = self.__d[:, (self.__n - 1)]
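-
- # swapping the last row/column into position maxID keeps the matrix dense,
- # so the last row/column can simply be cut off afterwards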
-
- # set key of last column (cluster) to column of the cluster with index maxID
- key = self.__keyMap[self.__n - 1]
- self.__idMap[key] = maxID
- self.__keyMap[maxID] = key
-
- # delete entries in id and key map --> not required anymore
- try:
- del self.__idMap[i]
- del self.__idMap[j]
- del self.__keyMap[self.__n - 1]
- except KeyError:
- print("\nERROR: Key {} not found in idMap".format(j))
- print("ERROR: Previous key: {} in idMap".format(i))
- print("Given keys: ")
- for key in self.__idMap:
- print(key)
- return
-
- # reduce dimension of matrix by one column and row
- self.__n -= 1
- self.__d = self.__d[:-1, :-1]
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __mergeClusters(self, i, j):
- """
- Merges cluster i and j, computes the new ID and distances of the newly formed cluster
- and stores required information
- :param i: cluster index i
- :param j: cluster index j
- :return:
- """
- idI = self.__idMap[str(i)]
- idJ = self.__idMap[str(j)]
-
- minID = min(idI, idJ)
- maxID = max(idI, idJ)
-
- # use Lance-Williams formula to compute linkages
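- # d(i u j, k) = ai * d(i,k) + aj * d(j,k) + b * d(i,j) + y * |d(i,k) - d(j,k)|
- # with the coefficients (ai, aj, b, y) chosen per linkage method above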
- DKI = self.__d[:, minID]
- DKJ = self.__d[:, maxID]
- DIJ = self.__d[minID, maxID]
- distIJ = np.abs(DKI - DKJ)
-
- # compute coefficients
- ai, aj, b, y = self.__getCoefficients(i, j)
-
- newEntries = ai * DKI + aj * DKJ + b * DIJ + y * distIJ
- newEntries[minID] = self.__maxValue
- newEntries[maxID] = self.__maxValue
-
- # add new column and row
- self.__d[minID] = newEntries
- self.__d[:, minID] = newEntries
-
- idIJ = minID
- newKey = i + ',' + j
- self.__idMap[newKey] = idIJ
- self.__keyMap[idIJ] = newKey
- self.__clusterMap[newKey] = len(self.__clusterMap)
-
- # delete old clusters
- self.__deleteClusters(i, j)
-
- # count number of elements
- return newKey.count(',') + 1
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def run(self):
- """
- Conducts the algorithm until there's only one cluster.
- :return:
- """
-
- # number of the current iteration
- m = 0
-
- # resulting matrix containing information Z[i,x], x=0: cluster i, x=1: cluster j, x=2: dist(i,j), x=3: num(i,j)
- runs = self.__n - 1
- Z = np.array([[0 for _ in range(4)] for _ in range(runs)], dtype=np.float)
-
- while m < runs:
- m += 1
-
- i, j, distIJ = self.__getMatrixMinimumIndices()
- numIJ = self.__mergeClusters(i, j)
-
- clusterI, clusterJ = self.__clusterMap[i], self.__clusterMap[j]
- Z[m - 1] = [int(min(clusterI, clusterJ)), int(max(clusterI, clusterJ)), np.float(distIJ), int(numIJ)]
-
- # reset number n to length of first dimension (number of genes)
- self.__n = np.shape(self.__obs)[0]
-
- self.__tree = self.generateTree(Z)
- return Z.tolist()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def generateTree(self, linkageMatrix):
- """
- Computes the dendrogram tree for a given linkage matrix.
- :param linkageMatrix:
- :return:
- """
- self.__tree = None
-
- treeMap = {}
- numTrees = len(linkageMatrix)
-
- for ii in range(numTrees):
- entry = linkageMatrix[ii]
- currentID = self.__n + ii
- leftIndex, rightIndex, value, num = int(entry[1]), int(entry[0]), entry[2], int(entry[3])
- left = right = None
-
- if leftIndex < self.__n:
- left = BinaryNode(self.__obs[leftIndex].tolist(), leftIndex, 1, None, None)
- else:
- left = treeMap[leftIndex]
-
- if rightIndex < self.__n:
- right = BinaryNode(self.__obs[rightIndex].tolist(), rightIndex, 1, None, None)
- else:
- right = treeMap[rightIndex]
-
- if isinstance(left, BinaryNode) and isinstance(right, BinaryNode):
- treeMap[currentID] = BinaryTree(left, right, currentID, value)
- elif isinstance(left, BinaryNode):
- treeMap[currentID] = right.addNode(left, currentID, value)
- del treeMap[rightIndex]
- elif isinstance(right, BinaryNode):
- treeMap[currentID] = left.addNode(right, currentID, value)
- del treeMap[leftIndex]
- else:
- treeMap[currentID] = left.merge(right, currentID, value)
- del treeMap[rightIndex]
- del treeMap[leftIndex]
-
- self.__tree = treeMap[numTrees + self.__n - 1]
- return self.__tree
-
- # ------------------------------------------------------------------------------------------------------------------
-
-########################################################################################################################
-
-from clustering_util import cutJsonTreeByClusters
-
-def getClusters(k, obs, dendrogram, sorted=True):
- """
- First implementation that cuts the dendrogram tree automatically by choosing the nodes with the greatest
- node values, i.e. the largest distance to the other node / potential cluster
- :param k: number of desired clusters
- :param obs: set of observations
- :param dendrogram: dendrogram tree
- :return: centroids, sorted cluster labels and normal label list
- """
- obs = np.nan_to_num(obs)
- n = obs.shape[0]
-
- if isinstance(dendrogram, BinaryTree):
- clusterLabels = dendrogram.cutTreeByClusters(k)
- else:
- clusterLabels = cutJsonTreeByClusters(dendrogram, k)
-
- clusterCentroids = []
- labels = np.zeros(n, dtype=np.int)
- clusterID = 0
-
- for ii in range(len(clusterLabels)):
- cluster = clusterLabels[ii]
- subObs = obs[cluster]
- clusterCentroids.append(np.mean(subObs, axis=0).tolist())
-
- for id in cluster:
- labels[id] = clusterID
-
- # sort labels according to their distance
- if sorted:
- clusterLabels[ii], _ = computeClusterInternDistances(obs, cluster)
-
- clusterID += 1
-
- return clusterCentroids, clusterLabels, labels.tolist()
-
-########################################################################################################################
-
-def _plugin_initialize():
- """
- optional initialization method of this module, will be called once
- :return:
- """
- pass
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def create(data, method, distance):
- """
- by convention, plugins contain a factory called create returning the extension implementation
- :return:
- """
- return Hierarchical(data, method, distance)
-
-########################################################################################################################
-
-from timeit import default_timer as timer
-from scipy.cluster.hierarchy import linkage, leaves_list
-
-if __name__ == '__main__':
-
- np.random.seed(200)
- # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
- data = np.array([1,1.1,5,8,5.2,8.3])
-
- timeMine = 0
- timeTheirs = 0
-
-
- n = 10
-
- for i in range(n):
- # data = np.array([np.random.rand(6000) * 4 - 2 for _ in range(249)])
- # import time
- s1 = timer()
- hier = Hierarchical(data, 'complete')
- # s = time.time()
- linkageMatrix = hier.run()
- e1 = timer()
- print(linkageMatrix)
- tree = hier.generateTree(linkageMatrix)
- # print(tree.getLeaves())
- # print(tree.jsonify())
- # print(hier.getClusters(3))
- import json
- jsonTree = json.loads(tree.jsonify())
- getClusters(3, data, jsonTree)
-
-
- s2 = timer()
- linkageMatrix2 = linkage(data, 'complete')
- # print(leaves_list(linkageMatrix2))
- e2 = timer()
-
- timeMine += e1 - s1
- timeTheirs += e2 - s2
-
- # print(linkageMatrix)
- # print(linkageMatrix2)
- print('mine: {}'.format(timeMine / n))
- print('theirs: {}'.format(timeTheirs / n))
-
diff --git a/clustering_kmeans.py b/clustering_kmeans.py
deleted file mode 100644
index 5e5d2a2..0000000
--- a/clustering_kmeans.py
+++ /dev/null
@@ -1,398 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.2'
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-# libraries
-
-# module to load own configurations
-import caleydo_server.config
-# request config if needed in the future
-config = caleydo_server.config.view('caleydo-clustering')
-
-# numpy important to conduct matrix/vector calculus
-import numpy as np
-# creates random numbers
-import random
-
-# contains utility functions
-from clustering_util import weightedChoice, similarityMeasurement, computeClusterInternDistances
-
-########################################################################################################################
-
-class KMeans:
- """
- This is an implementation of the k-means algorithm to cluster genomic data / matrices.
- Returns the centroids, the labels / stratification of each row belonging to one cluster,
- distance matrix for cluster-cluster distance and distance arrays for row-clusterCentroid distance.
- Implementation detail:
- """
-
- def __init__(self, obs, k, initMode='kmeans++', distance='sqeuclidean', iters=1000):
- """
- Initializes the algorithm with observation, number of k clusters, the initial method and
- the maximum number of iterations.
- Initialization method of random cluster choice can be: forgy, uniform, random, plusplus
- :param obs: genomic data / matrix
- :param k: number of clusters
- :param initMode: initialization method
- :param distance: distance measurement
- :param iters: number of maximum iterations
- :return:
- """
-
- # number of clusters
- self.__k = k
- # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
- # remove all NaNs in data
- self.__obs = np.nan_to_num(obs)
- # number of observations / genes
- self.__n = np.shape(obs)[0]
- # maps the element ids to clusters
- self.__labelMap = np.zeros(self.__n, dtype=np.int)
- # cluster means and number of elements
- self.__clusterMeans = np.array([obs[0] for _ in range(k)], dtype=np.float)
- self.__clusterNums = np.array([0 for _ in range(k)], dtype=np.int)
- # tells if any cluster has changed or rather if any data item was moved
- self.__changed = True
- # number of iterations
- self.__iters = iters
- # initialization method
- self.__initMode = initMode
- # compare function
- self.__distance = distance
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __call__(self):
- """
- Caller function for server API.
- """
- return self.run()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __init(self):
- """
- Initialize clustering with random clusters using a user-specified method
- :return:
- """
- # TODO! consider to init k-Means algorithm with Principal Component Analysis (PCA)
- # TODO! see
- # init cluster
- if self.__initMode == 'forgy':
- self.__forgyMethod()
- elif self.__initMode == 'uniform':
- self.__uniformMethod()
- elif self.__initMode == 'random':
- self.__randomMethod()
- elif self.__initMode == 'kmeans++':
- self.__plusplusMethod()
- else:
- raise AttributeError
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __forgyMethod(self):
- """
- Initialization method:
- Randomly choose k observations from the data using a uniform random distribution.
- :return:
- """
- for ii in range(self.__k):
- self.__clusterMeans[ii] = (self.__obs[random.randint(0, self.__n - 1)])
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __uniformMethod(self):
- """
- Initialization method:
- Randomly assign each observation to one of the k clusters using uniform random distribution
- and compute the centroids of each cluster.
- :return:
- """
- for i in range(self.__n):
- self.__labelMap[i] = random.randint(0, self.__k - 1)
-
- self.__update()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __randomMethod(self):
- """
- Initialization method:
- Randomly choose k observations from the data by estimating the mean and standard deviation of the data and
- using the gaussian random distribution.
- :return:
- """
- mean = np.mean(self.__obs, axis=0)
- std = np.std(self.__obs, axis=0)
-
- for ii in range(self.__k):
- self.__clusterMeans[ii] = np.random.normal(mean, std)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __plusplusMethod(self):
- """
- Initialization method:
- Chooses k observations by computing probabilities for each observation and using a weighted random distribution.
- Algorithm: . This method should accelerate the algorithm by finding
- the appropriate clusters right at the beginning and hence should make it more robust.
- :return:
- """
- # 1) choose random center out of data
- self.__clusterMeans[0] = (random.choice(self.__obs))
-
- maxValue = np.max(self.__obs) + 1
- probs = np.array([maxValue for _ in range(self.__n)])
-
- for i in range(1, self.__k):
- probs.fill(maxValue)
- # compute new probabilities, choose min of all distances
- for j in range(0, i):
- dists = similarityMeasurement(self.__obs, self.__clusterMeans[j], self.__distance)
- # collect minimum squared distances to cluster centroids
- probs = np.minimum(probs, dists)
-
- # sum all squared distances
- sumProbs = np.float(np.sum(probs))
-
- if sumProbs != 0:
- probs /= sumProbs
- # 3) choose new center based on probabilities
- self.__clusterMeans[i] = (self.__obs[weightedChoice(probs)])
- else:
- print('ERROR: cannot find enough cluster centroids for given k = ' + str(self.__k))
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def getClusterMean(self, num):
- """
- Returns the centroid of the cluster with index num.
- :param num:
- :return:
- """
- if num >= self.__k:
- return None
- else:
- return self.__clusterMeans[num]
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def getClusterOfElement(self, index):
- """
- :param index: number of element in observation array
- :return: cluster id of observation with given index.
- """
- if index >= self.__n:
- return None
- else:
- return self.__labelMap[index]
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def printClusters(self):
- """
- Print the cluster centroids and the labels.
- :return:
- """
- print('Centroids: ' + str(self.__centroids) + ' | Labels: ' + str(self.__labels))
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __assignment(self):
- """
- Assignment step:
- Compute distance of current observation to each cluster centroid and move gene to the nearest cluster.
- :return:
- """
- for i in range(self.__n):
- value = self.__obs[i]
-
- # compute squared distances to each mean
- dists = similarityMeasurement(self.__clusterMeans, value, self.__distance)
- # nearest cluster
- nearestID = np.argmin(dists)
-
- if self.__labelMap[i] != nearestID:
- self.__changed = True
- self.__labelMap[i] = nearestID
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __update(self):
- """
- Update step:
- Compute the new centroids of each cluster after the assignment.
- :return:
- """
- self.__clusterMeans.fill(0)
- self.__clusterNums.fill(0)
-
- self.__clusterLabels = [[] for _ in range(self.__k)]
-
- for ii in range(self.__n):
- clusterID = self.__labelMap[ii]
- self.__clusterLabels[clusterID].append(ii)
- self.__clusterNums[clusterID] += 1
-
- for ii in range(self.__k):
- self.__clusterMeans[ii] = np.mean(self.__obs[self.__clusterLabels[ii]], axis=0)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __end(self):
- """
- Writes the results to the corresponding member variables.
- :return:
- """
- # returned values | have to be reinitialized in case of sequential running
- # centroids
- self.__centroids = np.array([self.__obs[0] for _ in range(self.__k)], dtype=np.float)
- # labels of observations
- self.__labels = np.array([0 for _ in range(self.__n)], dtype=np.int)
- # distances between centroids
- # self.__centroidDistMat = np.zeros((self.__k, self.__k))
-
- # we do not use OrderedDict here, so obtain dict.values and fill array manually
- for index in range(self.__n):
- clusterID = self.__labelMap[index]
- self.__labels[index] = clusterID
-
- # collect centroids
- for ii in range(self.__k):
- # self.__centroids.append(self.__clusterMeans[ii].tolist())
- self.__centroids[ii] = self.__clusterMeans[ii]
-
- # compute distances between each centroids
- # for ii in range(self.__k - 1):
- # # compute indices of other clusters
- # jj = range(ii + 1, self.__k)
- # # select matrix of cluster centroids
- # centroidMat = self.__centroids[jj]
- # distances = np.sqrt(self.__compare(centroidMat, self.__centroids[ii]))
- # self.__centroidDistMat[ii, jj] = distances
- # self.__centroidDistMat[jj, ii] = distances
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def run(self):
- """
- Runs the k-means algorithm, using the initialization method and the assignment/update steps.
- Conducts at most iters iterations and terminates early if no observation
- was moved to another cluster.
- :return:
- """
- # 1) init algorithm by choosing cluster centroids
- self.__init()
-
- MAX_ITERS = self.__iters
- counter = 0
- # 2) run clustering
- while self.__changed and counter < MAX_ITERS:
- self.__changed = False
-
- self.__assignment()
- self.__update()
-
- counter += 1
-
- self.numIters = counter
-
- # write results to the class members
- self.__end()
- return self.__centroids.tolist(), self.__labels.tolist(), self.__clusterLabels
- #, self.__centroidDistMat.tolist()
-
- # ------------------------------------------------------------------------------------------------------------------
-
- # def getDistsPerCentroid(self):
- # """
- # Compute the distances between observations belonging to one cluster and the corresponding cluster centroid.
- # Cluster labels are sorted in ascending order using their distances
- # :return: array of distance arrays for each cluster and ordered labels
- # """
- #
- # # labels per centroid
- # # self.__clusterLabels = [[] for _ in range(self.__k)]
- # # distances of obs to their cluster
- # self.__centroidDists = [[] for _ in range(self.__k)]
- #
- # for ii in range(self.__k):
- # self.__clusterLabels[ii] = np.array(self.__clusterLabels[ii], dtype=np.int)
- #
- # # compute euclidean distances of values to cluster mean
- # for ii in range(self.__k):
- # mean = self.__clusterMeans[ii]
- # obs = self.__obs[self.__clusterLabels[ii]]
- # dists = similarityMeasurement(obs, mean, self.__compare).tolist()
- # self.__centroidDists[ii] = dists
- #
- # # sort indices in ascending order using the distances
- # indices = range(len(dists))
- # indices.sort(key=dists.__getitem__)
- # self.__clusterLabels[ii] = self.__clusterLabels[ii][indices].tolist()
- # self.__centroidDists[ii].sort()
- #
- # return self.__clusterLabels, self.__centroidDists
-
-########################################################################################################################
-
-def _plugin_initialize():
- """
- optional initialization method of this module, will be called once
- :return:
- """
- pass
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def create(data, k, initMethod, distance):
- """
- By convention, this module contains a factory called create that returns the extension implementation.
- :return:
- """
- return KMeans(data, k, initMethod, distance)
-
-########################################################################################################################
-
-from timeit import default_timer as timer
-from scipy.cluster.vq import kmeans2, kmeans
-
-"""
-This is for testing the algorithm and comparing the results between this implementation and scipy's
-"""
-if __name__ == '__main__':
- from datetime import datetime
- #np.random.seed(datetime.now())
- # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
- data = np.array([1,1.1,5,8,5.2,8.3])
-
- # data = np.array([np.random.rand(2) * 5 for _ in range(10)])
- k = 3
-
- timeMine = 0
- timeTheirs = 0
- n = 10
-
- for i in range(10):
- s1 = timer()
- kMeansPlus = KMeans(data, k, 'kmeans++', 'sqeuclidean', 10)
- result1 = kMeansPlus.run()
- #print(result)
- e1 = timer()
- # labels = kMeansPlus.getDistsPerCentroid()
- # l, d = computeClusterDistances(data, labels[0])
-
- s2 = timer()
- result2 = kmeans2(data, k)
- e2 = timer()
-
- timeMine += e1 - s1
- timeTheirs += e2 - s2
-
- print(result1)
- print(result2)
- print('mine: {}'.format(timeMine / n))
- print('theirs: {}'.format(timeTheirs / n))
diff --git a/clustering_service.py b/clustering_service.py
deleted file mode 100644
index 0bed35c..0000000
--- a/clustering_service.py
+++ /dev/null
@@ -1,151 +0,0 @@
-__author__ = 'Michael Kern'
-__version__ = '0.0.1'
-__email__ = 'kernm@in.tum.de'
-
-import numpy as np
-from clustering_hierarchical import getClusters
-
-########################################################################################################################
-
-def loadData(datasetID):
- """
- Loads the genomic data with given identifier datasetID.
- :param datasetID: identifier
- :return: array of the genomic data
- """
- import caleydo_server.dataset as dt
- # obtain Caleydo dataset from ID
- dataset = dt.get(datasetID)
- # choose loaded attribute and load raw data in numpy format
- # somehow hack to get a numpy array out of the data
- try:
- arr = np.array(list(dataset.asnumpy()))
- except:
- raise Exception
- return arr
-
-########################################################################################################################
-
-def loadPlugin(pluginID, *args, **kwargs):
- """
- Loads the clustering plugin with given arguments.
- :param pluginID: identifier of plugin
- :param *args: additional caller function arguments
- :param **kwargs: additional arguments
- :return: plugin
- """
- import caleydo_server.plugin
- # obtain all plugins with 'pluginID' extension
- plugins = caleydo_server.plugin.list('clustering')
- # choose plugin with given ID
- for plugin in plugins:
- if plugin.id == pluginID:
- # load the implementation of the plugin
- return plugin.load().factory(*args, **kwargs)
-
- raise NotImplementedError
-
-
-########################################################################################################################
-
-def runKMeans(data, k, initMethod, distance):
- """
- Runs the k-Means clustering algorithm given the loaded data set, the number of clusters k and the initialization
- method.
- :param data: observation matrix
- :param k: number of clusters
- :param initMethod: initialization method
- :param distance: distance measurement
- :return: result of k-means
- """
- KMeans = loadPlugin('caleydo-clustering-kmeans', data, k, initMethod, distance)
- # and run the kmeans extension
- centroids, labels, clusterLabels = KMeans()
- # clusterLabels, clusterDists = KMeans.getDistsPerCentroid()
-
- return {'centroids': centroids, 'clusterLabels': clusterLabels}
-
-########################################################################################################################
-
-def runHierarchical(data, k, method, distance):
- """
- Runs the hierarchical clustering algorithm given the loaded data set and type of linkage method.
- :param data: observation matrix
- :param k: number of desired clusters
- :param method: linkage method
- :param distance: distance measurement
- :return: linkage matrix / dendrogram of the algorithm
- """
- Hierarchical = loadPlugin('caleydo-clustering-hierarchical', data, method, distance)
- # and use the extension
- Hierarchical()
- # obtain k-number of clusters
- centroids, clusterLabels, labels = getClusters(k, data, Hierarchical.tree, False)
-
- return {'centroids': centroids, 'clusterLabels': clusterLabels, 'dendrogram': Hierarchical.tree.json()}
- # print('\t-> creating dendrogram tree...')
- # tree = Hierarchical.generateTree(linkage)
- # print('\t-> creating json string ...')
- # dendrogram = tree.jsonify()
- # print('\t-> finished.')
-
- # return {'dendrogram': dendrogram} --> if needed later
-
-########################################################################################################################
-
-def runAffinityPropagation(data, damping, factor, preference, distance):
- """
- Runs the affinity propagation algorithm given the loaded dataset, a damping value, a certain factor and
- a preference method.
- :param data:
- :param damping:
- :param factor:
- :param preference:
- :return:
- """
- Affinity = loadPlugin('caleydo-clustering-affinity', data, damping, factor, preference, distance)
- # use this extension
- centroids, labels, clusterLabels = Affinity()
-
- return {'centroids': centroids, 'clusterLabels': clusterLabels}
-
-########################################################################################################################
-
-def runFuzzy(data, numClusters, m, threshold, distance):
- Fuzzy = loadPlugin('caleydo-clustering-fuzzy', data, numClusters, m, threshold, distance)
-
- centroids, clusterLabels, partitionMatrix, maxProb = Fuzzy()
-
- return {'centroids': centroids, 'clusterLabels': clusterLabels, 'partitionMatrix': partitionMatrix,
- 'maxProbability': maxProb}
-
-########################################################################################################################
-
-def getClusterDistances(data, labels, metric, externLabels = None, sorted = True):
- """
- Compute the cluster distances in the given data among certain rows (labels)
- :param data: genomic data
- :param labels: indices of rows
- :param metric: distance metric
- :param externLabels:
- :return: labels and distances values sorted in ascending order
- """
- from clustering_util import computeClusterInternDistances, computeClusterExternDistances
- distLabels, distValues = computeClusterInternDistances(data, labels, sorted, metric)
-
- if externLabels is not None:
- externDists = computeClusterExternDistances(data, distLabels, externLabels, metric)
- return {'labels': distLabels, 'distances': distValues, 'externDistances': externDists}
- else:
- return {'labels': distLabels, 'distances': distValues}
-
-########################################################################################################################
-
-def getClustersFromDendrogram(data, dendrogram, numClusters):
- """
-
- :param data:
- :param dendrogram:
- :param numClusters:
- :return:
- """
-
- centroids, clusterLabels, _ = getClusters(numClusters, data, dendrogram)
- return {'centroids': centroids, 'clusterLabels': clusterLabels}
diff --git a/clustering_util.py b/clustering_util.py
deleted file mode 100644
index 575a0d2..0000000
--- a/clustering_util.py
+++ /dev/null
@@ -1,470 +0,0 @@
-__author__ = "Michael Kern"
-__email__ = 'kernm@in.tum.de'
-
-########################################################################################################################
-
-import random
-import numpy as np
-
-# use scipy to compute different distance matrices
-from scipy.spatial.distance import pdist, squareform
-import scipy.stats as stats
-
-"""
-http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python
---> good explanation of how to create weighted choices / random numbers
-"""
-def weightedChoice(weights):
-
- # compute sum of all weights
- sumTotal = sum(weights)
- # compute a random with range[0, sumTotal]
- rnd = random.random() * sumTotal
-
- for index, weight in enumerate(weights):
- # subtract current weight from random to find current index
- rnd -= weight
- if rnd < 0:
- return index
-
- # 20% faster if weights are sorted in descending order
-
-########################################################################################################################
-
-"""
-Implementation of a binary tree for hierarchical clustering
-"""
-
-"""
-Node of the tree containing information about its id in the data, its children, and its value
-"""
-class BinaryNode:
- def __init__(self, value, id, size, leftChild, rightChild):
- self.value = value
- self.left = leftChild
- self.right = rightChild
- self.size = size
- self.id = id
- self.parent = None
- self.indices = [id]
-
- # create json info on the fly
- self.json = {"id": self.id, "size": self.size, "value": self.value, "indices": [id]}
- if leftChild is not None and rightChild is not None:
- # self.json["value"] = np.mean(self.value)
- self.json["children"] = [rightChild.json, leftChild.json]
- self.indices = [] + rightChild.indices + leftChild.indices
- self.json["indices"] = self.indices
-
- def isLeave(self):
- return self.left is None and self.right is None
-
-########################################################################################################################
-
-"""
-Implementation of a hierarchical binary tree
-"""
-class BinaryTree:
- # this tree must not be empty and must have at least two children (leaves)
- def __init__(self, leftNode, rightNode, newID, newValue):
- self.__createNewRoot(leftNode, rightNode, newID, newValue)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def addNode(self, newNode, newID, newValue):
- self.__createNewRoot(self.root, newNode, newID, newValue)
- return self
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def merge(self, tree, newID, newValue):
- self.__createNewRoot(self.root, tree.root, newID, newValue)
- return self
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def jsonify(self):
- import json
- return json.dumps(self.root.json)
- # return self.root.json
- # return self.__traverseJson(self.root)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def json(self):
- return self.root.json
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def cutTreeByClusters(self, k):
- queue = [self.root]
-
- while len(queue) < k:
- node = queue.pop(0)
- queue.append(node.left)
- queue.append(node.right)
-
- def keyFunc(x):
- if x.isLeave():
- return 0
- else:
- return -x.value
-
- queue.sort(key=keyFunc)
-
- clusters = []
-
- for node in queue:
- clusters.append(node.indices)
-
- return clusters
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __traverseJson(self, node):
- json = {"id": node.id, "size": node.size, "value": node.value}
- if node.left is None and node.right is None:
- return json
- else:
- json["children"] = [] + [self.__traverseJson(node.left)] + [self.__traverseJson(node.right)]
-
- return json
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def getLeaves(self):
- return self.root.indices
- # return self.__traverseIDs(self.root)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __traverseIDs(self, node):
-
- if node.left is None and node.right is None:
- return [node.id]
- else:
- return [] + self.__traverseIDs(node.right) + self.__traverseIDs(node.left)
-
- # ------------------------------------------------------------------------------------------------------------------
-
- def __createNewRoot(self, leftNode, rightNode, newID, newValue):
- newSize = leftNode.size + rightNode.size
- self.root = BinaryNode(newValue, newID, newSize, leftNode, rightNode)
- leftNode.parent = rightNode.parent = self.root
-
- # ------------------------------------------------------------------------------------------------------------------
-
-def cutJsonTreeByClusters(jsonData, k):
- # import json
- # tree = json.loads(jsonData)
- queue = [jsonData]
-
- while len(queue) < k:
- node = queue.pop(0)
- queue.append(node['children'][0])
- queue.append(node['children'][1])
-
- def keyFunc(x):
- if 'children' not in x:
- return 0
- else:
- return -x['value']
-
- queue.sort(key=keyFunc)
-
- clusters = []
-
- for node in queue:
- clusters.append(node['indices'])
-
- return clusters
-
-########################################################################################################################
-
-def euclideanDistance(matrix, vector, squared=False):
- """
- Computes the euclidean distance between a vector and the rows of a matrix in parallel.
- :param matrix: array of observations or clusters
- :param vector: cluster centroid or observation
- :return:
- """
-
- # compute distance between values in matrix and the vector
- distMat = matrix - vector
- numValues = len(matrix)
- distances = np.array([0.0 for _ in range(numValues)], dtype=np.float)
-
- for ii in range(numValues):
- distance = distMat[ii]
- # always try to use np.dot when computing euclidean distance
- # it's way faster than ** 2 and sum(..., axis=1)
- distances[ii] = np.dot(distance, distance)
-
- if squared:
- return distances
- else:
- return np.sqrt(distances)
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def correlationDistance(matrix, vector, method):
- """
-
- :param matrix:
- :param vector:
- :return:
- """
-
- numValues = len(matrix)
- distances = np.array([0.0 for _ in range(numValues)], dtype=np.float)
-
- for ii in range(numValues):
- value = matrix[ii]
-
- if method == 'pearson':
- distances[ii], _ = stats.pearsonr(value, vector)
- elif method == 'spearman':
- distances[ii], _ = stats.spearmanr(value, vector)
- elif method == 'kendall':
- distances[ii], _ = stats.kendalltau(value, vector)
- else:
- raise AttributeError
-
- return distances
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-from scipy.spatial.distance import cdist
-
-def similarityMeasurement(matrix, vector, method='euclidean'):
-
- if method == 'euclidean':
- return euclideanDistance(matrix, vector)
-
- if method == 'sqeuclidean':
- return euclideanDistance(matrix, vector, True)
-
- spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis',]
-
- if method in spatialMethods:
- return np.nan_to_num(cdist(matrix, np.atleast_2d(vector), method).flatten())
-
- corrMethods = ['spearman', 'pearson', 'kendall']
-
- if method in corrMethods:
- return correlationDistance(matrix, vector, method)
-
- raise AttributeError
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def euclideanDistanceMatrix(matrix, squared=False):
- """
- Compute the euclidean distance matrix required for the algorithm
- :param matrix:
- :return: euclidean distance matrix (squared distances if squared is True)
- """
-
- n = np.shape(matrix)[0]
- distMat = np.zeros((n, n))
-
- # use Gram matrix and compute distances without inner products | FASTER than row-by-row method
- "Gramiam matrix to compute dot products of each pair of elements: "
- ""
- gramMat = np.zeros((n, n))
- for ii in range(n):
- for jj in range(ii, n):
- gramMat[ii, jj] = np.dot(matrix[ii], matrix[jj])
-
- # # ! This is slower than computing dot products of rows manually in python
- # # ! And we only require the upper triangle matrix of the Gram matrix
- # gramMat = np.dot(self.__obs, self.__obs.T)
-
- # make use of formula |a - b|^2 = a^2 - 2ab + b^2
- for ii in range(n):
- # self.__d[ii, ii] = self.__maxValue
- jj = np.arange(ii + 1, n)
- distMat[ii, jj] = gramMat[ii, ii] - 2 * gramMat[ii, jj] + gramMat[jj, jj]
- distMat[jj, ii] = distMat[ii, jj]
-
- # # take square root of distances to compute real euclidean distance
- # distMat = np.sqrt(distMat)
-
- "alternative version --> use scipy's fast euclidean distance implementation: FASTEST"
- # distMat = spt.distance.pdist(self.__obs, 'euclidean')
- # self.__d = spt.distance.squareform(distMat)
- # print(distMat)
-
- if squared:
- return distMat
- else:
- return np.sqrt(distMat)
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def norm1Distance(matrix, vector):
- """
- Computes the norm-1 distance between a vector and the rows of a matrix in parallel.
- :param matrix: array of observations or clusters
- :param vector: cluster centroid or observation
- :return:
- """
- distMat = np.abs(matrix - vector)
- numValues = len(vector)
-
- distances = np.sum(distMat, axis=1) / numValues
- return distances
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def pearsonCorrelationMatrix(matrix):
- """
-
- :param matrix:
- :return: distance matrix based on 1 - Pearson correlation coefficient
- """
- # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
- distMat = 1 - np.corrcoef(matrix)
-
- return distMat
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def statsCorrelationMatrix(matrix, method):
- if method == 'pearson':
- return pearsonCorrelationMatrix(matrix)
-
- n = np.shape(matrix)[0]
- distMat = np.zeros((n, n))
-
- for ii in range(n):
- rowI = matrix[ii]
- for jj in range(ii + 1, n):
- rowJ = matrix[jj]
- corr = 0
-
- if method == 'spearman':
- corr, _ = stats.spearmanr(rowI, rowJ)
-
- if method == 'kendall':
- corr, _ = stats.kendalltau(rowI, rowJ)
-
- # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
- corr = 1 - corr
-
- distMat[ii, jj] = corr
- distMat[jj, ii] = corr
-
- return distMat
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def similarityMeasurementMatrix(matrix, method):
- """
- Generic function to determine the similarity measurement for clustering
- :param matrix:
- :param method:
- :return:
- """
- if method == 'euclidean':
- return euclideanDistanceMatrix(matrix)
- # return squareform(pdist(matrix, method))
-
- if method == 'sqeuclidean':
- return euclideanDistanceMatrix(matrix, True)
- # return squareform(pdist(matrix, method))
-
- spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis']
-
- if method in spatialMethods:
- return squareform(np.nan_to_num(pdist(matrix, method)))
-
- corrMethods = ['spearman', 'pearson', 'kendall']
-
- if method in corrMethods:
- return statsCorrelationMatrix(matrix, method)
-
- raise AttributeError
-
-
-########################################################################################################################
-# utility functions to compute distances between rows and cluster centroids
-
-def computeClusterInternDistances(matrix, labels, sorted=True, metric='euclidean'):
- """
- Computes the distances of each element in one cluster to the cluster's centroid. Returns distance values and labels
- sorted in ascending order.
- :param matrix:
- :param labels:
- :return: labels / indices of elements corresponding to distance array, distance values of cluster
- """
- clusterLabels = np.array(labels)
- if len(clusterLabels) == 0:
- return [], []
-
- subMatrix = matrix[clusterLabels]
- # compute centroid of cluster along column (as we want to average each gene separately)
- centroid = np.mean(subMatrix, axis=0)
-
- # compute distances to centroid
- dists = similarityMeasurement(subMatrix, centroid, metric)
-
- if sorted == 'true':
- # sort values
- indices = range(len(dists))
- indices.sort(key=dists.__getitem__)
- dists.sort()
-
- # reverse order if correlation coefficient is used
- # (1 means perfect correlation while -1 denotes opposite correlation)
- corrMetrics = ['pearson', 'spearman', 'kendall']
- if metric in corrMetrics:
- indices.reverse()
- dists = dists[::-1]
-
- # write back to our arrays
- distLabels = clusterLabels[indices].tolist()
- distValues = dists.tolist()
- else:
- distLabels = clusterLabels.tolist()
- distValues = dists.tolist()
-
- return distLabels, distValues
-
-# ----------------------------------------------------------------------------------------------------------------------
-
-def computeClusterExternDistances(matrix, labels, outerLabels, metric='euclidean'):
- """
- Compute the distances of patients in one cluster to the centroids of all other clusters.
- :param matrix:
- :param labels:
- :param outerLabels:
- :return:
- """
- externDists = []
- internSubMatrix = matrix[labels]
-
- for externLabels in outerLabels:
-
- if len(externLabels) == 0:
- externDists.append([])
- continue
-
- # compute centroid of external cluster
- subMatrix = matrix[externLabels]
- centroid = np.mean(subMatrix, axis=0)
-
- dists = similarityMeasurement(internSubMatrix, centroid, metric)
- externDists.append(dists.tolist())
-
- return externDists
-
-########################################################################################################################
-
-if __name__ == '__main__':
- print(cdist([[1,1,1],[3,3,3],[5,5,5]],np.atleast_2d([2,2,2]), 'sqeuclidean').flatten())
-
- from scipy.stats import spearmanr
-
- print(spearmanr([1,2,3],[2,4,1]))
diff --git a/docker_packages.txt b/docker_packages.txt
new file mode 100644
index 0000000..e69de29
diff --git a/package.json b/package.json
index 678bd7e..ab5f3a3 100644
--- a/package.json
+++ b/package.json
@@ -1,46 +1,37 @@
{
- "name": "caleydo_clustering",
- "version": "1.0.0",
- "license" : "SEE LICENSE IN LICENSE",
- "repository": "K3rn1n4tor/caleydo_clustering",
- "dependencies": {
+ "files": [
+ "phovea_clustering",
+ "__init__.py",
+ "__main__.py",
+ "build",
+ "requirements.txt",
+ "requirements_dev.txt",
+ "docker_packages.txt"
+ ],
+ "scripts": {
+ "check": "flake8",
+ "pretest": "npm run check",
+ "test": "python setup.py test",
+ "prebuild": "node -e \"process.exit(process.env.PHOVEA_SKIP_TESTS === undefined?1:0)\" || npm run test",
+ "build": "python -c \"from distutils.dir_util import mkpath ; mkpath('./build/source')\" && (tar -c ./phovea_clustering --exclude '*.pyc' | tar -xC build/source)",
+ "predist": "npm run build",
+ "dist": "python setup.py bdist_egg"
},
- "peerDependencies": {
- "caleydo_server": "*"
+ "name": "phovea_clustering",
+ "description": "",
+ "homepage": "https://phovea.caleydo.org",
+ "version": "1.0.0",
+ "author": {
+ "name": "The Caleydo Team",
+ "email": "contact@caleydo.org",
+ "url": "https://caleydo.org"
},
- "caleydo": {
- "plugins": {
- "python": [
- {
- "type": "clustering",
- "id": "caleydo-clustering-kmeans",
- "file": "clustering_kmeans"
- },
- {
- "type": "clustering",
- "id": "caleydo-clustering-hierarchical",
- "file": "clustering_hierarchical"
- },
- {
- "type": "clustering",
- "id": "caleydo-clustering-affinity",
- "file": "clustering_affinity"
- },
- {
- "type": "clustering",
- "id": "caleydo-clustering-fuzzy",
- "file": "clustering_fuzzy"
- },
- {
- "type": "namespace",
- "id": "caleydo-clustering",
- "file": "clustering_api",
- "namespace": "/api/clustering"
- }
- ]
- }
+ "license": "BSD-3-Clause",
+ "bugs": {
+ "url": "https://github.com/phovea/phovea_clustering/issues"
},
- "publishConfig": {
- "registry": "http://registry.caleydo.org/"
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/phovea/phovea_clustering.git"
}
}
diff --git a/phovea_clustering/__init__.py b/phovea_clustering/__init__.py
new file mode 100644
index 0000000..0a25671
--- /dev/null
+++ b/phovea_clustering/__init__.py
@@ -0,0 +1,36 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license
+###############################################################################
+
+
+def phovea(registry):
+ """
+ register extension points
+ :param registry:
+ """
+ # generator-phovea:begin
+ registry.append('clustering', 'caleydo-clustering-kmeans', 'phovea_clustering.clustering_kmeans', {})
+
+ registry.append('clustering', 'caleydo-clustering-hierarchical', 'phovea_clustering.clustering_hierarchical', {})
+
+ registry.append('clustering', 'caleydo-clustering-affinity', 'phovea_clustering.clustering_affinity', {})
+
+ registry.append('clustering', 'caleydo-clustering-fuzzy', 'phovea_clustering.clustering_fuzzy', {})
+
+ registry.append('namespace', 'caleydo-clustering', 'phovea_clustering.clustering_api', {
+ 'namespace': '/api/clustering'
+ })
+ # generator-phovea:end
+ pass
+
+
+def phovea_config():
+ """
+ :return: file pointer to config file
+ """
+ from os import path
+ here = path.abspath(path.dirname(__file__))
+ config_file = path.join(here, 'config.json')
+ return config_file if path.exists(config_file) else None
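+
+
+# A minimal sketch (not part of the Phovea API; MockRegistry is hypothetical)
+# of how the phovea(registry) hook above could be exercised in isolation:
+#
+# class MockRegistry(object):
+#   def __init__(self):
+#     self.entries = []
+#
+#   def append(self, plugin_type, plugin_id, module, config):
+#     self.entries.append((plugin_type, plugin_id, module, config))
+#
+# registry = MockRegistry()
+# phovea(registry)
+# assert len(registry.entries) == 5  # four 'clustering' extensions plus one 'namespace'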
diff --git a/phovea_clustering/clustering_affinity.py b/phovea_clustering/clustering_affinity.py
new file mode 100644
index 0000000..b5b3283
--- /dev/null
+++ b/phovea_clustering/clustering_affinity.py
@@ -0,0 +1,307 @@
+########################################################################################################################
+# libraries
+
+# module to load own configurations
+import phovea_server.config
+import numpy as np
+from clustering_util import similarity_measurement_matrix
+from timeit import default_timer as timer
+
+# request config if needed for the future
+config = phovea_server.config.view('caleydo-clustering')
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.1'
+__email__ = 'kernm@in.tum.de'
+
+
+########################################################################################################################
+
+class AffinityPropagation:
+ """
+ This is an implementation of the affinity propagation algorithm to cluster genomic data / matrices.
+ Implementation follows Frey & Dueck, 'Clustering by Passing Messages Between Data Points'
+ (Science, 2007), which also provides a reference Matlab implementation.
+ Returns the centroids and labels / stratification of each row belonging to one cluster.
+ """
+
+ def __init__(self, obs, damping=0.5, factor=1.0, pref_method='minimum', distance='euclidean'):
+ """
+ Initializes the algorithm.
+ :param obs: genomic data / matrix
+ :param damping: controls update process to dampen oscillations
+ :param factor: controls the preference value (influences number of clusters)
+ :param pref_method: all points are chosen equally with a given preference (median or minimum of similarity matrix)
+ :return:
+ """
+ self.__n = np.shape(obs)[0]
+ # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
+ # remove all NaNs in data
+ self.__obs = np.nan_to_num(obs)
+ # variables influencing output of clustering algorithm
+ self.__damping = damping
+ self.__factor = factor
+ self.__pref_method = pref_method
+
+ # similarity matrix
+ self.__S = np.zeros((self.__n, self.__n))
+ # availability matrix
+ self.__A = np.zeros((self.__n, self.__n))
+ # responsibility matrix
+ self.__R = np.zeros((self.__n, self.__n))
+
+ self.min_value = np.finfo(np.float).min
+
+ # self.__mx1 = np.full(self.__n, self.min_value)
+ # self.__mx2 = np.full(self.__n, self.min_value)
+
+ self.__idx = np.zeros(self.__n)
+
+ # set similarity computation
+ self.__distance = distance
+
+ self.__compute_similarity()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __call__(self):
+ """
+ Caller function for server API.
+ """
+ return self.run()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __compute_similarity(self):
+ """
+ Compute the similarity matrix from the original observation matrix and set preference of each element.
+ :return: similarity matrix
+ """
+ # compute distance matrix containing the negative sq euclidean distances -|| xi - xj ||**2
+ self.__S = -similarity_measurement_matrix(self.__obs, self.__distance)
+
+ # determine the preferences S(k,k) to control the output of clusters
+ pref = 0
+ # could be median or minimum
+ if self.__pref_method == 'median':
+ pref = float(np.median(self.__S)) * self.__factor
+ elif self.__pref_method == 'minimum':
+ pref = np.min(self.__S) * self.__factor
+ else:
+ raise AttributeError
+
+ np.fill_diagonal(self.__S, pref)
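+
+ # illustration (hypothetical numbers): if S holds similarities in [-9, -1],
+ # pref_method='minimum' sets every S(k,k) to -9 * factor, which biases the
+ # algorithm towards few exemplars; 'median' typically yields more clusters.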
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def run(self):
+ """
+ Runs the algorithm of affinity propagation. Conducts at least 100 iterations and checks if the outcome of
+ current exemplars/clusters has converged. If not, the algorithm will continue until convergence is found
+ or the maximum number of iterations (200) is reached.
+ :return:
+ """
+ max_iter = 200
+ max_conv_iter = 100
+
+ # sum all decisions for exemplars per round
+ decision_sum = np.zeros(self.__n)
+ # collect decisions for one exemplar per iteration round
+ decision_iter = np.zeros((max_conv_iter, self.__n))
+ # counter for decisions (= consider data element as exemplar in each algorithm iteration)
+ decision_counter = max_conv_iter
+ # indicates if algorithm has converged
+ is_converged = False
+
+ centroids = []
+ it = 0
+ cluster_i = []
+
+ # helpful variables (that do not need recomputation)
+ index_diag = np.arange(self.__n)
+ indices_diag = np.diag_indices_from(self.__R)
+ new_a = np.zeros((self.__n, self.__n))
+ new_r = np.zeros((self.__n, self.__n))
+
+ for it in range(1, max_iter + 1):
+
+ # ----------------------------------------------------------------------------------------------------------
+
+ # compute responsibility matrix
+ m_as = self.__A + self.__S
+
+ max_y = np.max(m_as, axis=1)
+ index_y = np.argmax(m_as, axis=1)
+
+ # mask the row maxima in the m_as matrix (set them to the smallest float value)
+ m_as[index_diag, index_y] = self.min_value
+
+ # look for second maxima
+ max_y2 = np.max(m_as, axis=1)
+
+ # perform responsibility update
+ for ii in range(self.__n):
+ # s(i, k) - max({ a(i, k') + s(i, k') })
+ new_r[ii] = self.__S[ii] - max_y[ii]
+
+ # subtract second maximum from row -> column entry with maximum value
+ new_r[index_diag, index_y] = self.__S[index_diag, index_y] - max_y2[index_diag]
+
+ # dampen values
+ # self.__R = self.__damping * self.__R + (1 - self.__damping) * new_r
+ self.__R *= self.__damping
+ self.__R += (1 - self.__damping) * new_r
+
+ # ----------------------------------------------------------------------------------------------------------
+
+ # compute availability matrix
+ # cut out negative elements
+ # TODO! slow because of copy operation
+ rp = np.maximum(self.__R, 0)
+
+ # write back all diagonal elements as self representatives
+ rp[indices_diag] = self.__R[indices_diag]
+ sum_cols = np.sum(rp, axis=0)
+
+ # apply availability update
+ new_a[:, ] = sum_cols
+ new_a -= rp
+ # for ii in range(self.__n):
+ # # r(k, k) + sum(max(0, r(i',k))
+ # new_a[:, ii] = sum_cols[ii] - Rp[:, ii]
+
+ diag_a = np.diag(new_a)
+ # take minimum of all the values in A, cut out all values above zero
+ # new_a = np.minimum(new_a, 0)
+ new_a[new_a > 0] = 0
+ new_a[indices_diag] = diag_a[index_diag]
+
+ # dampen values
+ # self.__A = self.__damping * self.__A + (1 - self.__damping) * new_a
+ self.__A *= self.__damping
+ self.__A += (1 - self.__damping) * new_a
+
+ # ----------------------------------------------------------------------------------------------------------
+
+ # find exemplars for new clusters
+ # old version which is slower
+ # E = self.__R + self.__A
+ # diag_e = np.diag(E)
+
+ # take the diagonal elements of the created matrix E
+ diag_e = np.diag(self.__R) + np.diag(self.__A)
+
+ # all elements > 0 are considered to be an appropriate exemplar for the dataset
+ cluster_i = np.argwhere(diag_e > 0).flatten()
+
+ # count the number of clusters
+ num_clusters = len(cluster_i)
+
+ # ----------------------------------------------------------------------------------------------------------
+
+ decision_counter += 1
+ if decision_counter >= max_conv_iter:
+ decision_counter = 0
+
+ # subtract outcome of previous iteration (< 100) from the total sum of the decisions
+ decision_sum -= decision_iter[decision_counter]
+
+ decision_iter[decision_counter].fill(0)
+ decision_iter[decision_counter][cluster_i] = 1
+
+ # compute sum of decisions for each element being an exemplar
+ decision_sum += decision_iter[decision_counter]
+
+ # check for convergence
+ if it >= max_conv_iter or it >= max_iter:
+ is_converged = True
+
+ for ii in range(self.__n):
+ # if an element is considered to be an exemplar in at least one iteration
+ # and total of decisions in the last 100 iterations is not 100 --> no convergence
+ if decision_sum[ii] != 0 and decision_sum[ii] != max_conv_iter:
+ is_converged = False
+ break
+
+ if is_converged and num_clusters > 0:
+ break
+
+ # --------------------------------------------------------------------------------------------------------------
+
+ # obtain centroids
+ centroids = self.__obs[cluster_i]
+
+ # find maximum columns in m_as matrix to assign elements to clusters / exemplars
+ # fill A with negative values
+ self.__A.fill(self.min_value)
+ # set values of clusters to zero (as we only want to regard these values)
+ self.__A[:, cluster_i] = 0.0
+ # fill diagonal of similarity matrix to zero (remove preferences)
+ np.fill_diagonal(self.__S, 0.0)
+
+ # compute m_as matrix
+ m_as = self.__A + self.__S
+ # since values are < 0, look for the maximum number in each row and return its column index
+ self.__idx = np.argmax(m_as, axis=1)
+
+ cluster_i = cluster_i.tolist()
+ cluster_labels = [[] for _ in range(num_clusters)]
+
+ # create labels per cluster
+ for ii in range(self.__n):
+ index = cluster_i.index(self.__idx[ii])
+ self.__idx[ii] = index
+ cluster_labels[index].append(ii)
+
+ # return sorted cluster labels (that's why we call compute cluster distances, might be redundant)
+ # for ii in range(num_clusters):
+ # cluster_labels[ii], _ = compute_cluster_intern_distances(self.__obs, cluster_labels[ii])
+
+ # if is_converged:
+ # print('Algorithm has converged after {} iterations'.format(it))
+ # else:
+ # print('Algorithm has not converged after 200 iterations')
+ #
+ # print('Number of detected clusters {}'.format(num_clusters))
+ # print('Centroids: {}'.format(centroids))
+
+ return centroids.tolist(), self.__idx.tolist(), cluster_labels
+
+
+########################################################################################################################
+
+def _plugin_initialize():
+ """
+ optional initialization method of this module, will be called once
+ :return:
+ """
+ pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, damping, factor, preference, distance):
+ """
+ By convention, this module contains a factory called create that returns the extension implementation.
+ :return:
+ """
+ return AffinityPropagation(data, damping, factor, preference, distance)
+
+
+########################################################################################################################
+
+# from timeit import default_timer as timer
+
+if __name__ == '__main__':
+ np.random.seed(200)
+ # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
+ # data = np.array([np.random.rand(8000) * 4 - 2 for _ in range(500)])
+ # data = np.array([[0.9],[1],[1.1],[10],[11],[12],[20],[21],[22]])
+ data = np.array([1, 1.1, 5, 8, 5.2, 8.3])
+
+ s = timer()
+ aff = AffinityPropagation(data, 0.9, 1.0, 'median', 'euclidean')
+ result = aff.run()
+ e = timer()
+ print(result)
+ print('time elapsed: {}'.format(e - s))
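+
+ # a further sketch (assumption: same toy data as above): the preference
+ # method influences the number of detected clusters; 'minimum' sets a lower
+ # self-similarity S(k,k) and therefore tends to yield fewer exemplars.
+ for pref in ('median', 'minimum'):
+ centroids, labels, cluster_labels = AffinityPropagation(data, 0.9, 1.0, pref, 'euclidean').run()
+ print('{}: {} clusters'.format(pref, len(cluster_labels)))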
diff --git a/phovea_clustering/clustering_api.py b/phovea_clustering/clustering_api.py
new file mode 100644
index 0000000..4aa1200
--- /dev/null
+++ b/phovea_clustering/clustering_api.py
@@ -0,0 +1,156 @@
+########################################################################################################################
+# libraries
+
+# use flask library for server activities
+from phovea_server import ns
+# load services (that are executed by the server when certain website is called)
+from clustering_service import get_cluster_distances, get_clusters_from_dendrogram, load_data, run_affinity_propagation, run_fuzzy, run_hierarchical, run_kmeans
+
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.1'
+__email__ = 'kernm@in.tum.de'
+
+# create new flask application for hosting namespace
+app = ns.Namespace(__name__)
+
+
+########################################################################################################################
+
+@app.route('/kmeans/<k>/<init_method>/<distance>/<dataset_id>')
+def kmeans_clustering(k, init_method, distance, dataset_id):
+ """
+ Access k-means clustering plugin.
+ :param k: number of clusters
+ :param init_method: initialization method for initial clusters
+ :param distance: distance measurement
+ :param dataset_id: identifier of data set
+ :return: jsonified output
+ """
+ try:
+ data = load_data(dataset_id)
+ response = run_kmeans(data, int(k), init_method, distance)
+ return ns.jsonify(response)
+ except Exception:
+ return ns.jsonify({})
+
+
+########################################################################################################################
+
+@app.route('/hierarchical/<k>/<method>/<distance>/<dataset_id>')
+def hierarchical_clustering(k, method, distance, dataset_id):
+ """
+ Access hierarchical clustering plugin.
+ :param k: number of desired clusters
+ :param method: type of single linkage
+ :param distance: distance measurement
+ :param dataset_id: identifier of data set
+ :return: jsonified output
+ """
+ try:
+ data = load_data(dataset_id)
+ response = run_hierarchical(data, int(k), method, distance)
+ return ns.jsonify(response)
+ except Exception:
+ return ns.jsonify({})
+
+
+########################################################################################################################
+
+@app.route('/affinity/<damping>/<factor>/<preference>/<distance>/<dataset_id>')
+def affinity_propagation_clustering(damping, factor, preference, distance, dataset_id):
+ """
+ Access affinity propagation clustering plugin.
+ :param damping:
+ :param factor:
+ :param preference:
+ :param distance: distance measurement
+ :param dataset_id:
+ :return:
+ """
+ try:
+ data = load_data(dataset_id)
+ response = run_affinity_propagation(data, float(damping), float(factor), preference, distance)
+ return ns.jsonify(response)
+ except Exception:
+ return ns.jsonify({})
+
+
+########################################################################################################################
+
+@app.route('/fuzzy/<num_clusters>/<m>/<threshold>/<distance>/<dataset_id>')
+def fuzzy_clustering(num_clusters, m, threshold, distance, dataset_id):
+ """
+ :param num_clusters:
+ :param m:
+ :param threshold:
+ :param distance:
+ :param dataset_id:
+ :return:
+ """
+ try:
+ data = load_data(dataset_id)
+ response = run_fuzzy(data, int(num_clusters), float(m), float(threshold), distance)
+ return ns.jsonify(response)
+ except Exception:
+ return ns.jsonify({})
+
+
+########################################################################################################################
+
+def load_attribute(json_data, attr):
+ import json
+ data = json.loads(json_data)
+ if attr in data:
+ return data[attr]
+ else:
+ return None
+
+
+########################################################################################################################
+
+@app.route('/distances/<metric>/<dataset_id>/<sorted>', methods=['POST'])
+def get_distances(metric, dataset_id, sorted):
+ """
+ Compute the distances of the current stratification values to their centroid.
+ :param metric:
+ :param dataset_id:
+ :param sorted: 'true' to sort distances in ascending order
+ :return: distances and labels sorted in ascending order
+ """
+ data = load_data(dataset_id)
+ labels = []
+ extern_labels = None
+
+ if 'group' in ns.request.values:
+ labels = load_attribute(ns.request.values['group'], 'labels')
+ extern_labels = load_attribute(ns.request.values['group'], 'externLabels')
+ else:
+ return ''
+
+ response = get_cluster_distances(data, labels, metric, extern_labels, sorted)
+ return ns.jsonify(response)
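+
+ # Example request (hypothetical dataset id), assuming the namespace is mounted
+ # at /api/clustering as registered in phovea_clustering/__init__.py:
+ # POST /api/clustering/distances/euclidean/someDatasetId/true
+ # with a form field such as
+ # group={"labels": [0, 3, 5], "externLabels": [[1, 2], [4]]}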
+
+
+########################################################################################################################
+
+@app.route('/dendrogram/<num_clusters>/<dataset_id>', methods=['POST'])
+def dendrogram_clusters(num_clusters, dataset_id):
+ data = load_data(dataset_id)
+
+ if 'group' in ns.request.values:
+ dendrogram = load_attribute(ns.request.values['group'], 'dendrogram')
+ else:
+ return ''
+
+ response = get_clusters_from_dendrogram(data, dendrogram, int(num_clusters))
+ return ns.jsonify(response)
+
+
+########################################################################################################################
+
+def create():
+ """
+ Standard Caleydo convention for creating the service when server is initialized.
+ :return: Returns implementation of this plugin with given name
+ """
+ return app
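+
+
+# Example round trip (hypothetical host and dataset id): a client may issue
+# GET /api/clustering/kmeans/3/kmeans++/euclidean/someDatasetId
+# and receive a JSON object of the form
+# {"centroids": [...], "clusterLabels": [[...], ...]}
+# as assembled by run_kmeans in clustering_service.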
diff --git a/phovea_clustering/clustering_fuzzy.py b/phovea_clustering/clustering_fuzzy.py
new file mode 100644
index 0000000..8b95f59
--- /dev/null
+++ b/phovea_clustering/clustering_fuzzy.py
@@ -0,0 +1,225 @@
+########################################################################################################################
+# libraries
+
+# module to load own configurations
+import phovea_server.config
+# library to conduct matrix/vector calculus
+import numpy as np
+from clustering_util import similarity_measurement
+
+# request config if needed for the future
+config = phovea_server.config.view('caleydo-clustering')
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.3'
+__email__ = 'kernm@in.tum.de'
+
+
+########################################################################################################################
+# class definition
+
+class Fuzzy(object):
+ """
+ Formulas: https://en.wikipedia.org/wiki/Fuzzy_clustering
+ """
+
+ def __init__(self, obs, num_clusters, m=2.0, threshold=-1, distance='euclidean', init=None, error=0.0001):
+ """
+ Initializes algorithm.
+ :param obs: observation matrix / genomic data
+ :param num_clusters: number of clusters
+ :param m: fuzzifier, controls degree of fuzziness, from [1; inf]
+ :return:
+ """
+ # observation
+ self.__obs = np.nan_to_num(obs)
+
+ self.__n = obs.shape[0]
+
+ # fuzzifier value
+ self.__m = np.float(m)
+ # number of clusters
+ self.__c = num_clusters
+
+ # matrix u containing all the weights describing the degree of membership of each patient to the centroid
+ if init is None:
+ init = np.random.rand(self.__c, self.__n)
+
+ self.__u = np.copy(init)
+
+ # TODO! scikit normalizes the values at the beginning and at each step to [0; 1]
+ self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))).astype(np.float64)
+ # remove all zero values and set them to smallest possible value
+ self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
+ # centroids
+ self.__centroids = np.zeros(self.__c)
+ # threshold for stopping criterion
+ self.__error = error
+ # distance function
+ self.__distance = distance
+
+ # threshold or minimum probability used for cluster assignments
+ if threshold == -1:
+ self.__threshold = 1.0 / num_clusters
+ else:
+ self.__threshold = threshold
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __call__(self):
+ """
+ Caller function for server API
+ :return:
+ """
+ return self.run()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def compute_centroid(self):
+ """
+ Compute the new centroids using the computed partition matrix.
+ :return:
+ """
+ u_m = self.__u ** self.__m
+
+ sum_data_weights = np.dot(u_m, self.__obs)
+ if self.__obs.ndim == 1:
+ m = 1
+ else:
+ m = self.__obs.shape[1]
+
+ sum_weights = np.sum(u_m, axis=1)
+ # tile array (sum of weights repeated in every row)
+ sum_weights = np.ones((m, 1)).dot(np.atleast_2d(sum_weights)).T
+
+ if self.__obs.ndim == 1:
+ sum_weights = sum_weights.flatten()
+
+ # divide by total sum to get new centroids
+ self.__centroids = sum_data_weights / sum_weights
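+
+ # i.e. the textbook fuzzy c-means centroid:
+ # c_k = sum_i(u_ik ** m * x_i) / sum_i(u_ik ** m)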
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def compute_coefficients(self):
+ """
+ Compute new partition matrix / weights describing the degree of membership of each patient to all clusters.
+ :return:
+ """
+
+ # TODO you can also use cdist of scipy.spatial.distance module
+ dist_mat = np.zeros((self.__c, self.__n))
+
+ for ii in range(self.__c):
+ dist_mat[ii] = similarity_measurement(self.__obs, self.__centroids[ii], self.__distance)
+
+ # set zero values to smallest values to prevent inf results
+ dist_mat = np.fmax(dist_mat, np.finfo(np.float64).eps)
+
+ # apply coefficient formula
+ denom = np.float(self.__m - 1.0)
+ self.__u = dist_mat ** (-2.0 / denom)
+
+ sum_coeffs = np.sum(self.__u, axis=0)
+
+ self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(sum_coeffs))
+ self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
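+
+ # note: dist_mat ** (-2 / (m - 1)) followed by the column normalisation above
+ # is algebraically equivalent to the textbook membership update
+ # u_ik = 1 / sum_j((d_ik / d_jk) ** (2 / (m - 1)))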
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def run(self):
+ """
+ Perform the c-means fuzzy clustering.
+ :return:
+ """
+ max_iter = 100
+ iter = 0
+
+ while iter < max_iter:
+ # save last partition matrix
+ u_old = np.copy(self.__u)
+ # compute centroids with given weights
+ self.compute_centroid()
+ # compute new coefficient matrix
+ self.compute_coefficients()
+
+ # normalize weight / partition matrix u
+ self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0)))
+ self.__u = np.fmax(self.__u, np.finfo(np.float64).eps)
+
+ # compute the difference between the old and new matrix
+ epsilon = np.linalg.norm(self.__u - u_old)
+
+ # stop if difference (epsilon) is smaller than the user-defined threshold
+ if epsilon < self.__error:
+ break
+
+ iter += 1
+
+ self.__end()
+
+ u = self.__u.T
+ # print(self.__u.T)
+
+ return self.__centroids.tolist(), self.__cluster_labels, u.tolist(), self.__threshold
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __end(self):
+ """
+ Conducts the cluster assignments and creates the cluster_labels array.
+ :return:
+ """
+ # assign patient to clusters
+ # transpose to get a (n, c) matrix
+ u = self.__u.T
+
+ self.__labels = np.zeros(self.__n, dtype=np.int)
+ self.__cluster_labels = [[] for _ in range(self.__c)]
+ # gather all probabilities / degree of memberships of each patient to the clusters
+ # self.__cluster_probs = [[] for _ in range(self.__c)]
+ # minimum degree of membership required to assign a patient to a cluster
+ max_prob = np.float64(self.__threshold)
+
+ for ii in range(self.__n):
+ # cluster_id = np.argmax(u[ii])
+ # self.__labels[ii] = cluster_id
+ # self.__cluster_labels[cluster_id].append(ii)
+
+ for jj in range(self.__c):
+ if u[ii][jj] >= max_prob:
+ cluster_id = jj
+ self.__labels[ii] = cluster_id
+ self.__cluster_labels[cluster_id].append(int(ii))
+
+ # for ii in range(self.__c):
+ # self.__cluster_labels[ii], _ = compute_cluster_intern_distances(self.__obs, self.__cluster_labels[ii])
+
+
+########################################################################################################################
+
+def _plugin_initialize():
+ """
+ optional initialization method of this module, will be called once
+ :return:
+ """
+ pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, num_cluster, m, threshold, distance):
+ """
+ By convention, this module contains a factory called create that returns the extension implementation.
+ :return:
+ """
+ return Fuzzy(data, num_cluster, m, threshold, distance)
+
+
+########################################################################################################################
+
+if __name__ == '__main__':
+ data = np.array([[1, 1, 2], [5, 4, 5], [3, 2, 2], [8, 8, 7], [9, 8, 9], [2, 2, 2]])
+ # data = np.array([1,1.1,5,8,5.2,8.3])
+
+ fuz = Fuzzy(data, 3, 1.5)
+ print(fuz.run())
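+
+ # a sketch with an explicit assignment threshold (hypothetical value 0.5):
+ # higher thresholds make the soft assignments effectively exclusive, since a
+ # row rarely reaches the required membership in more than one cluster
+ fuz_strict = Fuzzy(data, 3, 1.5, threshold=0.5)
+ print(fuz_strict.run())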
diff --git a/phovea_clustering/clustering_hierarchical.py b/phovea_clustering/clustering_hierarchical.py
new file mode 100644
index 0000000..668d77b
--- /dev/null
+++ b/phovea_clustering/clustering_hierarchical.py
@@ -0,0 +1,467 @@
+########################################################################################################################
+# libraries
+
+# module to load own configurations
+import phovea_server.config
+
+
+# library to conduct matrix/vector calculus
+import numpy as np
+# fastest distance computation by scipy
+# import scipy.spatial as spt
+
+# utility functions for clustering and creating the dendrogram trees
+from clustering_util import BinaryNode, BinaryTree
+from clustering_util import similarity_measurement_matrix
+from clustering_util import compute_cluster_intern_distances
+from clustering_util import cut_json_tree_by_clusters
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.3'
+__email__ = 'kernm@in.tum.de'
+# request config if needed for the future
+config = phovea_server.config.view('caleydo-clustering')
+
+
+########################################################################################################################
+
+class Hierarchical(object):
+ """
+ This is an implementation of hierarchical clustering on genomic data using the Lance-Williams dissimilarity update
+ to compute different distance metrics (single linkage, complete linkage, ...).
+ Lance-Williams explained in: http://arxiv.org/pdf/1105.0121.pdf
+ """
+
+ def __init__(self, obs, method='single', distance='euclidean'):
+ """
+ Initializes the algorithm
+ :param obs: genomic data / matrix
+ :param method: linkage method
+ :return:
+ """
+ # genomic data / matrix
+ # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
+ # remove all NaNs in data
+ self.__obs = np.nan_to_num(obs)
+
+ num_genes = np.shape(self.__obs)[0]
+ self.__n = num_genes
+
+ # check if dimension is 2D
+ # if self.__obs.ndim == 2:
+ # # obtain number of observations (rows)
+ # num_genes, _ = np.shape(self.__obs)
+ # self.__n = num_genes
+
+ # else:
+ # print("[Error]:\tdata / observations must be 2D. 1D observation arrays are not supported")
+ # raise AttributeError
+
+ # distance measurement
+ self.__distance = distance
+
+ # distance / proximity matrix
+ self.__d = []
+ self.__compute_proximity_matrix()
+ # dictionary mapping the string id (i,j,k,...) of clusters to corresponding index in matrix
+ self.__id_map = {}
+ # inverse mapping of id_map --> returns the string id given a certain index
+ self.__key_map = {}
+ # contains actual index of all clusters, old clusters are from [0, n - 1], new clusters have indices in range
+ # [n, 2n - 1]
+ self.__cluster_map = {}
+ for ii in range(self.__n):
+ self.__id_map[str(ii)] = ii
+ self.__key_map[ii] = str(ii)
+ self.__cluster_map[str(ii)] = ii
+
+ # linkage method for hierarchical clustering
+ self.__method = method
+
+ # internal dendrogram tree
+ self.__tree = None
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __call__(self):
+ """
+ Caller function for server API
+ :return:
+ """
+ return self.run()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ @property
+ def tree(self):
+ return self.__tree
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __get_coefficients(self, cluster_i, cluster_j):
+ """
+ Compute the coefficients for the Lance-Williams algorithm
+ :param cluster_i:
+ :param cluster_j:
+ :return:
+ """
+ # TODO! use hash map for storing numbers instead of computing them every time
+ if self.__method == 'single':
+ return 0.5, 0.5, 0, -0.5
+ elif self.__method == 'complete':
+ return 0.5, 0.5, 0, 0.5
+ elif self.__method == 'weighted':
+ return 0.5, 0.5, 0, 0
+ elif self.__method == 'median':
+ return 0.5, 0.5, -0.25, 0
+
+ # TODO! ATTENTION! average method should compute the cluster centroids using the average
+ # TODO! || cluster_i - cluster_j || ** 2
+ elif self.__method == 'average':
+ n_i = np.float(cluster_i.count(',') + 1)
+ n_j = np.float(cluster_j.count(',') + 1)
+ sum_n = n_i + n_j
+ return (n_i / sum_n), (n_j / sum_n), 0, 0
+
+ # TODO! ATTENTION! centroid method should compute the cluster centroids using the mean
+ # TODO! || cluster_i - cluster_j || ** 2
+ elif self.__method == 'centroid':
+ n_i = np.float(cluster_i.count(',') + 1)
+ n_j = np.float(cluster_j.count(',') + 1)
+ sum_n = n_i + n_j
+ return (n_i / sum_n), (n_j / sum_n), -((n_i * n_j) / (sum_n ** 2)), 0
+
+ # TODO! Support ward method
+ # TODO! (|cluster_i| * |cluster_j|) / (|cluster_i| + |cluster_j) * || cluster_i - cluster_j || ** 2
+ # elif self.__method == 'ward':
+ # n_i = np.float(cluster_i.count(',') + 1)
+ # n_j = np.float(cluster_j.count(',') + 1)
+ # n_k = np.float(cluster_k.count(',') + 1)
+ # sum_n = n_i + n_j + n_k
+ # return (n_i + n_k) / sum_n, (n_j + n_k) / sum_n, -n_k / sum_n, 0
+ else:
+ raise AttributeError
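+
+ # For reference, these coefficients feed the Lance-Williams update applied in
+ # __merge_clusters:
+ # d(k, ij) = a_i * d(k, i) + a_j * d(k, j) + b * d(i, j) + y * |d(k, i) - d(k, j)|
+ # e.g. single linkage (0.5, 0.5, 0, -0.5) reduces to min(d(k, i), d(k, j)).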
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __compute_proximity_matrix(self):
+ """
+ Compute the proximity of each observation and store the results in a nxn matrix
+ :return:
+ """
+
+ # create distance matrix of size n x n
+ self.__d = np.zeros((self.__n, self.__n))
+
+    # compute the distance matrix using the configured distance measure
+ # TODO! implement generic distance functions
+ # TODO! look for an alternative proximity analysis without computing all distances
+ self.__d = similarity_measurement_matrix(self.__obs, self.__distance)
+
+    # sentinel value guaranteed to be larger than any distance in the matrix
+    self.__max_value = self.__d.max() + 1
+
+ # fill diagonals with max value to exclude them from min dist process
+ # TODO! operate only on upper triangle matrix of distance matrix
+ np.fill_diagonal(self.__d, self.__max_value)
+
+ # print('\t-> finished.')
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __get_matrix_minimum_indices(self):
+ """
+ Searches for the minimum distance in the distance matrix
+ :return: indices of both clusters having the smallest distance
+ """
+ min_dist = self.__d.min()
+ min_list = np.argwhere(self.__d == min_dist)
+
+ min_i, min_j = 0, 0
+
+ # look for indices, where i < j
+ # TODO! for the future --> use upper triangle matrix
+ for ii in range(len(min_list)):
+ min_i, min_j = min_list[ii]
+ if min_i < min_j:
+ break
+
+    if min_i == min_j:
+      print('ERROR: no off-diagonal minimum found (min_i == min_j)')
+
+ return self.__key_map[min_i], self.__key_map[min_j], min_dist
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __delete_clusters(self, i, j):
+ """
+    Reorders and reduces the matrix to insert the new cluster formed of clusters i and j
+    and its distance values, and removes the old clusters by cutting off the last row and column.
+ :param i: cluster index i
+ :param j: cluster index j
+ :return:
+ """
+ id_i = self.__id_map[str(i)]
+ id_j = self.__id_map[str(j)]
+
+ # min_id = min(id_i, id_j)
+ max_id = max(id_i, id_j)
+
+ # now set column max ID to last column -> swap last and i column
+ last_row = self.__d[self.__n - 1]
+ self.__d[max_id] = last_row
+ self.__d[:, max_id] = self.__d[:, (self.__n - 1)]
+
+ # set key of last column (cluster) to column of the cluster with index max_id
+ key = self.__key_map[self.__n - 1]
+ self.__id_map[key] = max_id
+ self.__key_map[max_id] = key
+
+ # delete entries in id and key map --> not required anymore
+ try:
+ del self.__id_map[i]
+ del self.__id_map[j]
+ del self.__key_map[self.__n - 1]
+ except KeyError:
+ print("\nERROR: Key {} not found in id_map".format(j))
+ print("ERROR: Previous key: {} in id_map".format(i))
+ print("Given keys: ")
+ for key in self.__id_map:
+ print(key)
+ return
+
+ # reduce dimension of matrix by one column and row
+ self.__n -= 1
+ self.__d = self.__d[:-1, :-1]
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __merge_clusters(self, i, j):
+ """
+ Merges cluster i and j, computes the new ID and distances of the newly formed cluster
+ and stores required information
+ :param i: cluster index i
+ :param j: cluster index j
+ :return:
+ """
+ id_i = self.__id_map[str(i)]
+ id_j = self.__id_map[str(j)]
+
+ min_id = min(id_i, id_j)
+ max_id = max(id_i, id_j)
+
+    # use the Lance-Williams formula to compute the new linkage distances:
+    # d(k, i u j) = ai * d(k, i) + aj * d(k, j) + b * d(i, j) + y * |d(k, i) - d(k, j)|
+    dki = self.__d[:, min_id]
+    dkj = self.__d[:, max_id]
+    dij = self.__d[min_id, max_id]
+    abs_diff = np.abs(dki - dkj)
+
+    # compute coefficients
+    ai, aj, b, y = self.__get_coefficients(i, j)
+
+    new_entries = ai * dki + aj * dkj + b * dij + y * abs_diff
+ new_entries[min_id] = self.__max_value
+ new_entries[max_id] = self.__max_value
+
+ # add new column and row
+ self.__d[min_id] = new_entries
+ self.__d[:, min_id] = new_entries
+
+ id_ij = min_id
+ new_key = i + ',' + j
+ self.__id_map[new_key] = id_ij
+ self.__key_map[id_ij] = new_key
+ self.__cluster_map[new_key] = len(self.__cluster_map)
+
+ # delete old clusters
+ self.__delete_clusters(i, j)
+
+ # count number of elements
+ return new_key.count(',') + 1
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def run(self):
+ """
+ Conducts the algorithm until there's only one cluster.
+ :return:
+ """
+
+ # number of the current iteration
+ m = 0
+
+ # resulting matrix containing information Z[i,x], x=0: cluster i, x=1: cluster j, x=2: dist(i,j), x=3: num(i,j)
+ runs = self.__n - 1
+ z = np.array([[0 for _ in range(4)] for _ in range(runs)], dtype=np.float)
+
+ while m < runs:
+ m += 1
+
+ i, j, dist_ij = self.__get_matrix_minimum_indices()
+ num_ij = self.__merge_clusters(i, j)
+
+ cluster_i, cluster_j = self.__cluster_map[i], self.__cluster_map[j]
+ z[m - 1] = [int(min(cluster_i, cluster_j)), int(max(cluster_i, cluster_j)), np.float(dist_ij), int(num_ij)]
+
+ # reset number n to length of first dimension (number of genes)
+ self.__n = np.shape(self.__obs)[0]
+
+ self.__tree = self.generate_tree(z)
+ return z.tolist()
+
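+  # Hedged illustration (editor's addition, assuming the default euclidean
+  # distance): for the 1D sample [1, 1.1, 5, 8, 5.2, 8.3] used in _main below,
+  # the first linkage row is [0, 1, 0.1, 2], i.e. clusters 0 and 1 are merged
+  # at distance 0.1 into a cluster of two elements.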
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def generate_tree(self, linkage_matrix):
+ """
+ Computes the dendrogram tree for a given linkage matrix.
+ :param linkage_matrix:
+ :return:
+ """
+ self.__tree = None
+
+ tree_map = {}
+ num_trees = len(linkage_matrix)
+
+ for ii in range(num_trees):
+ entry = linkage_matrix[ii]
+ current_id = self.__n + ii
+      left_index, right_index, value = int(entry[1]), int(entry[0]), entry[2]  # entry[3] (cluster size) is not needed here
+ left = right = None
+
+ if left_index < self.__n:
+ left = BinaryNode(self.__obs[left_index].tolist(), left_index, 1, None, None)
+ else:
+ left = tree_map[left_index]
+
+ if right_index < self.__n:
+ right = BinaryNode(self.__obs[right_index].tolist(), right_index, 1, None, None)
+ else:
+ right = tree_map[right_index]
+
+ if isinstance(left, BinaryNode) and isinstance(right, BinaryNode):
+ tree_map[current_id] = BinaryTree(left, right, current_id, value)
+ elif isinstance(left, BinaryNode):
+ tree_map[current_id] = right.add_node(left, current_id, value)
+ del tree_map[right_index]
+ elif isinstance(right, BinaryNode):
+ tree_map[current_id] = left.add_node(right, current_id, value)
+ del tree_map[left_index]
+ else:
+ tree_map[current_id] = left.merge(right, current_id, value)
+ del tree_map[right_index]
+ del tree_map[left_index]
+
+ self.__tree = tree_map[num_trees + self.__n - 1]
+ return self.__tree
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+
+########################################################################################################################
+
+
+def get_clusters(k, obs, dendrogram, sorted=True):
+ """
+  First implementation to cut the dendrogram tree automatically by choosing the nodes with the
+  greatest node values, i.e. the largest merge distances to the other node / potential cluster
+  :param k: number of desired clusters
+  :param obs: set of observations
+  :param dendrogram: dendrogram tree
+  :param sorted: if True, sort each cluster's labels by their distance to the cluster centroid
+  :return: centroids, sorted cluster labels and normal label list
+ """
+ obs = np.nan_to_num(obs)
+ n = obs.shape[0]
+
+ if isinstance(dendrogram, BinaryTree):
+ cluster_labels = dendrogram.cut_tree_by_clusters(k)
+ else:
+ cluster_labels = cut_json_tree_by_clusters(dendrogram, k)
+
+ cluster_centroids = []
+ labels = np.zeros(n, dtype=np.int)
+ cluster_id = 0
+
+ for ii in range(len(cluster_labels)):
+ cluster = cluster_labels[ii]
+ sub_obs = obs[cluster]
+ cluster_centroids.append(np.mean(sub_obs, axis=0).tolist())
+
+    for index in cluster:
+      labels[index] = cluster_id
+
+ # sort labels according to their distance
+ if sorted:
+ cluster_labels[ii], _ = compute_cluster_intern_distances(obs, cluster)
+
+ cluster_id += 1
+
+ return cluster_centroids, cluster_labels, labels.tolist()
+
+
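+# Hedged usage sketch (editor's addition, not part of the original API): builds
+# a dendrogram on a tiny sample (illustrative values) and cuts it into two
+# clusters; assumes the default distance of Hierarchical.
+def _example_get_clusters():
+  demo_obs = np.array([[1.0, 2.0], [1.1, 2.1], [8.0, 9.0], [8.2, 9.1]])
+  hier = Hierarchical(demo_obs, 'complete')
+  hier.run()
+  # returns (centroids, cluster labels, flat label list)
+  return get_clusters(2, demo_obs, hier.tree)
+
+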
+########################################################################################################################
+
+def _plugin_initialize():
+ """
+ optional initialization method of this module, will be called once
+ :return:
+ """
+ pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, method, distance):
+ """
+  By convention, this module contains a factory called create returning the extension implementation.
+ :return:
+ """
+ return Hierarchical(data, method, distance)
+
+
+########################################################################################################################
+def _main():
+ from timeit import default_timer as timer
+ # from scipy.cluster.hierarchy import linkage, leaves_list
+
+ np.random.seed(200)
+ # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
+ data = np.array([1, 1.1, 5, 8, 5.2, 8.3])
+
+ time_mine = 0
+ time_theirs = 0
+
+ n = 10
+
+ for i in range(n):
+ # data = np.array([np.random.rand(6000) * 4 - 2 for _ in range(249)])
+ # import time
+ s1 = timer()
+ hier = Hierarchical(data, 'complete')
+ # s = time.time()
+ linkage_matrix = hier.run()
+ e1 = timer()
+ print(linkage_matrix)
+ tree = hier.generate_tree(linkage_matrix)
+ # print(tree.get_leaves())
+ # print(tree.jsonify())
+ # print(hier.get_clusters(3))
+ import json
+
+ json_tree = json.loads(tree.jsonify())
+ get_clusters(3, data, json_tree)
+
+ s2 = timer()
+ # linkage_matrix2 = linkage(data, 'complete')
+ # print(leaves_list(linkage_matrix2))
+ e2 = timer()
+
+ time_mine += e1 - s1
+ time_theirs += e2 - s2
+
+ # print(linkage_matrix)
+ # print(linkage_matrix2)
+ print('mine: {}'.format(time_mine / n))
+ print('theirs: {}'.format(time_theirs / n))
+
+if __name__ == '__main__':
+ _main()
diff --git a/phovea_clustering/clustering_kmeans.py b/phovea_clustering/clustering_kmeans.py
new file mode 100644
index 0000000..8cc53b0
--- /dev/null
+++ b/phovea_clustering/clustering_kmeans.py
@@ -0,0 +1,406 @@
+########################################################################################################################
+# libraries
+
+# module to load own configurations
+import phovea_server.config
+
+# numpy important to conduct matrix/vector calculus
+import numpy as np
+# creates random numbers
+import random
+
+# contains utility functions
+from clustering_util import weighted_choice, similarity_measurement
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.2'
+__email__ = 'kernm@in.tum.de'
+
+# request config if needed in the future
+config = phovea_server.config.view('caleydo-clustering')
+
+
+########################################################################################################################
+
+class KMeans:
+ """
+ This is an implementation of the k-means algorithm to cluster genomic data / matrices.
+  Returns the centroids, the labels / stratification of each row belonging to one cluster,
+  and the per-cluster lists of row labels.
+ """
+
+ def __init__(self, obs, k, init_mode='kmeans++', distance='sqeuclidean', iters=1000):
+ """
+ Initializes the algorithm with observation, number of k clusters, the initial method and
+ the maximum number of iterations.
+    Initialization method of random cluster choice can be: forgy, uniform, random, kmeans++
+ :param obs: genomic data / matrix
+ :param k: number of clusters
+ :param init_mode: initialization method
+ :param distance: distance measurement
+ :param iters: number of maximum iterations
+ :return:
+ """
+
+ # number of clusters
+ self.__k = k
+ # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
+ # remove all NaNs in data
+ self.__obs = np.nan_to_num(obs)
+ # number of observations / genes
+ self.__n = np.shape(obs)[0]
+ # maps the element ids to clusters
+ self.__label_map = np.zeros(self.__n, dtype=np.int)
+ # cluster means and number of elements
+ self.__cluster_means = np.array([obs[0] for _ in range(k)], dtype=np.float)
+ self.__cluster_nums = np.array([0 for _ in range(k)], dtype=np.int)
+ # tells if any cluster has changed or rather if any data item was moved
+ self.__changed = True
+ # number of iterations
+ self.__iters = iters
+ # initialization method
+ self.__init_mode = init_mode
+ # compare function
+ self.__distance = distance
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __call__(self):
+ """
+ Caller function for server API.
+ """
+ return self.run()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __init(self):
+ """
+ Initialize clustering with random clusters using a user-specified method
+ :return:
+ """
+    # TODO! consider initializing the k-means algorithm with Principal Component Analysis (PCA)
+ # init cluster
+ if self.__init_mode == 'forgy':
+ self.__forgy_method()
+ elif self.__init_mode == 'uniform':
+ self.__uniform_method()
+ elif self.__init_mode == 'random':
+ self.__random_method()
+ elif self.__init_mode == 'kmeans++':
+ self.__plusplus_method()
+ else:
+ raise AttributeError
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __forgy_method(self):
+ """
+ Initialization method:
+ Randomly choose k observations from the data using a uniform random distribution.
+ :return:
+ """
+ for ii in range(self.__k):
+ self.__cluster_means[ii] = (self.__obs[random.randint(0, self.__n - 1)])
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __uniform_method(self):
+ """
+ Initialization method:
+ Randomly assign each observation to one of the k clusters using uniform random distribution
+ and compute the centroids of each cluster.
+ :return:
+ """
+ for i in range(self.__n):
+ self.__label_map[i] = random.randint(0, self.__k - 1)
+
+ self.__update()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __random_method(self):
+ """
+ Initialization method:
+ Randomly choose k observations from the data by estimating the mean and standard deviation of the data and
+ using the gaussian random distribution.
+ :return:
+ """
+ mean = np.mean(self.__obs, axis=0)
+ std = np.std(self.__obs, axis=0)
+
+ for ii in range(self.__k):
+ self.__cluster_means[ii] = np.random.normal(mean, std)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __plusplus_method(self):
+ """
+ Initialization method:
+ Chooses k observations by computing probabilities for each observation and using a weighted random distribution.
+    Algorithm: k-means++. This method should accelerate the algorithm by finding
+ the appropriate clusters right at the beginning and hence should make it more robust.
+ :return:
+ """
+ # 1) choose random center out of data
+ self.__cluster_means[0] = (random.choice(self.__obs))
+
+ max_value = np.max(self.__obs) + 1
+    probs = np.full(self.__n, max_value, dtype=np.float)
+
+ for i in range(1, self.__k):
+ probs.fill(max_value)
+      # 2) compute new probabilities: take the minimum of all squared distances
+ for j in range(0, i):
+ dists = similarity_measurement(self.__obs, self.__cluster_means[j], self.__distance)
+ # collect minimum squared distances to cluster centroids
+ probs = np.minimum(probs, dists)
+
+ # sum all squared distances
+ sum_probs = np.float(np.sum(probs))
+
+ if sum_probs != 0:
+ probs /= sum_probs
+ # 3) choose new center based on probabilities
+ self.__cluster_means[i] = (self.__obs[weighted_choice(probs)])
+ else:
+ print('ERROR: cannot find enough cluster centroids for given k = ' + str(self.__k))
+
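+  # Hedged illustration (editor's addition): after the first centroid is drawn,
+  # every remaining observation is weighted by its minimum squared distance to
+  # the centroids chosen so far, so far-away points are more likely to become
+  # the next centroid.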
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def get_cluster_mean(self, num):
+ """
+ Returns the centroid of the cluster with index num.
+ :param num:
+ :return:
+ """
+ if num >= self.__k:
+ return None
+ else:
+ return self.__cluster_means[num]
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def get_cluster_of_element(self, index):
+ """
+ :param index: number of element in observation array
+ :return: cluster id of observation with given index.
+ """
+ if index >= self.__n:
+ return None
+ else:
+ return self.__label_map[index]
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def print_clusters(self):
+ """
+ Print the cluster centroids and the labels.
+ :return:
+ """
+    print('Centroids: ' + str(self.__centroids) + ' | labels: ' + str(self.__labels))
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __assignment(self):
+ """
+ Assignment step:
+ Compute distance of current observation to each cluster centroid and move gene to the nearest cluster.
+ :return:
+ """
+ for i in range(self.__n):
+ value = self.__obs[i]
+
+ # compute squared distances to each mean
+ dists = similarity_measurement(self.__cluster_means, value, self.__distance)
+      # nearest cluster
+      nearest_id = np.argmin(dists)
+
+      if self.__label_map[i] != nearest_id:
+        self.__changed = True
+        self.__label_map[i] = nearest_id
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __update(self):
+ """
+ Update step:
+ Compute the new centroids of each cluster after the assignment.
+ :return:
+ """
+ self.__cluster_means.fill(0)
+ self.__cluster_nums.fill(0)
+
+ self.__cluster_labels = [[] for _ in range(self.__k)]
+
+    for ii in range(self.__n):
+      cluster_id = self.__label_map[ii]
+      self.__cluster_labels[cluster_id].append(ii)
+      self.__cluster_nums[cluster_id] += 1
+
+ for ii in range(self.__k):
+ self.__cluster_means[ii] = np.mean(self.__obs[self.__cluster_labels[ii]], axis=0)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __end(self):
+ """
+ Writes the results to the corresponding member variables.
+ :return:
+ """
+ # returned values | have to be reinitialized in case of sequential running
+ # centroids
+ self.__centroids = np.array([self.__obs[0] for _ in range(self.__k)], dtype=np.float)
+ # labels of observations
+ self.__labels = np.array([0 for _ in range(self.__n)], dtype=np.int)
+ # distances between centroids
+ # self.__centroid_dist_mat = np.zeros((self.__k, self.__k))
+
+    # copy the cluster assignment of every observation into the label array
+    for index in range(self.__n):
+      cluster_id = self.__label_map[index]
+      self.__labels[index] = cluster_id
+
+ # collect centroids
+ for ii in range(self.__k):
+ # self.__centroids.append(self.__cluster_means[ii].tolist())
+ self.__centroids[ii] = self.__cluster_means[ii]
+
+ # compute distances between each centroids
+ # for ii in range(self.__k - 1):
+ # # compute indices of other clusters
+ # jj = range(ii + 1, self.__k)
+ # # select matrix of cluster centroids
+ # centroid_mat = self.__centroids[jj]
+ # distances = np.sqrt(self.__compare(centroid_mat, self.__centroids[ii]))
+ # self.__centroid_dist_mat[ii, jj] = distances
+ # self.__centroid_dist_mat[jj, ii] = distances
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def run(self):
+ """
+ Runs the algorithm of k-means, using the initialization method and the assignment/update step.
+    Conducts at most iters iterations and terminates if this number is exceeded or no observation
+    was moved to another cluster.
+ :return:
+ """
+ # 1) init algorithm by choosing cluster centroids
+ self.__init()
+
+ max_iters = self.__iters
+ counter = 0
+ # 2) run clustering
+ while self.__changed and counter < max_iters:
+ self.__changed = False
+
+ self.__assignment()
+ self.__update()
+
+ counter += 1
+
+ self.num_iters = counter
+
+ # write results to the class members
+ self.__end()
+ return self.__centroids.tolist(), self.__labels.tolist(), self.__cluster_labels
+ # , self.__centroid_dist_mat.tolist()
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ # def get_dists_per_centroid(self):
+ # """
+ # Compute the distances between observations belonging to one cluster and the corresponding cluster centroid.
+ # Cluster labels are sorted in ascending order using their distances
+ # :return: array of distance arrays for each cluster and ordered labels
+ # """
+ #
+ # # labels per centroid
+ # # self.__cluster_labels = [[] for _ in range(self.__k)]
+ # # distances of obs to their cluster
+ # self.__centroid_dists = [[] for _ in range(self.__k)]
+ #
+ # for ii in range(self.__k):
+ # self.__cluster_labels[ii] = np.array(self.__cluster_labels[ii], dtype=np.int)
+ #
+ # # compute euclidean distances of values to cluster mean
+ # for ii in range(self.__k):
+ # mean = self.__cluster_means[ii]
+ # obs = self.__obs[self.__cluster_labels[ii]]
+ # dists = similarity_measurement(obs, mean, self.__compare).tolist()
+ # self.__centroid_dists[ii] = dists
+ #
+ # # sort indices in ascending order using the distances
+ # indices = range(len(dists))
+ # indices.sort(key=dists.__getitem__)
+ # self.__cluster_labels[ii] = self.__cluster_labels[ii][indices].tolist()
+ # self.__centroid_dists[ii].sort()
+ #
+ # return self.__cluster_labels, self.__centroid_dists
+
+
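+# Hedged usage sketch (editor's addition, mirrors _main below; the sample
+# values are illustrative only):
+def _example_kmeans():
+  demo = np.array([1.0, 1.1, 5.0, 8.0, 5.2, 8.3])
+  k_means = KMeans(demo, 3, 'kmeans++', 'sqeuclidean', 100)
+  # returns (centroids, labels, cluster_labels)
+  return k_means.run()
+
+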
+########################################################################################################################
+
+def _plugin_initialize():
+ """
+ optional initialization method of this module, will be called once
+ :return:
+ """
+ pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, k, init_method, distance):
+ """
+  By convention, this module contains a factory called create returning the extension implementation.
+ :return:
+ """
+ return KMeans(data, k, init_method, distance)
+
+
+########################################################################################################################
+
+
+def _main():
+ from timeit import default_timer as timer
+ from scipy.cluster.vq import kmeans2
+ # np.random.seed(datetime.now())
+ # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
+ data = np.array([1, 1.1, 5, 8, 5.2, 8.3])
+
+ # data = np.array([np.random.rand(2) * 5 for _ in range(10)])
+ k = 3
+
+ time_mine = 0
+ time_theirs = 0
+ n = 10
+
+  for i in range(n):
+ s1 = timer()
+ k_means_plus = KMeans(data, k, 'kmeans++', 'sqeuclidean', 10)
+ result1 = k_means_plus.run()
+ # print(result)
+ e1 = timer()
+ # labels = k_means_plus.get_dists_per_centroid()
+ # l, d = compute_cluster_distances(data, labels[0])
+
+ s2 = timer()
+ result2 = kmeans2(data, k)
+ e2 = timer()
+
+ time_mine += e1 - s1
+ time_theirs += e2 - s2
+
+ print(result1)
+ print(result2)
+ print('mine: {}'.format(time_mine / n))
+ print('theirs: {}'.format(time_theirs / n))
+
+
+"""
+This is for testing the algorithm and comparing the results between this and scipy's implementation
+"""
+if __name__ == '__main__':
+ _main()
diff --git a/phovea_clustering/clustering_service.py b/phovea_clustering/clustering_service.py
new file mode 100644
index 0000000..c789b7b
--- /dev/null
+++ b/phovea_clustering/clustering_service.py
@@ -0,0 +1,158 @@
+import numpy as np
+from clustering_hierarchical import get_clusters
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.1'
+__email__ = 'kernm@in.tum.de'
+
+
+########################################################################################################################
+
+def load_data(dataset_id):
+ """
+ Loads the genomic data with given identifier dataset_id.
+ :param dataset_id: identifier
+ :return: array of the genomic data
+ """
+ import phovea_server.dataset as dt
+ # obtain Caleydo dataset from ID
+ dataset = dt.get(dataset_id)
+ # choose loaded attribute and load raw data in numpy format
+  # hack to get a numpy array out of the dataset
+  try:
+    arr = np.array(list(dataset.asnumpy()))
+  except Exception:
+    raise Exception('dataset {} cannot be converted to a numpy array'.format(dataset_id))
+ return arr
+
+
+########################################################################################################################
+
+def load_plugin(plugin_id, *args, **kwargs):
+ """
+ Loads the clustering plugin with given arguments.
+ :param plugin_id: identifier of plugin
+ :param *args: additional caller function arguments
+ :param **kwargs: additional arguments
+ :return: plugin
+ """
+ import phovea_server.plugin
+ # obtain all plugins with 'plugin_id' extension
+ plugins = phovea_server.plugin.list('clustering')
+ # choose plugin with given ID
+ for plugin in plugins:
+ if plugin.id == plugin_id:
+ # load the implementation of the plugin
+ return plugin.load().factory(*args, **kwargs)
+
+ raise NotImplementedError
+
+
+########################################################################################################################
+
+def run_kmeans(data, k, init_method, distance):
+ """
+ Runs the k-Means clustering algorithm given the loaded data set, the number of clusters k and the initialization
+ method.
+ :param data: observation matrix
+ :param k: number of clusters
+  :param init_method: initialization method
+  :param distance: distance measurement
+ :return: result of k-means
+ """
+ kmeans = load_plugin('caleydo-clustering-kmeans', data, k, init_method, distance)
+ # and run the kmeans extension
+ centroids, labels, cluster_labels = kmeans()
+ # cluster_labels, clusterDists = KMeans.getDistsPerCentroid()
+
+ return {'centroids': centroids, 'clusterLabels': cluster_labels}
+
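+# Hedged note (editor's addition): run_kmeans responds with e.g.
+# {'centroids': [...], 'clusterLabels': [[...], ...]}; the flat label list
+# returned by the plugin is not part of the response.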
+
+########################################################################################################################
+
+def run_hierarchical(data, k, method, distance):
+ """
+ Runs the hierarchical clustering algorithm given the loaded data set and type of linkage method.
+ :param data: observation matrix
+  :param k: number of clusters to cut the dendrogram into
+  :param method: linkage method
+  :param distance: distance measurement
+ :return: linkage matrix / dendrogram of the algorithm
+ """
+ hierarchical = load_plugin('caleydo-clustering-hierarchical', data, method, distance)
+ # and use the extension
+ hierarchical()
+ # obtain k-number of clusters
+ centroids, cluster_labels, labels = get_clusters(k, data, hierarchical.tree, False)
+
+ return {'centroids': centroids, 'clusterLabels': cluster_labels, 'dendrogram': hierarchical.tree.json()}
+ # print('\t-> creating dendrogram tree...')
+ # tree = Hierarchical.generateTree(linkage)
+ # print('\t-> creating json string ...')
+ # dendrogram = tree.jsonify()
+ # print('\t-> finished.')
+
+ # return {'dendrogram': dendrogram} --> if needed later
+
+
+########################################################################################################################
+
+def run_affinity_propagation(data, damping, factor, preference, distance):
+ """
+ Runs the affinity propagation algorithm given the loaded dataset, a damping value, a certain factor and
+ a preference method.
+ :param data:
+ :param damping:
+ :param factor:
+ :param preference:
+ :return:
+ """
+ affinity = load_plugin('caleydo-clustering-affinity', data, damping, factor, preference, distance)
+ # use this extension
+ centroids, labels, cluster_labels = affinity()
+
+ return {'centroids': centroids, 'clusterLabels': cluster_labels}
+
+
+########################################################################################################################
+
+def run_fuzzy(data, num_clusters, m, threshold, distance):
+ fuzzy = load_plugin('caleydo-clustering-fuzzy', data, num_clusters, m, threshold, distance)
+
+ centroids, cluster_labels, partition_matrix, max_prob = fuzzy()
+
+ return {'centroids': centroids, 'clusterLabels': cluster_labels, 'partitionMatrix': partition_matrix,
+ 'maxProbability': max_prob}
+
+
+########################################################################################################################
+
+def get_cluster_distances(data, labels, metric, extern_labels=None, sorted=True):
+ """
+  Compute the distances between the given rows (labels) of the data and their cluster centroid
+  :param data: genomic data
+  :param labels: indices of rows
+  :param metric: distance metric
+  :param extern_labels: labels of the other clusters (optional)
+  :param sorted: if True, sort labels and distance values in ascending order
+ :return: labels and distances values sorted in ascending order
+ """
+ from clustering_util import compute_cluster_intern_distances, compute_cluster_extern_distances
+ dist_labels, dist_values = compute_cluster_intern_distances(data, labels, sorted, metric)
+
+ if extern_labels is not None:
+ extern_dists = compute_cluster_extern_distances(data, dist_labels, extern_labels, metric)
+ return {'labels': dist_labels, 'distances': dist_values, 'externDistances': extern_dists}
+ else:
+ return {'labels': dist_labels, 'distances': dist_values}
+
+
+########################################################################################################################
+
+def get_clusters_from_dendrogram(data, dendrogram, num_clusters):
+ """
+  Cuts a previously computed dendrogram into the desired number of clusters.
+  :param data: observation matrix
+  :param dendrogram: dendrogram tree (BinaryTree or its JSON representation)
+  :param num_clusters: number of clusters
+  :return: centroids and cluster labels
+ """
+
+ centroids, cluster_labels, _ = get_clusters(num_clusters, data, dendrogram)
+ return {'centroids': centroids, 'clusterLabels': cluster_labels}
diff --git a/phovea_clustering/clustering_util.py b/phovea_clustering/clustering_util.py
new file mode 100644
index 0000000..5470538
--- /dev/null
+++ b/phovea_clustering/clustering_util.py
@@ -0,0 +1,489 @@
+########################################################################################################################
+
+import random
+import numpy as np
+
+# use scipy to compute different distance matrices
+from scipy.spatial.distance import pdist, squareform
+import scipy.stats as stats
+
+__author__ = "Michael Kern"
+__email__ = 'kernm@in.tum.de'
+
+"""
+http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python
+--> good explanation to create weighted choices / random numbers
+"""
+
+
+def weighted_choice(weights):
+ # compute sum of all weights
+ sum_total = sum(weights)
+ # compute a random with range[0, sum_total]
+ rnd = random.random() * sum_total
+
+ for index, weight in enumerate(weights):
+ # subtract current weight from random to find current index
+ rnd -= weight
+ if rnd < 0:
+ return index
+
+ # 20% faster if weights are sorted in descending order
+
+
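+# Hedged example (editor's addition): weighted_choice([0.5, 0.3, 0.2]) returns
+# index 0 with probability 0.5, index 1 with probability 0.3 and index 2 with
+# probability 0.2.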
+########################################################################################################################
+
+"""
+Implementation of an binary tree for hierarchical clustering
+"""
+
+"""
+Node of the tree containing information about it's id in data, children and the value
+"""
+
+
+class BinaryNode:
+ def __init__(self, value, id, size, left_child, right_child):
+ self.value = value
+ self.left = left_child
+ self.right = right_child
+ self.size = size
+ self.id = id
+ self.parent = None
+ self.indices = [id]
+
+ # create json info on the fly
+ self.json = {"id": self.id, "size": self.size, "value": self.value, "indices": [id]}
+ if left_child is not None and right_child is not None:
+ # self.json["value"] = np.mean(self.value)
+ self.json["children"] = [right_child.json, left_child.json]
+ self.indices = [] + right_child.indices + left_child.indices
+ self.json["indices"] = self.indices
+
+ def is_leave(self):
+ return self.left is None and self.right is None
+
+
+########################################################################################################################
+
+"""
+Implementation of an hierarchical binary tree
+"""
+
+
+class BinaryTree:
+ # this tree must not be empty and must have at least two children (leaves)
+ def __init__(self, left_node, right_node, new_id, new_value):
+ self.__create_new_root(left_node, right_node, new_id, new_value)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def add_node(self, new_node, new_id, new_value):
+ self.__create_new_root(self.root, new_node, new_id, new_value)
+ return self
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def merge(self, tree, new_id, new_value):
+ self.__create_new_root(self.root, tree.root, new_id, new_value)
+ return self
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def jsonify(self):
+ import json
+ return json.dumps(self.root.json)
+ # return self.root.json
+ # return self.__traverseJson(self.root)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def json(self):
+ return self.root.json
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def cut_tree_by_clusters(self, k):
+ queue = [self.root]
+
+ while len(queue) < k:
+ node = queue.pop(0)
+ queue.append(node.left)
+ queue.append(node.right)
+
+ def key_func(x):
+ if x.is_leave():
+ return 0
+ else:
+ return -x.value
+
+ queue.sort(key=key_func)
+
+ clusters = []
+
+ for node in queue:
+ clusters.append(node.indices)
+
+ return clusters
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __traverse_json(self, node):
+ json = {"id": node.id, "size": node.size, "value": node.value}
+ if node.left is None and node.right is None:
+ return json
+ else:
+ json["children"] = [] + [self.__traverse_json(node.left)] + [self.__traverse_json(node.right)]
+
+ return json
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def get_leaves(self):
+ return self.root.indices
+ # return self.__traverseIDs(self.root)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __traverse_ids(self, node):
+
+ if node.left is None and node.right is None:
+ return [node.id]
+ else:
+ return [] + self.__traverse_ids(node.right) + self.__traverse_ids(node.left)
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+ def __create_new_root(self, left_node, right_node, new_id, new_value):
+ new_size = left_node.size + right_node.size
+ self.root = BinaryNode(new_value, new_id, new_size, left_node, right_node)
+ left_node.parent = right_node.parent = self.root
+
+ # ------------------------------------------------------------------------------------------------------------------
+
+
+def cut_json_tree_by_clusters(json_data, k):
+ # import json
+ # tree = json.loads(json_data)
+ queue = [json_data]
+
+ while len(queue) < k:
+ node = queue.pop(0)
+ queue.append(node['children'][0])
+ queue.append(node['children'][1])
+
+ def key_func(x):
+ if 'children' not in x:
+ return 0
+ else:
+ return -x['value']
+
+ queue.sort(key=key_func)
+
+ clusters = []
+
+ for node in queue:
+ clusters.append(node['indices'])
+
+ return clusters
+
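+# Hedged note (editor's addition): each JSON node is expected to look like
+# {'id': ..., 'value': ..., 'indices': [...], 'children': [right, left]},
+# matching the dictionaries built by BinaryNode above.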
+
+########################################################################################################################
+
+def euclidean_distance(matrix, vector, squared=False):
+ """
+ Computes the euclidean distance between a vector and the rows of a matrix in parallel.
+ :param matrix: array of observations or clusters
+ :param vector: cluster centroid or observation
+ :return:
+ """
+
+ # compute distance between values in matrix and the vector
+ dist_mat = matrix - vector
+ num_values = len(matrix)
+ distances = np.array([0.0 for _ in range(num_values)], dtype=np.float)
+
+ for ii in range(num_values):
+ distance = dist_mat[ii]
+ # always try to use np.dot when computing euclidean distance
+ # it's way faster than ** 2 and sum(..., axis=1)
+ distances[ii] = np.dot(distance, distance)
+
+ if squared:
+ return distances
+ else:
+ return np.sqrt(distances)
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def correlation_distance(matrix, vector, method):
+ """
+  Computes the correlation coefficients between a vector and the rows of a matrix.
+  :param matrix:
+  :param vector:
+  :param method: one of 'pearson', 'spearman', 'kendall'
+ :return:
+ """
+
+ num_values = len(matrix)
+ distances = np.array([0.0 for _ in range(num_values)], dtype=np.float)
+
+ for ii in range(num_values):
+ value = matrix[ii]
+
+ if method == 'pearson':
+ distances[ii], _ = stats.pearsonr(value, vector)
+ elif method == 'spearman':
+ distances[ii], _ = stats.spearmanr(value, vector)
+ elif method == 'kendall':
+ distances[ii], _ = stats.kendalltau(value, vector)
+ else:
+ raise AttributeError
+
+ return distances
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+
+def similarity_measurement(matrix, vector, method='euclidean'):
+ from scipy.spatial.distance import cdist
+
+ if method == 'euclidean':
+ return euclidean_distance(matrix, vector)
+
+ if method == 'sqeuclidean':
+ return euclidean_distance(matrix, vector, True)
+
+ spatial_methods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis', ]
+
+ if method in spatial_methods:
+ return np.nan_to_num(cdist(matrix, np.atleast_2d(vector), method).flatten())
+
+ corr_methods = ['spearman', 'pearson', 'kendall']
+
+ if method in corr_methods:
+ return correlation_distance(matrix, vector, method)
+
+ raise AttributeError
+
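+# Hedged example (editor's addition):
+# similarity_measurement(np.array([[1, 1], [4, 5]]), np.array([1, 1]))
+# --> array([0., 5.]), the euclidean distance of each row to the vector.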
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def euclidean_distance_matrix(matrix, squared=False):
+ """
+  Compute the euclidean distance matrix required for the clustering algorithms
+  :param matrix:
+  :param squared: if True, return squared euclidean distances
+ :return:
+ """
+
+ n = np.shape(matrix)[0]
+ dist_mat = np.zeros((n, n))
+
+ # use Gram matrix and compute distances without inner products | FASTER than row-by-row method
+  # Gram matrix to compute the dot products of each pair of elements:
+ gram_mat = np.zeros((n, n))
+ for ii in range(n):
+ for jj in range(ii, n):
+ gram_mat[ii, jj] = np.dot(matrix[ii], matrix[jj])
+
+ # # ! This is slower than computing dot products of rows manually in python
+ # # ! And we only require the upper triangle matrix of the Gram matrix
+ # gram_mat = np.dot(self.__obs, self.__obs.T)
+
+ # make use of formula |a - b|^2 = a^2 - 2ab + b^2
+ for ii in range(n):
+ # self.__d[ii, ii] = self.__maxValue
+ jj = np.arange(ii + 1, n)
+ dist_mat[ii, jj] = gram_mat[ii, ii] - 2 * gram_mat[ii, jj] + gram_mat[jj, jj]
+ dist_mat[jj, ii] = dist_mat[ii, jj]
+
+ # # take square root of distances to compute real euclidean distance
+ # dist_mat = np.sqrt(dist_mat)
+
+  # alternative version --> use scipy's fast euclidean distance implementation: FASTEST
+ # dist_mat = spt.distance.pdist(self.__obs, 'euclidean')
+ # self.__d = spt.distance.squareform(dist_mat)
+ # print(dist_mat)
+
+ if squared:
+ return dist_mat
+ else:
+ return np.sqrt(dist_mat)
+
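+# Hedged check (editor's addition): for the rows a = [0, 0] and b = [3, 4] the
+# Gram entries are a.a = 0, a.b = 0 and b.b = 25, so |a - b|^2 = 0 - 2 * 0 + 25
+# and euclidean_distance_matrix(np.array([[0, 0], [3, 4]]))[0, 1] == 5.0.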
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def norm1_distance(matrix, vector):
+ """
+  Computes the averaged norm-1 (mean absolute) distance between a vector and the rows of a matrix in parallel.
+ :param matrix: array of observations or clusters
+ :param vector: cluster centroid or observation
+ :return:
+ """
+ dist_mat = np.abs(matrix - vector)
+ num_values = len(vector)
+
+ distances = np.sum(dist_mat, axis=1) / num_values
+ return distances
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def pearson_correlation_matrix(matrix):
+ """
+  Computes a distance matrix based on Pearson correlation coefficients (1 - corr).
+  :param matrix:
+ :return:
+ """
+  # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
+ dist_mat = 1 - np.corrcoef(matrix)
+
+ return dist_mat
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def stats_correlation_matrix(matrix, method):
+ if method == 'pearson':
+ return pearson_correlation_matrix(matrix)
+
+ n = np.shape(matrix)[0]
+ dist_mat = np.zeros((n, n))
+
+ for ii in range(n):
+ row_i = matrix[ii]
+ for jj in range(ii + 1, n):
+ row_j = matrix[jj]
+ corr = 0
+
+ if method == 'spearman':
+ corr, _ = stats.spearmanr(row_i, row_j)
+
+ if method == 'kendall':
+ corr, _ = stats.kendalltau(row_i, row_j)
+
+      # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
+ corr = 1 - corr
+
+ dist_mat[ii, jj] = corr
+ dist_mat[jj, ii] = corr
+
+ return dist_mat
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def similarity_measurement_matrix(matrix, method):
+ """
+ Generic function to determine the similarity measurement for clustering
+ :param matrix:
+ :param method:
+ :return:
+ """
+ if method == 'euclidean':
+ return euclidean_distance_matrix(matrix)
+ # return squareform(pdist(matrix, method))
+
+ if method == 'sqeuclidean':
+ return euclidean_distance_matrix(matrix, True)
+ # return squareform(pdist(matrix, method))
+
+ spatial_methods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis']
+
+ if method in spatial_methods:
+ return squareform(np.nan_to_num(pdist(matrix, method)))
+
+ corr_methods = ['spearman', 'pearson', 'kendall']
+
+ if method in corr_methods:
+ return stats_correlation_matrix(matrix, method)
+
+ raise AttributeError
+
+
+########################################################################################################################
+# utility functions to compute distances between rows and cluster centroids
+
+def compute_cluster_intern_distances(matrix, labels, sorted=True, metric='euclidean'):
+ """
+ Computes the distances of each element in one cluster to the cluster's centroid. Returns distance values and labels
+ sorted in ascending order.
+ :param matrix:
+  :param labels:
+  :param sorted: if True, sort labels and distances in ascending order
+  :param metric: distance metric
+ :return: labels / indices of elements corresponding to distance array, distance values of cluster
+ """
+ cluster_labels = np.array(labels)
+ if len(cluster_labels) == 0:
+ return [], []
+
+ sub_matrix = matrix[cluster_labels]
+ # compute centroid of cluster along column (as we want to average each gene separately)
+ centroid = np.mean(sub_matrix, axis=0)
+
+ # compute distances to centroid
+ dists = similarity_measurement(sub_matrix, centroid, metric)
+
+  if sorted:
+    # sort values
+    indices = list(range(len(dists)))
+    indices.sort(key=dists.__getitem__)
+ dists.sort()
+
+ # reverse order if correlation coefficient is used
+ # (1 means perfect correlation while -1 denotes opposite correlation)
+ corr_metrics = ['pearson', 'spearman', 'kendall']
+ if metric in corr_metrics:
+ indices.reverse()
+ dists = dists[::-1]
+
+ # write back to our arrays
+ dist_labels = cluster_labels[indices].tolist()
+ dist_values = dists.tolist()
+ else:
+ dist_labels = cluster_labels.tolist()
+ dist_values = dists.tolist()
+
+ return dist_labels, dist_values
+
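+# Hedged example (editor's addition): for matrix rows [[0], [1], [5]] and
+# labels [0, 1], the centroid is [0.5], both euclidean distances are 0.5 and
+# the labels are returned in their original order.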
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def compute_cluster_extern_distances(matrix, labels, outer_labels, metric='euclidean'):
+ """
+ Compute the distances of patients in one cluster to the centroids of all other clusters.
+ :param matrix:
+ :param labels:
+ :param outer_labels:
+ :return:
+ """
+ extern_dists = []
+ intern_sub_matrix = matrix[labels]
+
+ for extern_labels in outer_labels:
+
+    if len(extern_labels) == 0:
+      extern_dists.append([])
+      continue
+
+ # compute centroid of external cluster
+ sub_matrix = matrix[extern_labels]
+ centroid = np.mean(sub_matrix, axis=0)
+
+ dists = similarity_measurement(intern_sub_matrix, centroid, metric)
+ extern_dists.append(dists.tolist())
+
+ return extern_dists
+
+
+########################################################################################################################
+
+if __name__ == '__main__':
+ from scipy.spatial.distance import cdist
+ print(cdist([[1, 1, 1], [3, 3, 3], [5, 5, 5]], np.atleast_2d([2, 2, 2]), 'sqeuclidean').flatten())
+
+ from scipy.stats import spearmanr
+
+ print(spearmanr([1, 2, 3], [2, 4, 1]))
diff --git a/phovea_clustering/config.json b/phovea_clustering/config.json
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/phovea_clustering/config.json
@@ -0,0 +1 @@
+{}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f74d796
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+-e git+https://github.com/phovea/phovea_server.git#egg=phovea_server
\ No newline at end of file
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 0000000..0e2902f
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,4 @@
+flake8==3.0.4
+pep8-naming==0.4.1
+pytest==3.0.3
+pytest-runner==2.9
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..5519f85
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,14 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license
+###############################################################################
+
+[bdist_wheel]
+# This flag says that the code is written to work on both Python 2 and Python
+# 3. If at all possible, it is good practice to do this. If you cannot, you
+# will need to generate wheels for each Python version that you support.
+universal=1
+
+[aliases]
+test=pytest
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..2f7a0a1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,81 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license
+###############################################################################
+from __future__ import with_statement, print_function
+from setuptools import setup
+from codecs import open
+from os import path
+
+here = path.abspath(path.dirname(__file__))
+
+
+def read_it(name):
+ with open(path.join(here, name), encoding='utf-8') as f:
+ return f.read()
+
+
+# read package.json information
+with open(path.join(here, 'package.json'), encoding='utf-8') as json_data:
+ import json
+
+ pkg = json.load(json_data)
+
+
+def packaged(*files):
+ r = {}
+ global pkg
+ r[pkg['name'].encode('ascii')] = list(files)
+ return r
+
+
+setup(
+ name=pkg['name'],
+ version=pkg['version'],
+ description=pkg['description'],
+ long_description=read_it('README.md'),
+ keywords=pkg.get('keywords', ''),
+ author=pkg['author']['name'],
+ author_email=pkg['author']['email'],
+ license=pkg['license'],
+ zip_safe=False,
+
+ entry_points={
+ 'phovea.registry': ['{0} = {0}:phovea'.format(pkg['name'])],
+ 'phovea.config': ['{0} = {0}:phovea_config'.format(pkg['name'])]
+ },
+
+ # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
+ classifiers=[
+ 'Intended Audience :: Developers',
+ 'Operating System :: OS Independent',
+ # Pick your license as you wish (should match "license" above)
+ 'License :: OSI Approved :: ' + pkg['license'],
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3.4'
+ ],
+
+ # You can just specify the packages manually here if your project is
+ # simple. Or you can use find_packages().
+ py_modules=[pkg['name']],
+
+ # List run-time dependencies here. These will be installed by pip when
+ # your project is installed. For an analysis of "install_requires" vs pip's
+ # requirements files see:
+ # https://packaging.python.org/en/latest/requirements.html
+ install_requires=[r for r in read_it('requirements.txt').split('\n') if not r.startswith('-e git+https://')],
+ tests_require=read_it('requirements_dev.txt').split('\n'),
+
+ # If there are data files included in your packages that need to be
+ # installed, specify them here. If using Python 2.6 or less, then these
+ # have to be included in MANIFEST.in as well.
+ package_data=packaged('config.json'),
+
+ # Although 'package_data' is the preferred approach, in some case you may
+ # need to place data files outside of your packages. See:
+ # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
+ # In this case, 'data_file' will be installed into '/my_data'
+ data_files=[] # [('my_data', ['data/data_file'])],
+)
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
new file mode 100644
index 0000000..4b8fe97
--- /dev/null
+++ b/tests/test_dummy.py
@@ -0,0 +1,4 @@
+
+
+def test_dummy():
+ assert 1 == 1
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..f2734b2
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,28 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license
+###############################################################################
+
+[tox]
+envlist = py{27,34}
+
+[testenv]
+basepython =
+ py27: python2.7
+ py34: python3.4
+deps =
+ flake8
+ pytest
+commands =
+ check-manifest --ignore tox.ini,tests*
+ python setup.py check -m -r -s
+ flake8 .
+ py.test tests
+
+[flake8]
+ignore=E111,E114,E501
+exclude = .tox,*.egg,build,data,.git,__pycache__,docs,node_modules
+
+[pytest]
+testpaths = tests