diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..c2cdfb8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# EditorConfig helps developers define and maintain consistent +# coding styles between different editors and IDEs +# editorconfig.org + +root = true + + +[*] + +# Change these settings to your own preference +indent_style = space +indent_size = 2 + +# We recommend you keep these unchanged +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..833eaab --- /dev/null +++ b/.gitattributes @@ -0,0 +1,126 @@ +# These settings are for any web project + +# Handle line endings automatically for files detected as text +# and leave all files detected as binary untouched. +* text=auto + +# +# The above will handle all files NOT found below +# + +# +## These files are text and should be normalized (Convert crlf => lf) +# + +# source code +*.php text +*.css text +*.sass text +*.scss text +*.less text +*.styl text +*.js text +*.ts text +*.coffee text +*.json text +*.htm text +*.html text +*.xml text +*.txt text +*.ini text +*.inc text +*.pl text +*.rb text +*.py text +*.scm text +*.sql text +*.sh text eol=lf +*.bat text + +# templates +*.hbt text +*.jade text +*.haml text +*.hbs text +*.dot text +*.tmpl text +*.phtml text + +# server config +.htaccess text + +# git config +.gitattributes text +.gitignore text + +# code analysis config +.jshintrc text +.jscsrc text +.jshintignore text +.csslintrc text + +# misc config +*.yaml text +*.yml text +.editorconfig text + +# build config +*.npmignore text +*.bowerrc text + +# Heroku +Procfile text +.slugignore text + +# Documentation +*.md text +LICENSE text +AUTHORS text + + +# +## These files are binary and should be left untouched +# + +# (binary is a macro for -text -diff) +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.ico binary +*.mov binary +*.mp4 binary +*.mp3 binary +*.flv binary +*.fla binary +*.swf binary +*.gz binary +*.zip binary +*.7z binary +*.ttf binary +*.pyc binary +*.pdf binary + +# Source files +# ============ +*.pxd text +*.py text +*.py3 text +*.pyw text +*.pyx text +*.sh text eol=lf +*.json text + +# Binary files +# ============ +*.db binary +*.p binary +*.pkl binary +*.pyc binary +*.pyd binary +*.pyo binary + +# Note: .db, .p, and .pkl files are associated +# with the python modules ``pickle``, ``dbm.*``, +# ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` +# (among others).
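The two config files above pin down the project's whitespace and line-ending conventions: two-space indents, LF line endings, UTF-8, a final newline, and trimmed trailing whitespace (Markdown excepted). A minimal sketch of a checker that enforces the same rules, using nothing beyond the Python standard library; the script and its name are illustrative, not part of this repository:

```
# check_conventions.py -- hypothetical helper, not part of this repo.
# Verifies the conventions declared in .editorconfig above.
import sys

def check_file(path):
    violations = []
    with open(path, 'rb') as f:
        raw = f.read()
    if b'\r\n' in raw:
        violations.append('CRLF line endings (expected LF)')
    if raw and not raw.endswith(b'\n'):
        violations.append('missing final newline')
    if not path.endswith('.md'):  # .md keeps trailing whitespace
        for i, line in enumerate(raw.split(b'\n'), 1):
            if line != line.rstrip():
                violations.append('trailing whitespace on line {}'.format(i))
                break
    try:
        raw.decode('utf-8')
    except UnicodeDecodeError:
        violations.append('not valid UTF-8')
    return violations

if __name__ == '__main__':
    for p in sys.argv[1:]:
        for v in check_file(p):
            print('{}: {}'.format(p, v))
```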
\ No newline at end of file diff --git a/.gitignore b/.gitignore index 0d20b64..a00b0c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,19 @@ +# general things to ignore +/.tscache +/.idea +/build/ +/dist/ +*.egg-info/ +*.egg +*.py[cod] +__pycache__/ +*.so +*~ +*.log +*.pot *.pyc +*.swp +*.lock +# due to using tox and pytest +.tox +.cache diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..80a94cb --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +language: python +sudo: required +env: + - TOXENV=py27 + - TOXENV=py34 + +install: + - (!(test -f docker_packages.txt) || (cat docker_packages.txt | xargs sudo apt-get install -y)) + - pip install -r requirements_dev.txt + - pip install -r requirements.txt + +script: npm run dist + +deploy: + provider: releases + api_key: + secure: TK9/P34Bi3WuppiDrBCwVcn41yCBwmILaU8hXTBzUPbT7TbeFIwsC6/4CtH85Z+ZrUve4S5pTmWRNf2dQDxWw3uYu7+bJuemV2J1LHG76mognj+TNEiYxfLQUt3Gql4W7C7FcI4Rlx5/uMN9wY1wro8TWUBMwT6jjSrUWIvK3GXoojd5bHvJx07XpjWl9wCon4D0ruZiFoM2mdeP23lbc2GckETi32oEKswnQXxkMACmxbPzoWbvkxH4aK8Bt2Rj2sl2TbPhVkN6DAkHGkGAvLI+2/aRfG27+oo3OKsaDjbuGABct8TfZccJ970CbQ8kbnCjYxstvqkg1JWjF0W67sX/flBZZOEUA5l0OLWo6HqMGMxm7/lEQhIdPMsRmvXL+HVOxkMrB2dda58QzxVwiZp+rRqUaeabPZp8Kl5xodGrVxsBvxe6zAbJ5jCtCSumG6+kLyKI00/kYlghqQNrgUw0ZsYJlQ34h3lo/24QpaeyDpQoCkGWQgtgqiXGpeKSu7bCnOqIqAy3nbT9Utwj7K8gIasTG5idosEAz/THMampNbGDuyxxc340sYGNMg9Bhm1g2ILWRdtV470p5hwBtIDTKi3/PAizEO26+Wh0zI47Sg3ao57avcbCsTmzbZUeA5J4bojmchhJCHX8su9cSCGh/2fJA/1eBIgEvOQ8LNE= + file_glob: true + file: dist/phovea_clustering*.egg + on: + tags: true + +notifications: + slack: + secure: E8/1UIdHSczUbN+6i6gd1d5LM4vmLdwLQ30tpyjvnM0wvfDce76oPxLJAy240WJ5ybXRZUtNrttpVpt4tEXCy8aLFCmxD7s77rVloH+q1J8R/ptTFWZGhFGEujk1awEmVbzcWxJkV9/JENQaeGBKxwv8/EQwWwEkAb7p/+AJb9owmH88b3wUZUGHBWtbMiyyaF4Rm1Wg1stJB8Z1Ga7PRF4cqufTgcDdsCPVv9gAY+VxOIGqX/Vfuc9UWpUH8vq8lHUE7Inn5QS78kuFfSgLWga3H6Mu/Gko1XNlWk0QWWQBUvEZ6ZC6Wuo68KzvUjJHDTnx8WyfHue2JNHIslcX+eJq2WHLeEgM24VeNkILCGo/H/60NGHiSjrIv/Y9h6bQ9FDjo6TUyE4nbdPYN1RN9FQ5UbI9Y4Gi753H9mqnHWlEywBOzHxdZCAuz9Wh03CCF/blsvJ+Obbyo6Jrfe+g44jyi9kQdBNQ78qG6v4EXws8FiYao6x3PpgIwFix42Cpr+soAh5FpA3C1zHSAyZZpXF65/lrDl5yPNofK7Wy0B9bw+0I6Z/u7ZKFNVZXvYPGYvtUVcsALGBdmYc61+LCta36Po0KZseWVAlJj6QnOJDYzv0wvV/zsuf9A5KpYFGiqV9Q7zmtiO5FYF5sBy+lE7O9tHVO4O18IRndhRQgxhs= + on_success: change + on_failure: always diff --git a/.yo-rc.json b/.yo-rc.json new file mode 100644 index 0000000..308c8c3 --- /dev/null +++ b/.yo-rc.json @@ -0,0 +1,52 @@ +{ + "generator-phovea": { + "type": "slib", + "name": "phovea_clustering", + "author": "The Caleydo Team", + "githubAccount": "phovea", + "modules": [ + "phovea_server" + ], + "extensions": [], + "sextensions": [ + { + "type": "clustering", + "id": "caleydo-clustering-kmeans", + "module": "clustering_kmeans", + "extras": {} + }, + { + "type": "clustering", + "id": "caleydo-clustering-hierarchical", + "module": "clustering_hierarchical", + "extras": {} + }, + { + "type": "clustering", + "id": "caleydo-clustering-affinity", + "module": "clustering_affinity", + "extras": {} + }, + { + "type": "clustering", + "id": "caleydo-clustering-fuzzy", + "module": "clustering_fuzzy", + "extras": {} + }, + { + "type": "namespace", + "id": "caleydo-clustering", + "module": "clustering_api", + "extras": { + "namespace": "/api/clustering" + } + } + ], + "libraries": [], + "unknown": { + "requirements": [], + "dockerPackages": [] + }, + "today": "Fri, 23 Dec 2016 03:02:24 GMT" + } +} \ No newline at end of file diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md new file mode 100644 index 
0000000..bdb156f --- /dev/null +++ b/ISSUE_TEMPLATE.md @@ -0,0 +1,17 @@ +* Release number or git hash: +* Web browser version and OS: +* Environment (local or deployed): + +### Steps to reproduce + +1. +2. + +### Observed behavior +* Any unexpected output or action (or lack of expected output or action) +* Web browser console errors (including tracebacks) +* Server errors (relevant messages and tracebacks) +* Static or animated images showing the UI behavior + +### Expected behavior +* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fd6f461 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2016, The Caleydo Team +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + + Neither the name of the Caleydo Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 1a3242f..97caba2 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,44 @@ -Caleydo Clustering -================== - -This repository is a server-side plugin for Caleydo to apply clustering algorithms on arbitrary matrices. - -Supported Algorithms: --------------------- -- K-Means (Init methods: Forgy, Uniform, Random, Kmeans++) -- Hierarchical Clustering (Single, Complete, Weighted, Median) -- Affinity Propagation -- Fuzzy Clustering -- Various Distance Measurements(Euclidean, Chebyshef, Manhattan, Pearson, Spearman, ...) 
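The removed README above enumerates the supported distance measures ("Chebyshef" is the Chebyshev distance). Most of them map directly onto SciPy metric names; a hedged sketch of the correspondence, assuming SciPy is available — the plugin itself implements these measures in its own clustering_util module, so this is only a reference point:

```
# Sketch: the listed distance measures expressed via SciPy (assumed mapping).
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

X = np.random.rand(6, 4)          # 6 observations (rows), 4 conditions

print(pdist(X, 'euclidean'))      # Euclidean
print(pdist(X, 'chebyshev'))      # Chebyshev
print(pdist(X, 'cityblock'))      # Manhattan
print(1 - np.corrcoef(X))         # Pearson correlation as dissimilarity
rho, _ = spearmanr(X.T)           # Spearman rank correlation between rows
print(1 - rho)
```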
- -General Information: -------------------- -- All these algorithms expect the input as matrix (or 2D numpy array) -- It is assumed that the matrix is dense (no sparse matrix support right now) -- NaN values will be converted to zero - -Future Work: +phovea_clustering [![Phovea][phovea-image]][phovea-url] [![NPM version][npm-image]][npm-url] [![Build Status][travis-image]][travis-url] [![Dependency Status][daviddm-image]][daviddm-url] +===================== + + + +Installation ------------ -- Improvement of algorithms -- Combination of several clustering results + +``` +git clone https://github.com/phovea/phovea_clustering.git +cd phovea_clustering +npm install +``` + +Testing +------- + +``` +npm test +``` + +Building +-------- + +``` +npm run build +``` + + + +*** + + +This repository is part of **[Phovea](http://phovea.caleydo.org/)**, a platform for developing web-based visualization applications. For tutorials, API docs, and more information about the build and deployment process, see the [documentation page](http://caleydo.org/documentation/). + + +[phovea-image]: https://img.shields.io/badge/Phovea-Server%20Plugin-10ACDF.svg +[phovea-url]: https://phovea.caleydo.org +[npm-image]: https://badge.fury.io/js/phovea_clustering.svg +[npm-url]: https://npmjs.org/package/phovea_clustering +[travis-image]: https://travis-ci.org/phovea/phovea_clustering.svg?branch=master +[travis-url]: https://travis-ci.org/phovea/phovea_clustering +[daviddm-image]: https://david-dm.org/phovea/phovea_clustering.svg?theme=shields.io +[daviddm-url]: https://david-dm.org/phovea/phovea_clustering diff --git a/__init__.py b/__init__.py deleted file mode 100644 index f425e42..0000000 --- a/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '1.0.0' -__email__ = 'kernm@in.tum.de' diff --git a/clustering_affinity.py b/clustering_affinity.py deleted file mode 100644 index 80a4963..0000000 --- a/clustering_affinity.py +++ /dev/null @@ -1,304 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.1' -__email__ = 'kernm@in.tum.de' - -######################################################################################################################## -# libraries - -# module to load own configurations -import caleydo_server.config -# request config if needed for the future -config = caleydo_server.config.view('caleydo-clustering') - -import numpy as np -from clustering_util import similarityMeasurementMatrix -from timeit import default_timer as timer - -######################################################################################################################## - -class AffinityPropagation: - """ - This is an implementation of the affinity propagation algorithm to cluster genomic data / matrices. - Implementation details: . - Matlab implementation: - Returns the centroids and labels / stratification of each row belonging to one cluster. - """ - - def __init__(self, obs, damping=0.5, factor=1.0, prefMethod='minimum', distance='euclidean'): - """ - Initializes the algorithm. 
- :param obs: genomic data / matrix - :param damping: controls update process to dampen oscillations - :param factor: controls the preference value (influences number of clusters) - :param prefMethod: all points are chosen equally with a given preference (median or minimum of similarity matrix) - :return: - """ - self.__n = np.shape(obs)[0] - # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns - # remove all NaNs in data - self.__obs = np.nan_to_num(obs) - # variables influencing output of clustering algorithm - self.__damping = damping - self.__factor = factor - self.__prevMethod = prefMethod - - # similarity matrix - self.__S = np.zeros((self.__n, self.__n)) - # availability matrix - self.__A = np.zeros((self.__n, self.__n)) - # responsibility matrix - self.__R = np.zeros((self.__n, self.__n)) - - self.minValue = np.finfo(np.float).min - - # self.__mx1 = np.full(self.__n, self.minValue) - # self.__mx2 = np.full(self.__n, self.minValue) - - self.__idx = np.zeros(self.__n) - - # set similarity computation - self.__distance = distance - - self.__computeSimilarity() - - # ------------------------------------------------------------------------------------------------------------------ - - def __call__(self): - """ - Caller function for server API. - """ - return self.run() - - # ------------------------------------------------------------------------------------------------------------------ - - def __computeSimilarity(self): - """ - Compute the similarity matrix from the original observation matrix and set preference of each element. - :return: Similarity matrix - """ - # compute distance matrix containing the negative sq euclidean distances -|| xi - xj ||**2 - self.__S = -similarityMeasurementMatrix(self.__obs, self.__distance) - - # determine the preferences S(k,k) to control the output of clusters - pref = 0 - # could be median or minimum - if self.__prevMethod == 'median': - pref = float(np.median(self.__S)) * self.__factor - elif self.__prevMethod == 'minimum': - pref = np.min(self.__S) * self.__factor - else: - raise AttributeError - - np.fill_diagonal(self.__S, pref) - - # ------------------------------------------------------------------------------------------------------------------ - - def run(self): - """ - Runs the algorithm of affinity propagation. Conducts at least 100 iterations and checks if the outcome of - current exemplars/clusters has converged. If not, the algorithm will continue until convergence is found - or the maximum number of iterations (200) is reached. 
- :return: - """ - maxIter = 200 - maxConvIter = 100 - - # sum all decisions for exemplars per round - decisionSum = np.zeros(self.__n) - # collect decisions for one exemplar per iteration round - decisionIter = np.zeros((maxConvIter, self.__n)) - # counter for decisions (= consider data element as exemplar in each algorithm iteration) - decisionCounter = maxConvIter - # indicates if algorithm has converged - isConverged = False - - centroids = [] - it = 0 - clusterI = [] - - # helpful variables (that do not need recomputation) - indexDiag = np.arange(self.__n) - indicesDiag = np.diag_indices_from(self.__R) - newA = np.zeros((self.__n, self.__n)) - newR = np.zeros((self.__n, self.__n)) - - for it in range(1, maxIter + 1): - - # ---------------------------------------------------------------------------------------------------------- - - # compute responsibility matrix - AS = self.__A + self.__S - - maxY = np.max(AS, axis=1) - indexY = np.argmax(AS, axis=1) - - # set values of maxima to zero in AS matrix - AS[indexDiag, indexY] = self.minValue - - # look for second maxima - maxY2 = np.max(AS, axis=1) - - # perform responsibility update - for ii in range(self.__n): - # s(i, k) - max({ a(i, k') + s(i, k') }) - newR[ii] = self.__S[ii] - maxY[ii] - - # subtract second maximum from row -> column entry with maximum value - newR[indexDiag, indexY] = self.__S[indexDiag, indexY] - maxY2[indexDiag] - - # dampen values - # self.__R = self.__damping * self.__R + (1 - self.__damping) * newR - self.__R *= self.__damping - self.__R += (1 - self.__damping) * newR - - # ---------------------------------------------------------------------------------------------------------- - - # compute availability matrix - # cut out negative elements - # TODO! slow because of copy operation - Rp = np.maximum(self.__R, 0) - - # write back all diagonal elements als self representatives - Rp[indicesDiag] = self.__R[indicesDiag] - sumCols = np.sum(Rp, axis=0) - - # apply availability update - newA[:,] = sumCols - newA -= Rp - # for ii in range(self.__n): - # # r(k, k) + sum(max(0, r(i',k)) - # newA[:, ii] = sumCols[ii] - Rp[:, ii] - - diagA = np.diag(newA) - # take minimum of all the values in A, cut out all values above zero - # newA = np.minimum(newA, 0) - newA[newA > 0] = 0 - newA[indicesDiag] = diagA[indexDiag] - - # dampen values - # self.__A = self.__damping * self.__A + (1 - self.__damping) * newA - self.__A *= self.__damping - self.__A += (1 - self.__damping) * newA - - # ---------------------------------------------------------------------------------------------------------- - - # find exemplars for new clusters - # old version which is slower - # E = self.__R + self.__A - # diagE = np.diag(E) - - # take the diagonal elements of the create matrix E - diagE = np.diag(self.__R) + np.diag(self.__A) - - # all elements > 0 are considered to be an appropriate exemplar for the dataset - clusterI = np.argwhere(diagE > 0).flatten() - - # count the number of clusters - numClusters = len(clusterI) - - # ---------------------------------------------------------------------------------------------------------- - - decisionCounter += 1 - if decisionCounter >= maxConvIter: - decisionCounter = 0 - - # subtract outcome of previous iteration (< 100) from the total sum of the decisions - decisionSum -= decisionIter[decisionCounter] - - decisionIter[decisionCounter].fill(0) - decisionIter[decisionCounter][clusterI] = 1 - - # compute sum of decisions for each element being a exemplar - decisionSum += decisionIter[decisionCounter] - - 
# check for convergence - if it >= maxConvIter or it >= maxIter: - isConverged = True - - for ii in range(self.__n): - # if element is considered to be an exemplar in at least one iteration - # and total of decisions in the last 100 iterations is not 100 --> no convergence - if decisionSum[ii] != 0 and decisionSum[ii] != maxConvIter: - isConverged = False - break - - if isConverged and numClusters > 0: - break - - # -------------------------------------------------------------------------------------------------------------- - - # obtain centroids - centroids = self.__obs[clusterI] - - # find maximum columns in AS matrix to assign elements to clusters / exemplars - # fill A with negative values - self.__A.fill(self.minValue) - # set values of clusters to zero (as we only want to regard these values) - self.__A[:, clusterI] = 0.0 - # fill diagonal of similarity matrix to zero (remove preferences) - np.fill_diagonal(self.__S, 0.0) - - # compute AS matrix - AS = self.__A + self.__S - # since values are < 0, look for the maximum number in each row and return its column index - self.__idx = np.argmax(AS, axis=1) - - clusterI = clusterI.tolist() - clusterLabels = [[] for _ in range(numClusters)] - - # create labels per cluster - for ii in range(self.__n): - index = clusterI.index(self.__idx[ii]) - self.__idx[ii] = index - clusterLabels[index].append(ii) - - # return sorted cluster labels (that's why we call compute cluster distances, might be redundant) - # for ii in range(numClusters): - # clusterLabels[ii], _ = computeClusterInternDistances(self.__obs, clusterLabels[ii]) - - # if isConverged: - # print('Algorithm has converged after {} iterations'.format(it)) - # else: - # print('Algorithm has not converged after 200 iterations') - # - # print('Number of detected clusters {}'.format(numClusters)) - # print('Centroids: {}'.format(centroids)) - - return centroids.tolist(), self.__idx.tolist(), clusterLabels - -######################################################################################################################## - -def _plugin_initialize(): - """ - optional initialization method of this module, will be called once - :return: - """ - pass - -# ---------------------------------------------------------------------------------------------------------------------- - -def create(data, damping, factor, preference, distance): - """ - by convention contain a factory called create returning the extension implementation - :return: - """ - return AffinityPropagation(data, damping, factor, preference, distance) - -######################################################################################################################## - -# from timeit import default_timer as timer - -if __name__ == '__main__': - np.random.seed(200) - # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]]) - # data = np.array([np.random.rand(8000) * 4 - 2 for _ in range(500)]) - # data = np.array([[0.9],[1],[1.1],[10],[11],[12],[20],[21],[22]]) - data = np.array([1,1.1,5,8,5.2,8.3]) - - s = timer() - aff = AffinityPropagation(data, 0.9, 1.0, 'median', 'euclidean') - result = aff.run() - e = timer() - print(result) - print('time elapsed: {}'.format(e - s)) - diff --git a/clustering_api.py b/clustering_api.py deleted file mode 100644 index 3d9941c..0000000 --- a/clustering_api.py +++ /dev/null @@ -1,148 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.1' -__email__ = 'kernm@in.tum.de' - 
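The AffinityPropagation class deleted above follows the standard message-passing formulation: damped responsibility and availability updates, with the preference written onto the diagonal of the negative squared euclidean similarity matrix. For comparison, scikit-learn exposes the same knobs; a rough cross-check sketch, assuming scikit-learn is installed (it is not a dependency of this plugin):

```
# Cross-check sketch only: sklearn's AffinityPropagation uses the same
# damping/preference parameters as the deleted class; sklearn's
# affinity='euclidean' is likewise the negative squared euclidean similarity.
import numpy as np
from sklearn.cluster import AffinityPropagation

data = np.array([1, 1.1, 5, 8, 5.2, 8.3]).reshape(-1, 1)  # sklearn needs 2-D

# preference ~ prefMethod='median' with factor=1.0:
# the median of the negative squared euclidean similarities
S = -np.square(data - data.T)
ap = AffinityPropagation(damping=0.9, preference=np.median(S),
                         affinity='euclidean')
labels = ap.fit_predict(data)

print(ap.cluster_centers_indices_)  # exemplar row indices
print(labels)                       # cluster id per observation
```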
-######################################################################################################################## # libraries - -# use flask library for server activities -import flask -# load services (that are executed by the server when certain website is called) -from clustering_service import * - -# create new flask application for hosting namespace -app = flask.Flask(__name__) - -######################################################################################################################## - -@app.route('/kmeans/<k>/<initMethod>/<distance>/<datasetID>') -def kmeansClustering(k, initMethod, distance, datasetID): - """ - Access k-means clustering plugin. - :param k: number of clusters - :param initMethod: initialization method for initial clusters - :param distance: distance measurement - :param datasetID: identifier of data set - :return: jsonified output - """ - try: - data = loadData(datasetID) - response = runKMeans(data, int(k), initMethod, distance) - return flask.jsonify(response) - except: - return flask.jsonify({}) - -######################################################################################################################## - -@app.route('/hierarchical/<k>/<method>/<distance>/<datasetID>') -def hierarchicalClustering(k, method, distance, datasetID): - """ - Access hierarchical clustering plugin. - :param k: number of desired clusters - :param method: type of single linkage - :param distance: distance measurement - :param datasetID: identifier of data set - :return: jsonified output - """ - try: - data = loadData(datasetID) - response = runHierarchical(data, int(k), method, distance) - return flask.jsonify(response) - except: - return flask.jsonify({}) - -######################################################################################################################## - -@app.route('/affinity/<damping>/<factor>/<preference>/<distance>/<datasetID>') -def affinityPropagationClustering(damping, factor, preference, distance, datasetID): - """ - Access affinity propagation clustering plugin. - :param damping: - :param factor: - :param preference: - :param distance: distance measurement - :param datasetID: - :return: - """ - try: - data = loadData(datasetID) - response = runAffinityPropagation(data, float(damping), float(factor), preference, distance) - return flask.jsonify(response) - except: - return flask.jsonify({}) - -######################################################################################################################## - -@app.route('/fuzzy/<numClusters>/<m>/<threshold>/<distance>/<datasetID>') -def fuzzyClustering(numClusters, m, threshold, distance, datasetID): - """ - :param numClusters: - :param m: - :param threshold: - :param distance: - :param datasetID: - :return: - """ - try: - data = loadData(datasetID) - response = runFuzzy(data, int(numClusters), float(m), float(threshold), distance) - return flask.jsonify(response) - except: - return flask.jsonify({}) - -######################################################################################################################## - -def loadAttribute(jsonData, attr): - import json - data = json.loads(jsonData) - if attr in data: - return data[attr] - else: - return None - -######################################################################################################################## - -@app.route('/distances/<metric>/<datasetID>/<sorted>', methods=['POST']) -def getDistances(metric, datasetID, sorted): - """ - Compute the distances of the current stratification values to its centroid. 
- :param metric: - :param datasetID: - :return: distances and labels sorted in ascending order - """ - data = loadData(datasetID) - labels = [] - externLabels = None - - if 'group' in flask.request.values: - labels = loadAttribute(flask.request.values['group'], 'labels') - externLabels = loadAttribute(flask.request.values['group'], 'externLabels') - else: - return '' - - response = getClusterDistances(data, labels, metric, externLabels, sorted) - return flask.jsonify(response) - -######################################################################################################################## - -@app.route('/dendrogram/<numClusters>/<datasetID>', methods=['POST']) -def dendrogramClusters(numClusters, datasetID): - data = loadData(datasetID) - - if 'group' in flask.request.values: - dendrogram = loadAttribute(flask.request.values['group'], 'dendrogram') - else: - return '' - - response = getClustersFromDendrogram(data, dendrogram, int(numClusters)) - return flask.jsonify(response) - - -######################################################################################################################## - -def create(): - """ - Standard Caleydo convention for creating the service when server is initialized. - :return: Returns implementation of this plugin with given name - """ - return app diff --git a/clustering_fuzzy.py b/clustering_fuzzy.py deleted file mode 100644 index ff4a607..0000000 --- a/clustering_fuzzy.py +++ /dev/null @@ -1,223 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.3' -__email__ = 'kernm@in.tum.de' - -######################################################################################################################## -# libraries - -# module to load own configurations -import caleydo_server.config -# request config if needed for the future -config = caleydo_server.config.view('caleydo-clustering') - -# library to conduct matrix/vector calculus -import numpy as np - -from clustering_util import similarityMeasurement - -######################################################################################################################## -# class definition - -class Fuzzy(object): - """ - Formulas: https://en.wikipedia.org/wiki/Fuzzy_clustering - """ - - def __init__(self, obs, numClusters, m=2.0, threshold=-1, distance='euclidean', init=None, error=0.0001): - """ - Initializes algorithm. - :param obs: observation matrix / genomic data - :param numClusters: number of clusters - :param m: fuzzifier, controls degree of fuzziness, from [1; inf] - :return: - """ - # observation - self.__obs = np.nan_to_num(obs) - - self.__n = obs.shape[0] - - # fuzzifier value - self.__m = np.float(m) - # number of clusters - self.__c = numClusters - - # matrix u containing all the weights describing the degree of membership of each patient to the centroid - if init is None: - init = np.random.rand(self.__c, self.__n) - - self.__u = np.copy(init) - - # TODO!
scikit normalizes the values at the beginning and at each step to [0; 1] - self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))).astype(np.float64) - # remove all zero values and set them to smallest possible value - self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) - # centroids - self.__centroids = np.zeros(self.__c) - # threshold for stopping criterion - self.__error = error - # distance function - self.__distance = distance - - # threshold or minimum probability used for cluster assignments - if threshold == -1: - self.__threshold = 1.0 / numClusters - else: - self.__threshold = threshold - - # ------------------------------------------------------------------------------------------------------------------ - - def __call__(self): - """ - Caller function for server API - :return: - """ - return self.run() - - # ------------------------------------------------------------------------------------------------------------------ - - def computeCentroid(self): - """ - Compute the new centroids using the computed partition matrix. - :return: - """ - uM = self.__u ** self.__m - - sumDataWeights = np.dot(uM, self.__obs) - if self.__obs.ndim == 1: - m = 1 - else: - m = self.__obs.shape[1] - - sumWeights = np.sum(uM, axis=1) - # tile array (sum of weights repeated in every row) - sumWeights = np.ones((m, 1)).dot(np.atleast_2d(sumWeights)).T - - if self.__obs.ndim == 1: - sumWeights = sumWeights.flatten() - - # divide by total sum to get new centroids - self.__centroids = sumDataWeights / sumWeights - - # ------------------------------------------------------------------------------------------------------------------ - - def computeCoefficients(self): - """ - Compute new partition matrix / weights describing the degree of membership of each patient to all clusters. - :return: - """ - - # TODO you can also use cdist of scipy.spatial.distance module - distMat = np.zeros((self.__c, self.__n)) - - for ii in range(self.__c): - distMat[ii] = similarityMeasurement(self.__obs, self.__centroids[ii], self.__distance) - - # set zero values to smallest values to prevent inf results - distMat = np.fmax(distMat, np.finfo(np.float64).eps) - - # apply coefficient formula - denom = np.float(self.__m - 1.0) - self.__u = distMat ** (-2.0 / denom) - - sumCoeffs = np.sum(self.__u, axis=0) - - self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(sumCoeffs)) - self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) - - # ------------------------------------------------------------------------------------------------------------------ - - def run(self): - """ - Perform the c-means fuzzy clustering. 
- :return: - """ - MAX_ITER = 100 - iter = 0 - - while iter < MAX_ITER: - # save last partition matrix - uOld = np.copy(self.__u) - # compute centroids with given weights - self.computeCentroid() - # compute new coefficient matrix - self.computeCoefficients() - - # normalize weight / partition matrix u - self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))) - self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) - - # compute the difference between the old and new matrix - epsilon = np.linalg.norm(self.__u - uOld) - - # stop if difference (epsilon) is smaller than the user-defined threshold - if epsilon < self.__error: - break - - iter += 1 - - self.__end() - - u = self.__u.T - # print(self.__u.T) - - return self.__centroids.tolist(), self.__clusterLabels, u.tolist(), self.__threshold - - # ------------------------------------------------------------------------------------------------------------------ - - def __end(self): - """ - Conduct the cluster assignments and creates clusterLabel array. - :return: - """ - # assign patient to clusters - # transpose to get a (n, c) matrix - u = self.__u.T - - self.__labels = np.zeros(self.__n, dtype=np.int) - self.__clusterLabels = [[] for _ in range(self.__c)] - # gather all probabilities / degree of memberships of each patient to the clusters - # self.__clusterProbs = [[] for _ in range(self.__c)] - # probability that the patients belongs to each cluster - maxProb = np.float64(self.__threshold) - - for ii in range(self.__n): - # clusterID = np.argmax(u[ii]) - # self.__labels = clusterID - # self.__clusterLabels[clusterID].append(ii) - - for jj in range(self.__c): - if u[ii][jj] >= maxProb: - clusterID = jj - self.__labels = clusterID - self.__clusterLabels[clusterID].append(int(ii)) - - # for ii in range(self.__c): - # self.__clusterLabels[ii], _ = computeClusterInternDistances(self.__obs, self.__clusterLabels[ii]) - -######################################################################################################################## - -def _plugin_initialize(): - """ - optional initialization method of this module, will be called once - :return: - """ - pass - -# ---------------------------------------------------------------------------------------------------------------------- - -def create(data, numCluster, m, threshold, distance): - """ - by convention contain a factory called create returning the extension implementation - :return: - """ - return Fuzzy(data, numCluster, m, threshold, distance) - -######################################################################################################################## - -if __name__ == '__main__': - - data = np.array([[1,1,2],[5,4,5],[3,2,2],[8,8,7],[9,8,9],[2,2,2]]) - # data = np.array([1,1.1,5,8,5.2,8.3]) - - fuz = Fuzzy(data, 3, 1.5) - print(fuz.run()) diff --git a/clustering_hierarchical.py b/clustering_hierarchical.py deleted file mode 100644 index b432f40..0000000 --- a/clustering_hierarchical.py +++ /dev/null @@ -1,462 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.3' -__email__ = 'kernm@in.tum.de' - -######################################################################################################################## -# libraries - -# module to load own configurations -import caleydo_server.config -# request config if needed for the future -config = caleydo_server.config.view('caleydo-clustering') - -# library to conduct matrix/vector calculus -import numpy as np -# fastest distance computation by scipy -import scipy.spatial as spt - -# utility 
functions for clustering and creating the dendrogram trees -from clustering_util import BinaryNode, BinaryTree -from clustering_util import similarityMeasurementMatrix -from clustering_util import computeClusterInternDistances - -######################################################################################################################## - -class Hierarchical(object): - """ - This is a implementation of hierarchical clustering on genomic data using the Lance-Williams dissimilarity update - to compute different distance metrics (single linkage, complete linkage, ...). - Lance-Williams explained in: http://arxiv.org/pdf/1105.0121.pdf - """ - - def __init__(self, obs, method='single', distance='euclidean'): - """ - Initializes the algorithm - :param obs: genomic data / matrix - :param method: linkage method - :return: - """ - # genomic data / matrix - # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns - # remove all NaNs in data - self.__obs = np.nan_to_num(obs) - - numGenes = np.shape(self.__obs)[0] - self.__n = numGenes - - # check if dimension is 2D - # if self.__obs.ndim == 2: - # # obtain number of observations (rows) - # numGenes, _ = np.shape(self.__obs) - # self.__n = numGenes - - # else: - # print("[Error]:\tdata / observations must be 2D. 1D observation arrays are not supported") - # raise AttributeError - - # distance measurement - self.__distance = distance - - # distance / proximity matrix - self.__d = [] - self.__computeProximityMatrix() - # dictionary mapping the string id (i,j,k,...) of clusters to corresponding index in matrix - self.__idMap = {} - # inverse mapping of idMap --> returns the string id given a certain index - self.__keyMap = {} - # contains actual index of all clusters, old clusters are from [0, n - 1], new clusters have indices in range - # [n, 2n - 1] - self.__clusterMap = {} - for ii in range(self.__n): - self.__idMap[str(ii)] = ii - self.__keyMap[ii] = str(ii) - self.__clusterMap[str(ii)] = ii - - # linkage method for hierarchical clustering - self.__method = method - - # internal dendrogram tree - self.__tree = None - - # ------------------------------------------------------------------------------------------------------------------ - - def __call__(self): - """ - Caller function for server API - :return: - """ - return self.run() - - # ------------------------------------------------------------------------------------------------------------------ - - @property - def tree(self): - return self.__tree - - # ------------------------------------------------------------------------------------------------------------------ - - def __getCoefficients(self, clusterI, clusterJ): - """ - Compute the coefficients for the Lance-Williams algorithm - :param clusterI: - :param clusterJ: - :return: - """ - # TODO! use hash map for storing numbers instead of computing them every time - if self.__method == 'single': - return 0.5, 0.5, 0, -0.5 - elif self.__method == 'complete': - return 0.5, 0.5, 0, 0.5 - elif self.__method == 'weighted': - return 0.5, 0.5, 0, 0 - elif self.__method == 'median': - return 0.5, 0.5, -0.25, 0 - - # TODO! ATTENTION! average method should compute the cluster centroids using the average - # TODO! || clusterI - clusterJ || ** 2 - elif self.__method == 'average': - nI = np.float(clusterI.count(',') + 1) - nJ = np.float(clusterJ.count(',') + 1) - sumN = nI + nJ - return (nI / sumN), (nJ / sumN), 0, 0 - - # TODO! ATTENTION! 
centroid method should compute the cluster centroids using the mean - # TODO! || clusterI - clusterJ || ** 2 - elif self.__method == 'centroid': - nI = np.float(clusterI.count(',') + 1) - nJ = np.float(clusterJ.count(',') + 1) - sumN = nI + nJ - return (nI / sumN), (nJ / sumN), -((nI * nJ) / (sumN ** 2)), 0 - - # TODO! Support ward method - # TODO! (|clusterI| * |clusterJ|) / (|clusterI| + |clusterJ) * || clusterI - clusterJ || ** 2 - # elif self.__method == 'ward': - # nI = np.float(clusterI.count(',') + 1) - # nJ = np.float(clusterJ.count(',') + 1) - # nK = np.float(clusterK.count(',') + 1) - # sumN = nI + nJ + nK - # return (nI + nK) / sumN, (nJ + nK) / sumN, -nK / sumN, 0 - else: - raise AttributeError - - # ------------------------------------------------------------------------------------------------------------------ - - def __computeProximityMatrix(self): - """ - Compute the proximity of each observation and store the results in a nxn matrix - :return: - """ - - # create distance matrix of size n x n - self.__d = np.zeros((self.__n, self.__n)) - - # compute euclidean distance - # TODO! implement generic distance functions - # TODO! look for an alternative proximity analysis without computing all distances - self.__d = similarityMeasurementMatrix(self.__obs, self.__distance) - - # get number of maximum value of float - self.__maxValue = self.__d.max() + 1 - - # fill diagonals with max value to exclude them from min dist process - # TODO! operate only on upper triangle matrix of distance matrix - np.fill_diagonal(self.__d, self.__maxValue) - - # print('\t-> finished.') - - # ------------------------------------------------------------------------------------------------------------------ - - def __getMatrixMinimumIndices(self): - """ - Searches for the minimum distance in the distance matrix - :return: indices of both clusters having the smallest distance - """ - minDist = self.__d.min() - minList = np.argwhere(self.__d == minDist) - - minI, minJ = 0, 0 - - # look for indices, where i < j - # TODO! for the future --> use upper triangle matrix - for ii in range(len(minList)): - minI, minJ = minList[ii] - if minI < minJ: - break - - if minI == minJ: - print("ERROR") - - return self.__keyMap[minI], self.__keyMap[minJ], minDist - - # ------------------------------------------------------------------------------------------------------------------ - - def __deleteClusters(self, i, j): - """ - Reorders and reduces the matrix to insert the new cluster formed of cluster i and j - and its distance values, and removes the old clusters by cutting the last row. 
- :param i: cluster index i - :param j: cluster index j - :return: - """ - idI = self.__idMap[str(i)] - idJ = self.__idMap[str(j)] - - minID = min(idI, idJ) - maxID = max(idI, idJ) - - # now set column max ID to last column -> swap last and i column - lastRow = self.__d[self.__n - 1] - self.__d[maxID] = lastRow - self.__d[:, maxID] = self.__d[:, (self.__n - 1)] - - # set key of last column (cluster) to column of the cluster with index maxID - key = self.__keyMap[self.__n - 1] - self.__idMap[key] = maxID - self.__keyMap[maxID] = key - - # delete entries in id and key map --> not required anymore - try: - del self.__idMap[i] - del self.__idMap[j] - del self.__keyMap[self.__n - 1] - except KeyError: - print("\nERROR: Key {} not found in idMap".format(j)) - print("ERROR: Previous key: {} in idMap".format(i)) - print("Given keys: ") - for key in self.__idMap: - print(key) - return - - # reduce dimension of matrix by one column and row - self.__n -= 1 - self.__d = self.__d[:-1, :-1] - - # ------------------------------------------------------------------------------------------------------------------ - - def __mergeClusters(self, i, j): - """ - Merges cluster i and j, computes the new ID and distances of the newly formed cluster - and stores required information - :param i: cluster index i - :param j: cluster index j - :return: - """ - idI = self.__idMap[str(i)] - idJ = self.__idMap[str(j)] - - minID = min(idI, idJ) - maxID = max(idI, idJ) - - # use Lance-Williams formula to compute linkages - DKI = self.__d[:, minID] - DKJ = self.__d[:, maxID] - DIJ = self.__d[minID, maxID] - distIJ = np.abs(DKI - DKJ) - - # compute coefficients - ai, aj, b, y = self.__getCoefficients(i, j) - - newEntries = ai * DKI + aj * DKJ + b * DIJ + y * distIJ - newEntries[minID] = self.__maxValue - newEntries[maxID] = self.__maxValue - - # add new column and row - self.__d[minID] = newEntries - self.__d[:, minID] = newEntries - - idIJ = minID - newKey = i + ',' + j - self.__idMap[newKey] = idIJ - self.__keyMap[idIJ] = newKey - self.__clusterMap[newKey] = len(self.__clusterMap) - - # delete old clusters - self.__deleteClusters(i, j) - - # count number of elements - return newKey.count(',') + 1 - - # ------------------------------------------------------------------------------------------------------------------ - - def run(self): - """ - Conducts the algorithm until there's only one cluster. - :return: - """ - - # number of the current iteration - m = 0 - - # resulting matrix containing information Z[i,x], x=0: cluster i, x=1: cluster j, x=2: dist(i,j), x=3: num(i,j) - runs = self.__n - 1 - Z = np.array([[0 for _ in range(4)] for _ in range(runs)], dtype=np.float) - - while m < runs: - m += 1 - - i, j, distIJ = self.__getMatrixMinimumIndices() - numIJ = self.__mergeClusters(i, j) - - clusterI, clusterJ = self.__clusterMap[i], self.__clusterMap[j] - Z[m - 1] = [int(min(clusterI, clusterJ)), int(max(clusterI, clusterJ)), np.float(distIJ), int(numIJ)] - - # reset number n to length of first dimension (number of genes) - self.__n = np.shape(self.__obs)[0] - - self.__tree = self.generateTree(Z) - return Z.tolist() - - # ------------------------------------------------------------------------------------------------------------------ - - def generateTree(self, linkageMatrix): - """ - Computes the dendrogram tree for a given linkage matrix. 
- :param linkageMatrix: - :return: - """ - self.__tree = None - - treeMap = {} - numTrees = len(linkageMatrix) - - for ii in range(numTrees): - entry = linkageMatrix[ii] - currentID = self.__n + ii - leftIndex, rightIndex, value, num = int(entry[1]), int(entry[0]), entry[2], int(entry[3]) - left = right = None - - if leftIndex < self.__n: - left = BinaryNode(self.__obs[leftIndex].tolist(), leftIndex, 1, None, None) - else: - left = treeMap[leftIndex] - - if rightIndex < self.__n: - right = BinaryNode(self.__obs[rightIndex].tolist(), rightIndex, 1, None, None) - else: - right = treeMap[rightIndex] - - if isinstance(left, BinaryNode) and isinstance(right, BinaryNode): - treeMap[currentID] = BinaryTree(left, right, currentID, value) - elif isinstance(left, BinaryNode): - treeMap[currentID] = right.addNode(left, currentID, value) - del treeMap[rightIndex] - elif isinstance(right, BinaryNode): - treeMap[currentID] = left.addNode(right, currentID, value) - del treeMap[leftIndex] - else: - treeMap[currentID] = left.merge(right, currentID, value) - del treeMap[rightIndex] - del treeMap[leftIndex] - - self.__tree = treeMap[numTrees + self.__n - 1] - return self.__tree - - # ------------------------------------------------------------------------------------------------------------------ - -######################################################################################################################## - -from clustering_util import cutJsonTreeByClusters - -def getClusters(k, obs, dendrogram, sorted=True): - """ - First implementation to cut dendrogram tree automatically by choosing nodes having the greatest node values - or rather distance to the other node / potential cluster - :param k: number of desired clusters - :param obs: set of observations - :param dendrogram: dendrogram tree - :return: centroids, sorted cluster labels and normal label list - """ - obs = np.nan_to_num(obs) - n = obs.shape[0] - - if isinstance(dendrogram, BinaryTree): - clusterLabels = dendrogram.cutTreeByClusters(k) - else: - clusterLabels = cutJsonTreeByClusters(dendrogram, k) - - clusterCentroids = [] - labels = np.zeros(n, dtype=np.int) - clusterID = 0 - - for ii in range(len(clusterLabels)): - cluster = clusterLabels[ii] - subObs = obs[cluster] - clusterCentroids.append(np.mean(subObs, axis=0).tolist()) - - for id in cluster: - labels[id] = clusterID - - # sort labels according to their distance - if sorted: - clusterLabels[ii], _ = computeClusterInternDistances(obs, cluster) - - clusterID += 1 - - return clusterCentroids, clusterLabels, labels.tolist() - -######################################################################################################################## - -def _plugin_initialize(): - """ - optional initialization method of this module, will be called once - :return: - """ - pass - -# ---------------------------------------------------------------------------------------------------------------------- - -def create(data, method, distance): - """ - by convention contain a factory called create returning the extension implementation - :return: - """ - return Hierarchical(data, method, distance) - -######################################################################################################################## - -from timeit import default_timer as timer -from scipy.cluster.hierarchy import linkage, leaves_list - -if __name__ == '__main__': - - np.random.seed(200) - # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]]) - data = np.array([1,1.1,5,8,5.2,8.3]) - - timeMine = 
0 - timeTheirs = 0 - - - n = 10 - - for i in range(n): - # data = np.array([np.random.rand(6000) * 4 - 2 for _ in range(249)]) - # import time - s1 = timer() - hier = Hierarchical(data, 'complete') - # s = time.time() - linkageMatrix = hier.run() - e1 = timer() - print(linkageMatrix) - tree = hier.generateTree(linkageMatrix) - # print(tree.getLeaves()) - # print(tree.jsonify()) - # print(hier.getClusters(3)) - import json - jsonTree = json.loads(tree.jsonify()) - getClusters(3, data, jsonTree) - - - s2 = timer() - linkageMatrix2 = linkage(data, 'complete') - # print(leaves_list(linkageMatrix2)) - e2 = timer() - - timeMine += e1 - s1 - timeTheirs += e2 - s2 - - # print(linkageMatrix) - # print(linkageMatrix2) - print('mine: {}'.format(timeMine / n)) - print('theirs: {}'.format(timeTheirs / n)) - diff --git a/clustering_kmeans.py b/clustering_kmeans.py deleted file mode 100644 index 5e5d2a2..0000000 --- a/clustering_kmeans.py +++ /dev/null @@ -1,398 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.2' -__email__ = 'kernm@in.tum.de' - -######################################################################################################################## -# libraries - -# module to load own configurations -import caleydo_server.config -# request config if needed in the future -config = caleydo_server.config.view('caleydo-clustering') - -# numpy important to conduct matrix/vector calculus -import numpy as np -# creates random numbers -import random - -# contains utility functions -from clustering_util import weightedChoice, similarityMeasurement, computeClusterInternDistances - -######################################################################################################################## - -class KMeans: - """ - This is an implementation of the k-means algorithm to cluster genomic data / matrices. - Returns the centroids, the labels / stratification of each row belonging to one cluster, - distance matrix for cluster-cluster distance and distance arrays for row-clusterCentroid distance. - Implementation detail: - """ - - def __init__(self, obs, k, initMode='kmeans++', distance='sqeuclidean', iters=1000): - """ - Initializes the algorithm with observation, number of k clusters, the initial method and - the maximum number of iterations. 
- Initialization method of random cluster choice can be: forgy, uniform, random, plusplus - :param obs: genomic data / matrix - :param k: number of clusters - :param initMode: initialization method - :param distance: distance measurement - :param iters: number of maximum iterations - :return: - """ - - # number of clusters - self.__k = k - # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns - # remove all NaNs in data - self.__obs = np.nan_to_num(obs) - # number of observations / genes - self.__n = np.shape(obs)[0] - # maps the element ids to clusters - self.__labelMap = np.zeros(self.__n, dtype=np.int) - # cluster means and number of elements - self.__clusterMeans = np.array([obs[0] for _ in range(k)], dtype=np.float) - self.__clusterNums = np.array([0 for _ in range(k)], dtype=np.int) - # tells if any cluster has changed or rather if any data item was moved - self.__changed = True - # number of iterations - self.__iters = iters - # initialization method - self.__initMode = initMode - # compare function - self.__distance = distance - - # ------------------------------------------------------------------------------------------------------------------ - - def __call__(self): - """ - Caller function for server API. - """ - return self.run() - - # ------------------------------------------------------------------------------------------------------------------ - - def __init(self): - """ - Initialize clustering with random clusters using a user-specified method - :return: - """ - # TODO! consider to init k-Means algorithm with Principal Component Analysis (PCA) - # TODO! see - # init cluster - if self.__initMode == 'forgy': - self.__forgyMethod() - elif self.__initMode == 'uniform': - self.__uniformMethod() - elif self.__initMode == 'random': - self.__randomMethod() - elif self.__initMode == 'kmeans++': - self.__plusplusMethod() - else: - raise AttributeError - - # ------------------------------------------------------------------------------------------------------------------ - - def __forgyMethod(self): - """ - Initialization method: - Randomly choose k observations from the data using a uniform random distribution. - :return: - """ - for ii in range(self.__k): - self.__clusterMeans[ii] = (self.__obs[random.randint(0, self.__n - 1)]) - - # ------------------------------------------------------------------------------------------------------------------ - - def __uniformMethod(self): - """ - Initialization method: - Randomly assign each observation to one of the k clusters using uniform random distribution - and compute the centroids of each cluster. - :return: - """ - for i in range(self.__n): - self.__labelMap[i] = random.randint(0, self.__k - 1) - - self.__update() - - # ------------------------------------------------------------------------------------------------------------------ - - def __randomMethod(self): - """ - Initialization method: - Randomly choose k observations from the data by estimating the mean and standard deviation of the data and - using the gaussian random distribution. 
- :return: - """ - mean = np.mean(self.__obs, axis=0) - std = np.std(self.__obs, axis=0) - - for ii in range(self.__k): - self.__clusterMeans[ii] = np.random.normal(mean, std) - - # ------------------------------------------------------------------------------------------------------------------ - - def __plusplusMethod(self): - """ - Initialization method: - Chooses k observations by computing probabilities for each observation and using a weighted random distribution. - Algorithm: . This method should accelerate the algorithm by finding - the appropriate clusters right at the beginning and hence should make it more robust. - :return: - """ - # 1) choose random center out of data - self.__clusterMeans[0] = (random.choice(self.__obs)) - - maxValue = np.max(self.__obs) + 1 - probs = np.array([maxValue for _ in range(self.__n)]) - - for i in range(1, self.__k): - probs.fill(maxValue) - # compute new probabilities, choose min of all distances - for j in range(0, i): - dists = similarityMeasurement(self.__obs, self.__clusterMeans[j], self.__distance) - # collect minimum squared distances to cluster centroids - probs = np.minimum(probs, dists) - - # sum all squared distances - sumProbs = np.float(np.sum(probs)) - - if sumProbs != 0: - probs /= sumProbs - # 3) choose new center based on probabilities - self.__clusterMeans[i] = (self.__obs[weightedChoice(probs)]) - else: - print('ERROR: cannot find enough cluster centroids for given k = ' + str(self.__k)) - - # ------------------------------------------------------------------------------------------------------------------ - - def getClusterMean(self, num): - """ - Returns the centroid of the cluster with index num. - :param num: - :return: - """ - if num >= self.__k: - return None - else: - return self.__clusterMeans[num] - - # ------------------------------------------------------------------------------------------------------------------ - - def getClusterOfElement(self, index): - """ - :param index: number of element in observation array - :return: cluster id of observation with given index. - """ - if index >= self.__n: - return None - else: - return self.__labelMap[index] - - # ------------------------------------------------------------------------------------------------------------------ - - def printClusters(self): - """ - Print the cluster centroids and the labels. - :return: - """ - print('Centroids: ' + str(self.__centroids) + ' | Labels: ' + str(self.__labels)) - - # ------------------------------------------------------------------------------------------------------------------ - - def __assignment(self): - """ - Assignment step: - Compute distance of current observation to each cluster centroid and move gene to the nearest cluster. - :return: - """ - for i in range(self.__n): - value = self.__obs[i] - - # compute squared distances to each mean - dists = similarityMeasurement(self.__clusterMeans, value, self.__distance) - # nearest cluster - nearestID = np.argmin(dists) - - if self.__labelMap[i] != nearestID: - self.__changed = True - self.__labelMap[i] = nearestID - - # ------------------------------------------------------------------------------------------------------------------ - - def __update(self): - """ - Update step: - Compute the new centroids of each cluster after the assignment. 
- :return: - """ - self.__clusterMeans.fill(0) - self.__clusterNums.fill(0) - - self.__clusterLabels = [[] for _ in range(self.__k)] - - for ii in range(self.__n): - clusterID = self.__labelMap[ii] - self.__clusterLabels[clusterID].append(ii) - self.__clusterNums[clusterID] += 1 - - for ii in range(self.__k): - self.__clusterMeans[ii] = np.mean(self.__obs[self.__clusterLabels[ii]], axis=0) - - # ------------------------------------------------------------------------------------------------------------------ - - def __end(self): - """ - Writes the results to the corresponding member variables. - :return: - """ - # returned values | have to be reinitialized in case of sequential running - # centroids - self.__centroids = np.array([self.__obs[0] for _ in range(self.__k)], dtype=np.float) - # labels of observations - self.__labels = np.array([0 for _ in range(self.__n)], dtype=np.int) - # distances between centroids - # self.__centroidDistMat = np.zeros((self.__k, self.__k)) - - # we do not use OrderedDict here, so obtain dict.values and fill array manually - for index in range(self.__n): - clusterID = self.__labelMap[index] - self.__labels[index] = clusterID - - # collect centroids - for ii in range(self.__k): - # self.__centroids.append(self.__clusterMeans[ii].tolist()) - self.__centroids[ii] = self.__clusterMeans[ii] - - # compute distances between each centroids - # for ii in range(self.__k - 1): - # # compute indices of other clusters - # jj = range(ii + 1, self.__k) - # # select matrix of cluster centroids - # centroidMat = self.__centroids[jj] - # distances = np.sqrt(self.__compare(centroidMat, self.__centroids[ii])) - # self.__centroidDistMat[ii, jj] = distances - # self.__centroidDistMat[jj, ii] = distances - - # ------------------------------------------------------------------------------------------------------------------ - - def run(self): - """ - Runs the algorithm of k-means, using the initialization method and the assignment/update step. - Conducts at most iters iterations and terminates if this number is exceeded or no observations - was moved to another cluster. - :return: - """ - # 1) init algorithm by choosing cluster centroids - self.__init() - - MAX_ITERS = self.__iters - counter = 0 - # 2) run clustering - while self.__changed and counter < MAX_ITERS: - self.__changed = False - - self.__assignment() - self.__update() - - counter += 1 - - self.numIters = counter - - # write results to the class members - self.__end() - return self.__centroids.tolist(), self.__labels.tolist(), self.__clusterLabels - #, self.__centroidDistMat.tolist() - - # ------------------------------------------------------------------------------------------------------------------ - - # def getDistsPerCentroid(self): - # """ - # Compute the distances between observations belonging to one cluster and the corresponding cluster centroid. 
- # Cluster labels are sorted in ascending order using their distances - # :return: array of distance arrays for each cluster and ordered labels - # """ - # - # # labels per centroid - # # self.__clusterLabels = [[] for _ in range(self.__k)] - # # distances of obs to their cluster - # self.__centroidDists = [[] for _ in range(self.__k)] - # - # for ii in range(self.__k): - # self.__clusterLabels[ii] = np.array(self.__clusterLabels[ii], dtype=np.int) - # - # # compute euclidean distances of values to cluster mean - # for ii in range(self.__k): - # mean = self.__clusterMeans[ii] - # obs = self.__obs[self.__clusterLabels[ii]] - # dists = similarityMeasurement(obs, mean, self.__compare).tolist() - # self.__centroidDists[ii] = dists - # - # # sort indices in ascending order using the distances - # indices = range(len(dists)) - # indices.sort(key=dists.__getitem__) - # self.__clusterLabels[ii] = self.__clusterLabels[ii][indices].tolist() - # self.__centroidDists[ii].sort() - # - # return self.__clusterLabels, self.__centroidDists - -######################################################################################################################## - -def _plugin_initialize(): - """ - optional initialization method of this module, will be called once - :return: - """ - pass - -# ---------------------------------------------------------------------------------------------------------------------- - -def create(data, k, initMethod, distance): - """ - by convention contain a factory called create returning the extension implementation - :return: - """ - return KMeans(data, k, initMethod, distance) - -######################################################################################################################## - -from timeit import default_timer as timer -from scipy.cluster.vq import kmeans2, kmeans - -""" -This is for testing the algorithm and comparing the resuls between this and scipy's algorithm -""" -if __name__ == '__main__': - from datetime import datetime - #np.random.seed(datetime.now()) - # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]]) - data = np.array([1,1.1,5,8,5.2,8.3]) - - # data = np.array([np.random.rand(2) * 5 for _ in range(10)]) - k = 3 - - timeMine = 0 - timeTheirs = 0 - n = 10 - - for i in range(10): - s1 = timer() - kMeansPlus = KMeans(data, k, 'kmeans++', 'sqeuclidean', 10) - result1 = kMeansPlus.run() - #print(result) - e1 = timer() - # labels = kMeansPlus.getDistsPerCentroid() - # l, d = computeClusterDistances(data, labels[0]) - - s2 = timer() - result2 = kmeans2(data, k) - e2 = timer() - - timeMine += e1 - s1 - timeTheirs += e2 - s2 - - print(result1) - print(result2) - print('mine: {}'.format(timeMine / n)) - print('theirs: {}'.format(timeTheirs / n)) diff --git a/clustering_service.py b/clustering_service.py deleted file mode 100644 index 0bed35c..0000000 --- a/clustering_service.py +++ /dev/null @@ -1,151 +0,0 @@ -__author__ = 'Michael Kern' -__version__ = '0.0.1' -__email__ = 'kernm@in.tum.de' - -import numpy as np -from clustering_hierarchical import getClusters - -######################################################################################################################## - -def loadData(datasetID): - """ - Loads the genomic data with given identifier datasetID. 
- :param datasetID: identifier - :return: array of the genomic data - """ - import caleydo_server.dataset as dt - # obtain Caleydo dataset from ID - dataset = dt.get(datasetID) - # choose loaded attribute and load raw data in numpy format - # somehow hack to get a numpy array out of the data - try: - arr = np.array(list(dataset.asnumpy())) - except: - raise Exception - return arr - -######################################################################################################################## - -def loadPlugin(pluginID, *args, **kwargs): - """ - Loads the clustering plugin with given arguments. - :param pluginID: identifier of plugin - :param *args: additional caller function arguments - :param **kwargs: additional arguments - :return: plugin - """ - import caleydo_server.plugin - # obtain all plugins with 'pluginID' extension - plugins = caleydo_server.plugin.list('clustering') - # choose plugin with given ID - for plugin in plugins: - if plugin.id == pluginID: - # load the implementation of the plugin - return plugin.load().factory(*args, **kwargs) - - raise NotImplementedError - - -######################################################################################################################## - -def runKMeans(data, k, initMethod, distance): - """ - Runs the k-Means clustering algorithm given the loaded data set, the number of clusters k and the initialization - method. - :param data: observation matrix - :param k: number of clusters - :param initMethod: number of clusters - :return: result of k-means - """ - KMeans = loadPlugin('caleydo-clustering-kmeans', data, k, initMethod, distance) - # and run the kmeans extension - centroids, labels, clusterLabels = KMeans() - # clusterLabels, clusterDists = KMeans.getDistsPerCentroid() - - return {'centroids': centroids, 'clusterLabels': clusterLabels} - -######################################################################################################################## - -def runHierarchical(data, k, method, distance): - """ - Runs the hierarchical clustering algorithm given the loaded data set and type of linkage method. - :param data: observation matrix - :param method: linkage method - :return: linkage matrix / dendrogram of the algorithm - """ - Hierarchical = loadPlugin('caleydo-clustering-hierarchical', data, method, distance) - # and use the extension - Hierarchical() - # obtain k-number of clusters - centroids, clusterLabels, labels = getClusters(k, data, Hierarchical.tree, False) - - return {'centroids': centroids, 'clusterLabels': clusterLabels, 'dendrogram': Hierarchical.tree.json()} - # print('\t-> creating dendrogram tree...') - # tree = Hierarchical.generateTree(linkage) - # print('\t-> creating json string ...') - # dendrogram = tree.jsonify() - # print('\t-> finished.') - - # return {'dendrogram': dendrogram} --> if needed later - -######################################################################################################################## - -def runAffinityPropagation(data, damping, factor, preference, distance): - """ - Runs the affinity propagation algorithm given the loaded dataset, a damping value, a certain factor and - a preference method. 
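# Illustrative end-to-end use of the service layer above; 'my-dataset-id' is a placeholder for a
# real Caleydo dataset identifier, not a working ID:
data = loadData('my-dataset-id')
result = runKMeans(data, 3, 'kmeans++', 'euclidean')
print(result['centroids'], result['clusterLabels'])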
- :param data: - :param damping: - :param factor: - :param preference: - :return: - """ - Affinity = loadPlugin('caleydo-clustering-affinity', data, damping, factor, preference, distance) - # use this extension - centroids, labels, clusterLabels = Affinity() - - return {'centroids': centroids, 'clusterLabels': clusterLabels} - -######################################################################################################################## - -def runFuzzy(data, numClusters, m, threshold, distance): - Fuzzy = loadPlugin('caleydo-clustering-fuzzy', data, numClusters, m, threshold, distance) - - centroids, clusterLabels, partitionMatrix, maxProb = Fuzzy() - - return {'centroids': centroids, 'clusterLabels': clusterLabels, 'partitionMatrix': partitionMatrix, - 'maxProbability': maxProb} - -######################################################################################################################## - -def getClusterDistances(data, labels, metric, externLabels = None, sorted = True): - """ - Compute the cluster distances in a given data among certain rows (labels) - :param data: genomic data - :param labels: indices of rows - :param metric: distance metric - :param externLabels: - :return: labels and distances values sorted in ascending order - """ - from clustering_util import computeClusterInternDistances, computeClusterExternDistances - distLabels, distValues = computeClusterInternDistances(data, labels, sorted, metric) - - if externLabels is not None: - externDists = computeClusterExternDistances(data, distLabels, externLabels, metric) - return {'labels': distLabels, 'distances': distValues, 'externDistances': externDists} - else: - return {'labels': distLabels, 'distances': distValues} - -######################################################################################################################## - -def getClustersFromDendrogram(data, dendrogram, numClusters): - """ - - :param data: - :param dendrogram: - :param numClusters: - :return: - """ - - centroids, clusterLabels, _ = getClusters(numClusters, data, dendrogram) - return {'centroids': centroids, 'clusterLabels': clusterLabels} diff --git a/clustering_util.py b/clustering_util.py deleted file mode 100644 index 575a0d2..0000000 --- a/clustering_util.py +++ /dev/null @@ -1,470 +0,0 @@ -__author__ = "Michael Kern" -__email__ = 'kernm@in.tum.de' - -######################################################################################################################## - -import random -import numpy as np - -# use scipy to compute different distance matrices -from scipy.spatial.distance import pdist, squareform -import scipy.stats as stats - -""" -http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python ---> good explanation to create weighted choices / random numbers -""" -def weightedChoice(weights): - - # compute sum of all weights - sumTotal = sum(weights) - # compute a random with range[0, sumTotal] - rnd = random.random() * sumTotal - - for index, weight in enumerate(weights): - # subtract current weight from random to find current index - rnd -= weight - if rnd < 0: - return index - - # 20% faster if weights are sorted in descending order - -######################################################################################################################## - -""" -Implementation of an binary tree for hierarchical clustering -""" - -""" -Node of the tree containing information about it's id in data, children and the value -""" -class BinaryNode: - def __init__(self, value, id, size, 
leftChild, rightChild): - self.value = value - self.left = leftChild - self.right = rightChild - self.size = size - self.id = id - self.parent = None - self.indices = [id] - - # create json info on the fly - self.json = {"id": self.id, "size": self.size, "value": self.value, "indices": [id]} - if leftChild is not None and rightChild is not None: - # self.json["value"] = np.mean(self.value) - self.json["children"] = [rightChild.json, leftChild.json] - self.indices = [] + rightChild.indices + leftChild.indices - self.json["indices"] = self.indices - - def isLeave(self): - return self.left is None and self.right is None - -######################################################################################################################## - -""" -Implementation of an hierarchical binary tree -""" -class BinaryTree: - # this tree must not be empty and must have at least two children (leaves) - def __init__(self, leftNode, rightNode, newID, newValue): - self.__createNewRoot(leftNode, rightNode, newID, newValue) - - # ------------------------------------------------------------------------------------------------------------------ - - def addNode(self, newNode, newID, newValue): - self.__createNewRoot(self.root, newNode, newID, newValue) - return self - - # ------------------------------------------------------------------------------------------------------------------ - - def merge(self, tree, newID, newValue): - self.__createNewRoot(self.root, tree.root, newID, newValue) - return self - - # ------------------------------------------------------------------------------------------------------------------ - - def jsonify(self): - import json - return json.dumps(self.root.json) - # return self.root.json - # return self.__traverseJson(self.root) - - # ------------------------------------------------------------------------------------------------------------------ - - def json(self): - return self.root.json - - # ------------------------------------------------------------------------------------------------------------------ - - def cutTreeByClusters(self, k): - queue = [self.root] - - while len(queue) < k: - node = queue.pop(0) - queue.append(node.left) - queue.append(node.right) - - def keyFunc(x): - if x.isLeave(): - return 0 - else: - return -x.value - - queue.sort(key=keyFunc) - - clusters = [] - - for node in queue: - clusters.append(node.indices) - - return clusters - - # ------------------------------------------------------------------------------------------------------------------ - - def __traverseJson(self, node): - json = {"id": node.id, "size": node.size, "value": node.value} - if node.left is None and node.right is None: - return json - else: - json["children"] = [] + [self.__traverseJson(node.left)] + [self.__traverseJson(node.right)] - - return json - - # ------------------------------------------------------------------------------------------------------------------ - - def getLeaves(self): - return self.root.indices - # return self.__traverseIDs(self.root) - - # ------------------------------------------------------------------------------------------------------------------ - - def __traverseIDs(self, node): - - if node.left is None and node.right is None: - return [node.id] - else: - return [] + self.__traverseIDs(node.right) + self.__traverseIDs(node.left) - - # ------------------------------------------------------------------------------------------------------------------ - - def __createNewRoot(self, leftNode, rightNode, newID, newValue): - newSize = leftNode.size 
+ rightNode.size - self.root = BinaryNode(newValue, newID, newSize, leftNode, rightNode) - leftNode.parent = rightNode.parent = self.root - - # ------------------------------------------------------------------------------------------------------------------ - -def cutJsonTreeByClusters(jsonData, k): - # import json - # tree = json.loads(jsonData) - queue = [jsonData] - - while len(queue) < k: - node = queue.pop(0) - queue.append(node['children'][0]) - queue.append(node['children'][1]) - - def keyFunc(x): - if 'children' not in x: - return 0 - else: - return -x['value'] - - queue.sort(key=keyFunc) - - clusters = [] - - for node in queue: - clusters.append(node['indices']) - - return clusters - -######################################################################################################################## - -def euclideanDistance(matrix, vector, squared=False): - """ - Computes the euclidean distance between a vector and the rows of a matrix in parallel. - :param matrix: array of observations or clusters - :param vector: cluster centroid or observation - :return: - """ - - # compute distance between values in matrix and the vector - distMat = matrix - vector - numValues = len(matrix) - distances = np.array([0.0 for _ in range(numValues)], dtype=np.float) - - for ii in range(numValues): - distance = distMat[ii] - # always try to use np.dot when computing euclidean distance - # it's way faster than ** 2 and sum(..., axis=1) - distances[ii] = np.dot(distance, distance) - - if squared: - return distances - else: - return np.sqrt(distances) - -# ---------------------------------------------------------------------------------------------------------------------- - -def correlationDistance(matrix, vector, method): - """ - - :param matrix: - :param vector: - :return: - """ - - numValues = len(matrix) - distances = np.array([0.0 for _ in range(numValues)], dtype=np.float) - - for ii in range(numValues): - value = matrix[ii] - - if method == 'pearson': - distances[ii], _ = stats.pearsonr(value, vector) - elif method == 'spearman': - distances[ii], _ = stats.spearmanr(value, vector) - elif method == 'kendall': - distances[ii], _ = stats.kendalltau(value, vector) - else: - raise AttributeError - - return distances - -# ---------------------------------------------------------------------------------------------------------------------- - -from scipy.spatial.distance import cdist - -def similarityMeasurement(matrix, vector, method='euclidean'): - - if method == 'euclidean': - return euclideanDistance(matrix, vector) - - if method == 'sqeuclidean': - return euclideanDistance(matrix, vector, True) - - spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis',] - - if method in spatialMethods: - return np.nan_to_num(cdist(matrix, np.atleast_2d(vector), method).flatten()) - - corrMethods = ['spearman', 'pearson', 'kendall'] - - if method in corrMethods: - return correlationDistance(matrix, vector, method) - - raise AttributeError - -# ---------------------------------------------------------------------------------------------------------------------- - -def euclideanDistanceMatrix(matrix, squared=False): - """ - Compute the euclidean distance matrix required for the algorithm - :param matrix: - :param n: - :return: - """ - - n = np.shape(matrix)[0] - distMat = np.zeros((n, n)) - - # use Gram matrix and compute distances without inner products | FASTER than row-by-row method - "Gramiam matrix to compute dot products of each pair of elements: " - "" - gramMat = 
np.zeros((n, n)) - for ii in range(n): - for jj in range(ii, n): - gramMat[ii, jj] = np.dot(matrix[ii], matrix[jj]) - - # # ! This is slower than computing dot products of rows manually in python - # # ! And we only require the upper triangle matrix of the Gram matrix - # gramMat = np.dot(self.__obs, self.__obs.T) - - # make use of formula |a - b|^2 = a^2 - 2ab + b^2 - for ii in range(n): - # self.__d[ii, ii] = self.__maxValue - jj = np.arange(ii + 1, n) - distMat[ii, jj] = gramMat[ii, ii] - 2 * gramMat[ii, jj] + gramMat[jj, jj] - distMat[jj, ii] = distMat[ii, jj] - - # # take square root of distances to compute real euclidean distance - # distMat = np.sqrt(distMat) - - "alternative version --> use scipy's fast euclidean distance implementation: FASTEST" - # distMat = spt.distance.pdist(self.__obs, 'euclidean') - # self.__d = spt.distance.squareform(distMat) - # print(distMat) - - if squared: - return distMat - else: - return np.sqrt(distMat) - -# ---------------------------------------------------------------------------------------------------------------------- - -def norm1Distance(matrix, vector): - """ - Computes the norm-1 distance between a vector and the rows of a matrix in parallel. - :param matrix: array of observations or clusters - :param vector: cluster centroid or observation - :return: - """ - distMat = np.abs(matrix - vector) - numValues = len(vector) - - distances = np.sum(distMat, axis=1) / numValues - return distances - -# ---------------------------------------------------------------------------------------------------------------------- - -def pearsonCorrelationMatrix(matrix): - """ - - :param matrix: - :param n: - :return: - """ - # TODO! other possibilites like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2 - distMat = 1 - np.corrcoef(matrix) - - return distMat - -# ---------------------------------------------------------------------------------------------------------------------- - -def statsCorrelationMatrix(matrix, method): - if method == 'pearson': - return pearsonCorrelationMatrix(matrix) - - n = np.shape(matrix)[0] - distMat = np.zeros((n, n)) - - for ii in range(n): - rowI = matrix[ii] - for jj in range(ii + 1, n): - rowJ = matrix[jj] - corr = 0 - - if method == 'spearman': - corr, _ = stats.spearmanr(rowI, rowJ) - - if method == 'kendall': - corr, _ = stats.kendalltau(rowI, rowJ) - - # TODO! 
other possibilites like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2 - corr = 1 - corr - - distMat[ii, jj] = corr - distMat[jj, ii] = corr - - return distMat - -# ---------------------------------------------------------------------------------------------------------------------- - -def similarityMeasurementMatrix(matrix, method): - """ - Generic function to determine the similarity measurement for clustering - :param matrix: - :param method: - :return: - """ - if method == 'euclidean': - return euclideanDistanceMatrix(matrix) - # return squareform(pdist(matrix, method)) - - if method == 'sqeuclidean': - return euclideanDistanceMatrix(matrix, True) - # return squareform(pdist(matrix, method)) - - spatialMethods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis'] - - if method in spatialMethods: - return squareform(np.nan_to_num(pdist(matrix, method))) - - corrMethods = ['spearman', 'pearson', 'kendall'] - - if method in corrMethods: - return statsCorrelationMatrix(matrix, method) - - raise AttributeError - - -######################################################################################################################## -# utility functions to compute distances between rows and cluster centroids - -def computeClusterInternDistances(matrix, labels, sorted=True, metric='euclidean'): - """ - Computes the distances of each element in one cluster to the cluster's centroid. Returns distance values and labels - sorted in ascending order. - :param matrix: - :param labels: - :return: labels / indices of elements corresponding to distance array, distance values of cluster - """ - clusterLabels = np.array(labels) - if len(clusterLabels) == 0: - return [], [] - - subMatrix = matrix[clusterLabels] - # compute centroid of cluster along column (as we want to average each gene separately) - centroid = np.mean(subMatrix, axis=0) - - # compute distances to centroid - dists = similarityMeasurement(subMatrix, centroid, metric) - - if sorted == 'true': - # sort values - indices = range(len(dists)) - indices.sort(key=dists.__getitem__) - dists.sort() - - # reverse order if correlation coefficient is used - # (1 means perfect correlation while -1 denotes opposite correlation) - corrMetrics = ['pearson', 'spearman', 'kendall'] - if metric in corrMetrics: - indices.reverse() - dists = dists[::-1] - - # write back to our arrays - distLabels = clusterLabels[indices].tolist() - distValues = dists.tolist() - else: - distLabels = clusterLabels.tolist() - distValues = dists.tolist() - - return distLabels, distValues - -# ---------------------------------------------------------------------------------------------------------------------- - -def computeClusterExternDistances(matrix, labels, outerLabels, metric='euclidean'): - """ - Compute the distances of patients in one cluster to the centroids of all other clusters. 
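# The sorting branch of computeClusterInternDistances above can be expressed with a single argsort;
# a sketch (assuming a boolean sorted flag and a numpy distance array) that avoids both the
# comparison against the string 'true' and the Python-2-only range(...).sort() idiom:
import numpy as np

def sort_by_distance(cluster_labels, dists, ascending=True):
    order = np.argsort(dists)  # indices that sort the distances
    if not ascending:  # e.g. for correlation metrics, where 1 means perfect correlation
        order = order[::-1]
    return cluster_labels[order].tolist(), dists[order].tolist()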
- :param matrix: - :param labels: - :param outerLabels: - :return: - """ - externDists = [] - internSubMatrix = matrix[labels] - - for externLabels in outerLabels: - - if len(externLabels) == 0: - externDists.append([]) - - # compute centroid of external cluster - subMatrix = matrix[externLabels] - centroid = np.mean(subMatrix, axis=0) - - dists = similarityMeasurement(internSubMatrix, centroid, metric) - externDists.append(dists.tolist()) - - return externDists - -######################################################################################################################## - -if __name__ == '__main__': - print(cdist([[1,1,1],[3,3,3],[5,5,5]],np.atleast_2d([2,2,2]), 'sqeuclidean').flatten()) - - from scipy.stats import spearmanr - - print(spearmanr([1,2,3],[2,4,1])) diff --git a/docker_packages.txt b/docker_packages.txt new file mode 100644 index 0000000..e69de29 diff --git a/package.json b/package.json index 678bd7e..ab5f3a3 100644 --- a/package.json +++ b/package.json @@ -1,46 +1,37 @@ { - "name": "caleydo_clustering", - "version": "1.0.0", - "license" : "SEE LICENSE IN LICENSE", - "repository": "K3rn1n4tor/caleydo_clustering", - "dependencies": { + "files": [ + "phovea_clustering", + "__init__.py", + "__main__.py", + "build", + "requirements.txt", + "requirements_dev.txt", + "docker_packages.txt" + ], + "scripts": { + "check": "flake8", + "pretest": "npm run check", + "test": "python setup.py test", + "prebuild": "node -e \"process.exit(process.env.PHOVEA_SKIP_TESTS === undefined?1:0)\" || npm run test", + "build": "python -c \"from distutils.dir_util import mkpath ; mkpath('./build/source')\" && (tar -c ./phovea_clustering --exclude '*.pyc' | tar -xC build/source)", + "predist": "npm run build", + "dist": "python setup.py bdist_egg" }, - "peerDependencies": { - "caleydo_server": "*" + "name": "phovea_clustering", + "description": "", + "homepage": "https://phovea.caleydo.org", + "version": "1.0.0", + "author": { + "name": "The Caleydo Team", + "email": "contact@caleydo.org", + "url": "https://caleydo.org" }, - "caleydo": { - "plugins": { - "python": [ - { - "type": "clustering", - "id": "caleydo-clustering-kmeans", - "file": "clustering_kmeans" - }, - { - "type": "clustering", - "id": "caleydo-clustering-hierarchical", - "file": "clustering_hierarchical" - }, - { - "type": "clustering", - "id": "caleydo-clustering-affinity", - "file": "clustering_affinity" - }, - { - "type": "clustering", - "id": "caleydo-clustering-fuzzy", - "file": "clustering_fuzzy" - }, - { - "type": "namespace", - "id": "caleydo-clustering", - "file": "clustering_api", - "namespace": "/api/clustering" - } - ] - } + "license": "BSD-3-Clause", + "bugs": { + "url": "https://github.com/phovea/phovea_clustering/issues" }, - "publishConfig": { - "registry": "http://registry.caleydo.org/" + "repository": { + "type": "git", + "url": "https://github.com/phovea/phovea_clustering.git" } } diff --git a/phovea_clustering/__init__.py b/phovea_clustering/__init__.py new file mode 100644 index 0000000..0a25671 --- /dev/null +++ b/phovea_clustering/__init__.py @@ -0,0 +1,36 @@ +############################################################################### +# Caleydo - Visualization for Molecular Biology - http://caleydo.org +# Copyright (c) The Caleydo Team. All rights reserved. 
+# Licensed under the new BSD license, available at http://caleydo.org/license +############################################################################### + + +def phovea(registry): + """ + register extension points + :param registry: + """ + # generator-phovea:begin + registry.append('clustering', 'caleydo-clustering-kmeans', 'phovea_clustering.clustering_kmeans', {}) + + registry.append('clustering', 'caleydo-clustering-hierarchical', 'phovea_clustering.clustering_hierarchical', {}) + + registry.append('clustering', 'caleydo-clustering-affinity', 'phovea_clustering.clustering_affinity', {}) + + registry.append('clustering', 'caleydo-clustering-fuzzy', 'phovea_clustering.clustering_fuzzy', {}) + + registry.append('namespace', 'caleydo-clustering', 'phovea_clustering.clustering_api', { + 'namespace': '/api/clustering' + }) + # generator-phovea:end + pass + + +def phovea_config(): + """ + :return: file pointer to config file + """ + from os import path + here = path.abspath(path.dirname(__file__)) + config_file = path.join(here, 'config.json') + return config_file if path.exists(config_file) else None diff --git a/phovea_clustering/clustering_affinity.py b/phovea_clustering/clustering_affinity.py new file mode 100644 index 0000000..b5b3283 --- /dev/null +++ b/phovea_clustering/clustering_affinity.py @@ -0,0 +1,307 @@ +######################################################################################################################## +# libraries + +# module to load own configurations +import phovea_server.config +import numpy as np +from clustering_util import similarity_measurementmatrix +from timeit import default_timer as timer + +# request config if needed for the future +config = phovea_server.config.view('caleydo-clustering') + +__author__ = 'Michael Kern' +__version__ = '0.0.1' +__email__ = 'kernm@in.tum.de' + + +######################################################################################################################## + +class AffinityPropagation: + """ + This is an implementation of the affinity propagation algorithm to cluster genomic data / matrices. + Implementation details: . + Matlab implementation: + Returns the centroids and labels / stratification of each row belonging to one cluster. + """ + + def __init__(self, obs, damping=0.5, factor=1.0, pref_method='minimum', distance='euclidean'): + """ + Initializes the algorithm. 
+ :param obs: genomic data / matrix + :param damping: controls update process to dampen oscillations + :param factor: controls the preference value (influences number of clusters) + :param pref_method: all points are chosen equally with a given preference (median or minimum of similarity matrix) + :return: + """ + self.__n = np.shape(obs)[0] + # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns + # remove all NaNs in data + self.__obs = np.nan_to_num(obs) + # variables influencing output of clustering algorithm + self.__damping = damping + self.__factor = factor + self.__prev_method = pref_method + + # similarity matrix + self.__S = np.zeros((self.__n, self.__n)) + # availability matrix + self.__A = np.zeros((self.__n, self.__n)) + # responsibility matrix + self.__R = np.zeros((self.__n, self.__n)) + + self.min_value = np.finfo(np.float).min + + # self.__mx1 = np.full(self.__n, self.min_value) + # self.__mx2 = np.full(self.__n, self.min_value) + + self.__idx = np.zeros(self.__n) + + # set similarity computation + self.__distance = distance + + self.__compute_similarity() + + # ------------------------------------------------------------------------------------------------------------------ + + def __call__(self): + """ + Caller function for server API. + """ + return self.run() + + # ------------------------------------------------------------------------------------------------------------------ + + def __compute_similarity(self): + """ + Compute the similarity matrix from the original observation matrix and set preference of each element. + :return: _similarity matrix + """ + # compute distance matrix containing the negative sq euclidean distances -|| xi - xj ||**2 + self.__S = -similarity_measurementmatrix(self.__obs, self.__distance) + + # determine the preferences S(k,k) to control the output of clusters + pref = 0 + # could be median or minimum + if self.__prev_method == 'median': + pref = float(np.median(self.__S)) * self.__factor + elif self.__prev_method == 'minimum': + pref = np.min(self.__S) * self.__factor + else: + raise AttributeError + + np.fill_diagonal(self.__S, pref) + + # ------------------------------------------------------------------------------------------------------------------ + + def run(self): + """ + Runs the algorithm of affinity propagation. Conducts at least 100 iterations and checks if the outcome of + current exemplars/clusters has converged. If not, the algorithm will continue until convergence is found + or the maximum number of iterations (200) is reached. 
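# A compact reference sketch of the two message-passing updates the loop below implements
# (responsibilities, then availabilities, cf. Frey & Dueck's affinity propagation), written
# without damping and without the in-place optimisations; S and A are (n, n) arrays as
# initialised above:
import numpy as np

def ap_update(S, A):
    n = S.shape[0]
    rows = np.arange(n)
    # r(i,k) = s(i,k) - max_{k' != k}(a(i,k') + s(i,k'))
    m_as = A + S
    first = m_as.max(axis=1)
    idx = m_as.argmax(axis=1)
    m_as[rows, idx] = -np.inf  # mask each row maximum ...
    r_new = S - first[:, None]
    r_new[rows, idx] = S[rows, idx] - m_as.max(axis=1)  # ... and use the second maximum there
    # a(i,k) = min(0, r(k,k) + sum over i' not in {i,k} of max(0, r(i',k)))
    rp = np.maximum(r_new, 0)
    rp[rows, rows] = r_new[rows, rows]  # keep the self-responsibilities r(k,k)
    a_new = rp.sum(axis=0)[None, :] - rp
    diag = a_new[rows, rows].copy()  # a(k,k) is not capped by min(0, .)
    a_new = np.minimum(a_new, 0)
    a_new[rows, rows] = diag
    return r_new, a_new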
+ :return: + """ + max_iter = 200 + max_conv_iter = 100 + + # sum all decisions for exemplars per round + decision_sum = np.zeros(self.__n) + # collect decisions for one exemplar per iteration round + decision_iter = np.zeros((max_conv_iter, self.__n)) + # counter for decisions (= consider data element as exemplar in each algorithm iteration) + decision_counter = max_conv_iter + # indicates if algorithm has converged + is_converged = False + + centroids = [] + it = 0 + cluster_i = [] + + # helpful variables (that do not need recomputation) + index_diag = np.arange(self.__n) + indices_diag = np.diag_indices_from(self.__R) + new_a = np.zeros((self.__n, self.__n)) + new_r = np.zeros((self.__n, self.__n)) + + for it in range(1, max_iter + 1): + + # ---------------------------------------------------------------------------------------------------------- + + # compute responsibility matrix + m_as = self.__A + self.__S + + max_y = np.max(m_as, axis=1) + index_y = np.argmax(m_as, axis=1) + + # set values of maxima to zero in m_as matrix + m_as[index_diag, index_y] = self.min_value + + # look for second maxima + max_y2 = np.max(m_as, axis=1) + + # perform responsibility update + for ii in range(self.__n): + # s(i, k) - max({ a(i, k') + s(i, k') }) + new_r[ii] = self.__S[ii] - max_y[ii] + + # subtract second maximum from row -> column entry with maximum value + new_r[index_diag, index_y] = self.__S[index_diag, index_y] - max_y2[index_diag] + + # dampen values + # self.__R = self.__damping * self.__R + (1 - self.__damping) * new_r + self.__R *= self.__damping + self.__R += (1 - self.__damping) * new_r + + # ---------------------------------------------------------------------------------------------------------- + + # compute availability matrix + # cut out negative elements + # TODO! 
slow because of copy operation + rp = np.maximum(self.__R, 0) + + # write back all diagonal elements als self representatives + rp[indices_diag] = self.__R[indices_diag] + sum_cols = np.sum(rp, axis=0) + + # apply availability update + new_a[:, ] = sum_cols + new_a -= rp + # for ii in range(self.__n): + # # r(k, k) + sum(max(0, r(i',k)) + # new_a[:, ii] = sum_cols[ii] - Rp[:, ii] + + diag_a = np.diag(new_a) + # take minimum of all the values in A, cut out all values above zero + # new_a = np.minimum(new_a, 0) + new_a[new_a > 0] = 0 + new_a[indices_diag] = diag_a[index_diag] + + # dampen values + # self.__A = self.__damping * self.__A + (1 - self.__damping) * new_a + self.__A *= self.__damping + self.__A += (1 - self.__damping) * new_a + + # ---------------------------------------------------------------------------------------------------------- + + # find exemplars for new clusters + # old version which is slower + # E = self.__R + self.__A + # diag_e = np.diag(E) + + # take the diagonal elements of the create matrix E + diag_e = np.diag(self.__R) + np.diag(self.__A) + + # all elements > 0 are considered to be an appropriate exemplar for the dataset + cluster_i = np.argwhere(diag_e > 0).flatten() + + # count the number of clusters + num_clusters = len(cluster_i) + + # ---------------------------------------------------------------------------------------------------------- + + decision_counter += 1 + if decision_counter >= max_conv_iter: + decision_counter = 0 + + # subtract outcome of previous iteration (< 100) from the total sum of the decisions + decision_sum -= decision_iter[decision_counter] + + decision_iter[decision_counter].fill(0) + decision_iter[decision_counter][cluster_i] = 1 + + # compute sum of decisions for each element being a exemplar + decision_sum += decision_iter[decision_counter] + + # check for convergence + if it >= max_conv_iter or it >= max_iter: + is_converged = True + + for ii in range(self.__n): + # if element is considered to be an exemplar in at least one iterations + # and total of decisions in the last 100 iterations is not 100 --> no convergence + if decision_sum[ii] != 0 and decision_sum[ii] != max_conv_iter: + is_converged = False + break + + if is_converged and num_clusters > 0: + break + + # -------------------------------------------------------------------------------------------------------------- + + # obtain centroids + centroids = self.__obs[cluster_i] + + # find maximum columns in m_as matrix to assign elements to clusters / exemplars + # fill A with negative values + self.__A.fill(self.min_value) + # set values of clusters to zero (as we only want to regard these values + self.__A[:, cluster_i] = 0.0 + # fill diagonal of similarity matrix to zero (remove preferences) + np.fill_diagonal(self.__S, 0.0) + + # compute m_as matrix + m_as = self.__A + self.__S + # since values are < 0, look for the maximum number in each row and return its column index + self.__idx = np.argmax(m_as, axis=1) + + cluster_i = cluster_i.tolist() + cluster_labels = [[] for _ in range(num_clusters)] + + # create labels per cluster + for ii in range(self.__n): + index = cluster_i.index(self.__idx[ii]) + self.__idx[ii] = index + cluster_labels[index].append(ii) + + # return sorted cluster labels (that's why we call compute cluster distances, might be redundant) + # for ii in range(num_clusters): + # cluster_labels[ii], _ = compute_cluster_intern_distances(self.__obs, cluster_labels[ii]) + + # if is_converged: + # print('Algorithm has converged after {} 
iterations'.format(it)) + # else: + # print('Algorithm has not converged after 200 iterations') + # + # print('Number of detected clusters {}'.format(num_clusters)) + # print('Centroids: {}'.format(centroids)) + + return centroids.tolist(), self.__idx.tolist(), cluster_labels + + +######################################################################################################################## + +def _plugin_initialize(): + """ + optional initialization method of this module, will be called once + :return: + """ + pass + + +# ---------------------------------------------------------------------------------------------------------------------- + +def create(data, damping, factor, preference, distance): + """ + by convention contain a factory called create returning the extension implementation + :return: + """ + return AffinityPropagation(data, damping, factor, preference, distance) + + +######################################################################################################################## + +# from timeit import default_timer as timer + +if __name__ == '__main__': + np.random.seed(200) + # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]]) + # data = np.array([np.random.rand(8000) * 4 - 2 for _ in range(500)]) + # data = np.array([[0.9],[1],[1.1],[10],[11],[12],[20],[21],[22]]) + data = np.array([1, 1.1, 5, 8, 5.2, 8.3]) + + s = timer() + aff = AffinityPropagation(data, 0.9, 1.0, 'median', 'euclidean') + result = aff.run() + e = timer() + print(result) + print('time elapsed: {}'.format(e - s)) diff --git a/phovea_clustering/clustering_api.py b/phovea_clustering/clustering_api.py new file mode 100644 index 0000000..4aa1200 --- /dev/null +++ b/phovea_clustering/clustering_api.py @@ -0,0 +1,156 @@ +######################################################################################################################## +# libraries + +# use flask library for server activities +from phovea_server import ns +# load services (executed by the server when the corresponding URL is requested) +from clustering_service import get_cluster_distances, get_clusters_from_dendrogram, load_data, run_affinity_propagation, run_fuzzy, run_hierarchical, run_kmeans + + +__author__ = 'Michael Kern' +__version__ = '0.0.1' +__email__ = 'kernm@in.tum.de' + +# create new flask application for hosting namespace +app = ns.Namespace(__name__) + + +######################################################################################################################## + +@app.route('/kmeans/<k>/<init_method>/<distance>/<dataset_id>') +def kmeans_clustering(k, init_method, distance, dataset_id): + """ + Access k-means clustering plugin. + :param k: number of clusters + :param init_method: initialization method for initial clusters + :param distance: distance measurement + :param dataset_id: identifier of data set + :return: jsonified output + """ + try: + data = load_data(dataset_id) + response = run_kmeans(data, int(k), init_method, distance) + return ns.jsonify(response) + except: + return ns.jsonify({}) + + +######################################################################################################################## + +@app.route('/hierarchical/<k>/<method>/<distance>/<dataset_id>') +def hierarchical_clustering(k, method, distance, dataset_id): + """ + Access hierarchical clustering plugin.
+ :param k: number of desired clusters + :param method: linkage method (single, complete, weighted, median, average or centroid) + :param distance: distance measurement + :param dataset_id: identifier of data set + :return: jsonified output + """ + try: + data = load_data(dataset_id) + response = run_hierarchical(data, int(k), method, distance) + return ns.jsonify(response) + except: + return ns.jsonify({}) + + +######################################################################################################################## + +@app.route('/affinity/<damping>/<factor>/<preference>/<distance>/<dataset_id>') +def affinity_propagation_clustering(damping, factor, preference, distance, dataset_id): + """ + Access affinity propagation clustering plugin. + :param damping: controls the update process to dampen oscillations + :param factor: scales the preference value (influences the number of clusters) + :param preference: preference method ('median' or 'minimum') + :param distance: distance measurement + :param dataset_id: identifier of data set + :return: jsonified output + """ + try: + data = load_data(dataset_id) + response = run_affinity_propagation(data, float(damping), float(factor), preference, distance) + return ns.jsonify(response) + except: + return ns.jsonify({}) + + +######################################################################################################################## + +@app.route('/fuzzy/<num_clusters>/<m>/<threshold>/<distance>/<dataset_id>') +def fuzzy_clustering(num_clusters, m, threshold, distance, dataset_id): + """ + Access fuzzy c-means clustering plugin. + :param num_clusters: number of clusters + :param m: fuzzifier, controls the degree of fuzziness + :param threshold: minimum degree of membership required for a cluster assignment + :param distance: distance measurement + :param dataset_id: identifier of data set + :return: jsonified output + """ + try: + data = load_data(dataset_id) + response = run_fuzzy(data, int(num_clusters), float(m), float(threshold), distance) + return ns.jsonify(response) + except: + return ns.jsonify({}) + + +######################################################################################################################## + +def load_attribute(json_data, attr): + import json + data = json.loads(json_data) + if attr in data: + return data[attr] + else: + return None + + +######################################################################################################################## + +@app.route('/distances/<metric>/<dataset_id>/<sorted>', methods=['POST']) +def get_distances(metric, dataset_id, sorted): + """ + Compute the distances of the rows in the current stratification to their cluster centroid. + :param metric: distance metric + :param dataset_id: identifier of data set + :param sorted: whether to sort the distances in ascending order + :return: distances and labels sorted in ascending order + """ + data = load_data(dataset_id) + labels = [] + extern_labels = None + + if 'group' in ns.request.values: + labels = load_attribute(ns.request.values['group'], 'labels') + extern_labels = load_attribute(ns.request.values['group'], 'externLabels') + else: + return '' + + response = get_cluster_distances(data, labels, metric, extern_labels, sorted) + return ns.jsonify(response) + + +######################################################################################################################## + +@app.route('/dendrogram/<num_clusters>/<dataset_id>', methods=['POST']) +def dendrogram_clusters(num_clusters, dataset_id): + data = load_data(dataset_id) + + if 'group' in ns.request.values: + dendrogram = load_attribute(ns.request.values['group'], 'dendrogram') + else: + return '' + + response = get_clusters_from_dendrogram(data, dendrogram, int(num_clusters)) + return ns.jsonify(response) + + +######################################################################################################################## + +def create(): + """ + Standard Caleydo convention for creating the service when the server is initialized.
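# A hypothetical client-side call; host, port and the dataset id are placeholders, and the
# '/api/clustering' prefix is the namespace registered in phovea_clustering/__init__.py:
import requests

reply = requests.get('http://localhost:9000/api/clustering/kmeans/3/kmeans++/euclidean/my-dataset-id')
print(reply.json())  # e.g. {'centroids': [...], 'clusterLabels': [...]}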
+ :return: Returns implementation of this plugin with given name + """ + return app diff --git a/phovea_clustering/clustering_fuzzy.py b/phovea_clustering/clustering_fuzzy.py new file mode 100644 index 0000000..8b95f59 --- /dev/null +++ b/phovea_clustering/clustering_fuzzy.py @@ -0,0 +1,225 @@ +######################################################################################################################## +# libraries + +# module to load own configurations +import phovea_server.config +# library to conduct matrix/vector calculus +import numpy as np +from clustering_util import similarity_measurement + +# request config if needed for the future +config = phovea_server.config.view('caleydo-clustering') + +__author__ = 'Michael Kern' +__version__ = '0.0.3' +__email__ = 'kernm@in.tum.de' + + +######################################################################################################################## +# class definition + +class Fuzzy(object): + """ + Formulas: https://en.wikipedia.org/wiki/Fuzzy_clustering + """ + + def __init__(self, obs, num_clusters, m=2.0, threshold=-1, distance='euclidean', init=None, error=0.0001): + """ + Initializes algorithm. + :param obs: observation matrix / genomic data + :param num_clusters: number of clusters + :param m: fuzzifier, controls degree of fuzziness, from [1; inf] + :return: + """ + # observation + self.__obs = np.nan_to_num(obs) + + self.__n = obs.shape[0] + + # fuzzifier value + self.__m = np.float(m) + # number of clusters + self.__c = num_clusters + + # matrix u containing all the weights describing the degree of membership of each patient to the centroid + if init is None: + init = np.random.rand(self.__c, self.__n) + + self.__u = np.copy(init) + + # TODO! scikit normalizes the values at the beginning and at each step to [0; 1] + self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))).astype(np.float64) + # remove all zero values and set them to smallest possible value + self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) + # centroids + self.__centroids = np.zeros(self.__c) + # threshold for stopping criterion + self.__error = error + # distance function + self.__distance = distance + + # threshold or minimum probability used for cluster assignments + if threshold == -1: + self.__threshold = 1.0 / num_clusters + else: + self.__threshold = threshold + + # ------------------------------------------------------------------------------------------------------------------ + + def __call__(self): + """ + Caller function for server API + :return: + """ + return self.run() + + # ------------------------------------------------------------------------------------------------------------------ + + def compute_centroid(self): + """ + Compute the new centroids using the computed partition matrix. 
+ :return: + """ + u_m = self.__u ** self.__m + + sum_data_weights = np.dot(u_m, self.__obs) + if self.__obs.ndim == 1: + m = 1 + else: + m = self.__obs.shape[1] + + sum_weights = np.sum(u_m, axis=1) + # tile array (sum of weights repeated in every row) + sum_weights = np.ones((m, 1)).dot(np.atleast_2d(sum_weights)).T + + if self.__obs.ndim == 1: + sum_weights = sum_weights.flatten() + + # divide by total sum to get new centroids + self.__centroids = sum_data_weights / sum_weights + + # ------------------------------------------------------------------------------------------------------------------ + + def compute_coefficients(self): + """ + Compute new partition matrix / weights describing the degree of membership of each patient to all clusters. + :return: + """ + + # TODO you can also use cdist of scipy.spatial.distance module + dist_mat = np.zeros((self.__c, self.__n)) + + for ii in range(self.__c): + dist_mat[ii] = similarity_measurement(self.__obs, self.__centroids[ii], self.__distance) + + # set zero values to smallest values to prevent inf results + dist_mat = np.fmax(dist_mat, np.finfo(np.float64).eps) + + # apply coefficient formula + denom = np.float(self.__m - 1.0) + self.__u = dist_mat ** (-2.0 / denom) + + sum_coeffs = np.sum(self.__u, axis=0) + + self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(sum_coeffs)) + self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) + + # ------------------------------------------------------------------------------------------------------------------ + + def run(self): + """ + Perform the c-means fuzzy clustering. + :return: + """ + max_iter = 100 + iter = 0 + + while iter < max_iter: + # save last partition matrix + u_old = np.copy(self.__u) + # compute centroids with given weights + self.compute_centroid() + # compute new coefficient matrix + self.compute_coefficients() + + # normalize weight / partition matrix u + self.__u /= np.ones((self.__c, 1)).dot(np.atleast_2d(np.sum(self.__u, axis=0))) + self.__u = np.fmax(self.__u, np.finfo(np.float64).eps) + + # compute the difference between the old and new matrix + epsilon = np.linalg.norm(self.__u - u_old) + + # stop if difference (epsilon) is smaller than the user-defined threshold + if epsilon < self.__error: + break + + iter += 1 + + self.__end() + + u = self.__u.T + # print(self.__u.T) + + return self.__centroids.tolist(), self.__cluster_labels, u.tolist(), self.__threshold + + # ------------------------------------------------------------------------------------------------------------------ + + def __end(self): + """ + Conduct the cluster assignments and creates cluster_label array. 
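# A hedged numpy sketch of the two fuzzy c-means updates driven by run() above (weighted
# centroids, then recomputed memberships); X is an assumed (n, d) observation matrix and U the
# (c, n) partition matrix:
import numpy as np

def fcm_step(X, U, m=2.0):
    u_m = U ** m
    centroids = u_m.dot(X) / u_m.sum(axis=1)[:, None]  # weighted cluster means
    # euclidean distance of every observation to every centroid -> (c, n)
    d = np.sqrt(((X[None, :, :] - centroids[:, None, :]) ** 2).sum(axis=2))
    d = np.fmax(d, np.finfo(np.float64).eps)  # avoid division by zero
    u_new = d ** (-2.0 / (m - 1.0))
    return centroids, u_new / u_new.sum(axis=0)  # normalise each column to 1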
+ :return: + """ + # assign patients to clusters + # transpose to get a (n, c) matrix + u = self.__u.T + + self.__labels = np.zeros(self.__n, dtype=np.int) + self.__cluster_labels = [[] for _ in range(self.__c)] + # gather all probabilities / degrees of membership of each patient to the clusters + # self.__cluster_probs = [[] for _ in range(self.__c)] + # minimum probability required to assign a patient to a cluster + max_prob = np.float64(self.__threshold) + + for ii in range(self.__n): + # cluster_iD = np.argmax(u[ii]) + # self.__labels = cluster_iD + # self.__cluster_labels[cluster_iD].append(ii) + + for jj in range(self.__c): + if u[ii][jj] >= max_prob: + cluster_id = jj + self.__labels[ii] = cluster_id + self.__cluster_labels[cluster_id].append(int(ii)) + + # for ii in range(self.__c): + # self.__cluster_labels[ii], _ = compute_cluster_intern_distances(self.__obs, self.__cluster_labels[ii]) + + +######################################################################################################################## + +def _plugin_initialize(): + """ + optional initialization method of this module, will be called once + :return: + """ + pass + + +# ---------------------------------------------------------------------------------------------------------------------- + +def create(data, num_cluster, m, threshold, distance): + """ + by convention contain a factory called create returning the extension implementation + :return: + """ + return Fuzzy(data, num_cluster, m, threshold, distance) + + +######################################################################################################################## + +if __name__ == '__main__': + data = np.array([[1, 1, 2], [5, 4, 5], [3, 2, 2], [8, 8, 7], [9, 8, 9], [2, 2, 2]]) + # data = np.array([1,1.1,5,8,5.2,8.3]) + + fuz = Fuzzy(data, 3, 1.5) + print(fuz.run()) diff --git a/phovea_clustering/clustering_hierarchical.py b/phovea_clustering/clustering_hierarchical.py new file mode 100644 index 0000000..668d77b --- /dev/null +++ b/phovea_clustering/clustering_hierarchical.py @@ -0,0 +1,467 @@ +######################################################################################################################## +# libraries + +# module to load own configurations +import phovea_server.config + + +# library to conduct matrix/vector calculus +import numpy as np +# fastest distance computation by scipy +# import scipy.spatial as spt + +# utility functions for clustering and creating the dendrogram trees +from clustering_util import BinaryNode, BinaryTree +from clustering_util import similarity_measurement_matrix +from clustering_util import compute_cluster_intern_distances +from clustering_util import cut_json_tree_by_clusters + +__author__ = 'Michael Kern' +__version__ = '0.0.3' +__email__ = 'kernm@in.tum.de' +# request config if needed for the future +config = phovea_server.config.view('caleydo-clustering') + + +######################################################################################################################## + +class Hierarchical(object): + """ + This is an implementation of hierarchical clustering on genomic data using the Lance-Williams dissimilarity update + to compute different distance metrics (single linkage, complete linkage, ...).
+ Lance-Williams explained in: http://arxiv.org/pdf/1105.0121.pdf + """ + + def __init__(self, obs, method='single', distance='euclidean'): + """ + Initializes the algorithm + :param obs: genomic data / matrix + :param method: linkage method + :return: + """ + # genomic data / matrix + # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns + # remove all NaNs in data + self.__obs = np.nan_to_num(obs) + + num_genes = np.shape(self.__obs)[0] + self.__n = num_genes + + # check if dimension is 2D + # if self.__obs.ndim == 2: + # # obtain number of observations (rows) + # num_genes, _ = np.shape(self.__obs) + # self.__n = num_genes + + # else: + # print("[Error]:\tdata / observations must be 2D. 1D observation arrays are not supported") + # raise Attribute_error + + # distance measurement + self.__distance = distance + + # distance / proximity matrix + self.__d = [] + self.__compute_proximity_matrix() + # dictionary mapping the string id (i,j,k,...) of clusters to corresponding index in matrix + self.__id_map = {} + # inverse mapping of id_map --> returns the string id given a certain index + self.__key_map = {} + # contains actual index of all clusters, old clusters are from [0, n - 1], new clusters have indices in range + # [n, 2n - 1] + self.__cluster_map = {} + for ii in range(self.__n): + self.__id_map[str(ii)] = ii + self.__key_map[ii] = str(ii) + self.__cluster_map[str(ii)] = ii + + # linkage method for hierarchical clustering + self.__method = method + + # internal dendrogram tree + self.__tree = None + + # ------------------------------------------------------------------------------------------------------------------ + + def __call__(self): + """ + Caller function for server API + :return: + """ + return self.run() + + # ------------------------------------------------------------------------------------------------------------------ + + @property + def tree(self): + return self.__tree + + # ------------------------------------------------------------------------------------------------------------------ + + def __get_coefficients(self, cluster_i, cluster_j): + """ + Compute the coefficients for the Lance-Williams algorithm + :param cluster_i: + :param cluster_j: + :return: + """ + # TODO! use hash map for storing numbers instead of computing them every time + if self.__method == 'single': + return 0.5, 0.5, 0, -0.5 + elif self.__method == 'complete': + return 0.5, 0.5, 0, 0.5 + elif self.__method == 'weighted': + return 0.5, 0.5, 0, 0 + elif self.__method == 'median': + return 0.5, 0.5, -0.25, 0 + + # TODO! ATTENTION! average method should compute the cluster centroids using the average + # TODO! || cluster_i - cluster_j || ** 2 + elif self.__method == 'average': + n_i = np.float(cluster_i.count(',') + 1) + n_j = np.float(cluster_j.count(',') + 1) + sum_n = n_i + n_j + return (n_i / sum_n), (n_j / sum_n), 0, 0 + + # TODO! ATTENTION! centroid method should compute the cluster centroids using the mean + # TODO! || cluster_i - cluster_j || ** 2 + elif self.__method == 'centroid': + n_i = np.float(cluster_i.count(',') + 1) + n_j = np.float(cluster_j.count(',') + 1) + sum_n = n_i + n_j + return (n_i / sum_n), (n_j / sum_n), -((n_i * n_j) / (sum_n ** 2)), 0 + + # TODO! Support ward method + # TODO! 
(|cluster_i| * |cluster_j|) / (|cluster_i| + |cluster_j) * || cluster_i - cluster_j || ** 2 + # elif self.__method == 'ward': + # n_i = np.float(cluster_i.count(',') + 1) + # n_j = np.float(cluster_j.count(',') + 1) + # n_k = np.float(cluster_k.count(',') + 1) + # sum_n = n_i + n_j + n_k + # return (n_i + n_k) / sum_n, (n_j + n_k) / sum_n, -n_k / sum_n, 0 + else: + raise AttributeError + + # ------------------------------------------------------------------------------------------------------------------ + + def __compute_proximity_matrix(self): + """ + Compute the proximity of each observation and store the results in a nxn matrix + :return: + """ + + # create distance matrix of size n x n + self.__d = np.zeros((self.__n, self.__n)) + + # compute euclidean distance + # TODO! implement generic distance functions + # TODO! look for an alternative proximity analysis without computing all distances + self.__d = similarity_measurement_matrix(self.__obs, self.__distance) + + # get number of maximum value of float + self.__max_value = self.__d.max() + 1 + + # fill diagonals with max value to exclude them from min dist process + # TODO! operate only on upper triangle matrix of distance matrix + np.fill_diagonal(self.__d, self.__max_value) + + # print('\t-> finished.') + + # ------------------------------------------------------------------------------------------------------------------ + + def __get_matrix_minimum_indices(self): + """ + Searches for the minimum distance in the distance matrix + :return: indices of both clusters having the smallest distance + """ + min_dist = self.__d.min() + min_list = np.argwhere(self.__d == min_dist) + + min_i, min_j = 0, 0 + + # look for indices, where i < j + # TODO! for the future --> use upper triangle matrix + for ii in range(len(min_list)): + min_i, min_j = min_list[ii] + if min_i < min_j: + break + + if min_i == min_j: + print("ERROR") + + return self.__key_map[min_i], self.__key_map[min_j], min_dist + + # ------------------------------------------------------------------------------------------------------------------ + + def __delete_clusters(self, i, j): + """ + Reorders and reduces the matrix to insert the new cluster formed of cluster i and j + and its distance values, and removes the old clusters by cutting the last row. 
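# The Lance-Williams blend applied by __merge_clusters below combines the old distances d(k,i),
# d(k,j) and d(i,j) linearly; a small sketch showing that the 'single' and 'complete' coefficient
# sets from __get_coefficients reduce to the expected minimum and maximum:
import numpy as np

def lance_williams(dki, dkj, dij, ai, aj, b, y):
    return ai * dki + aj * dkj + b * dij + y * np.abs(dki - dkj)

dki, dkj = 3.0, 5.0
assert lance_williams(dki, dkj, 0.0, 0.5, 0.5, 0.0, -0.5) == min(dki, dkj)  # 'single'
assert lance_williams(dki, dkj, 0.0, 0.5, 0.5, 0.0, 0.5) == max(dki, dkj)  # 'complete'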
+  def __delete_clusters(self, i, j):
+    """
+    Reorders and reduces the matrix to insert the new cluster formed of cluster i and j
+    and its distance values, and removes the old clusters by cutting the last row.
+    :param i: cluster index i
+    :param j: cluster index j
+    :return:
+    """
+    id_i = self.__id_map[str(i)]
+    id_j = self.__id_map[str(j)]
+
+    # min_id = min(id_i, id_j)
+    max_id = max(id_i, id_j)
+
+    # overwrite row and column max_id with the last row / column (swap)
+    last_row = self.__d[self.__n - 1]
+    self.__d[max_id] = last_row
+    self.__d[:, max_id] = self.__d[:, (self.__n - 1)]
+
+    # the key of the last cluster now points to the slot with index max_id
+    key = self.__key_map[self.__n - 1]
+    self.__id_map[key] = max_id
+    self.__key_map[max_id] = key
+
+    # delete entries in id and key map --> not required anymore
+    try:
+      del self.__id_map[i]
+      del self.__id_map[j]
+      del self.__key_map[self.__n - 1]
+    except KeyError:
+      print("\nERROR: Key {} not found in id_map".format(j))
+      print("ERROR: Previous key: {} in id_map".format(i))
+      print("Given keys: ")
+      for key in self.__id_map:
+        print(key)
+      return
+
+    # reduce dimension of matrix by one column and row
+    self.__n -= 1
+    self.__d = self.__d[:-1, :-1]
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __merge_clusters(self, i, j):
+    """
+    Merges cluster i and j, computes the new ID and distances of the newly formed cluster
+    and stores required information
+    :param i: cluster index i
+    :param j: cluster index j
+    :return: number of elements in the merged cluster
+    """
+    id_i = self.__id_map[str(i)]
+    id_j = self.__id_map[str(j)]
+
+    min_id = min(id_i, id_j)
+    max_id = max(id_i, id_j)
+
+    # use the Lance-Williams formula to compute the new linkages (coefficients come from __get_coefficients)
+    dki = self.__d[:, min_id]
+    dkj = self.__d[:, max_id]
+    dij = self.__d[min_id, max_id]
+    # the |d(k, i) - d(k, j)| term of the update
+    abs_diff = np.abs(dki - dkj)
+
+    # compute coefficients
+    ai, aj, b, y = self.__get_coefficients(i, j)
+
+    new_entries = ai * dki + aj * dkj + b * dij + y * abs_diff
+    new_entries[min_id] = self.__max_value
+    new_entries[max_id] = self.__max_value
+
+    # write the merged cluster's distances into row and column min_id
+    self.__d[min_id] = new_entries
+    self.__d[:, min_id] = new_entries
+
+    id_ij = min_id
+    new_key = i + ',' + j
+    self.__id_map[new_key] = id_ij
+    self.__key_map[id_ij] = new_key
+    self.__cluster_map[new_key] = len(self.__cluster_map)
+
+    # delete old clusters
+    self.__delete_clusters(i, j)
+
+    # count number of elements
+    return new_key.count(',') + 1
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def run(self):
+    """
+    Conducts the algorithm until there's only one cluster left.
+    :return: linkage matrix as a nested list
+    """
+
+    # number of the current iteration
+    m = 0
+
+    # resulting matrix containing information Z[i,x], x=0: cluster i, x=1: cluster j, x=2: dist(i,j), x=3: num(i,j)
+    runs = self.__n - 1
+    z = np.array([[0 for _ in range(4)] for _ in range(runs)], dtype=np.float)
+
+    while m < runs:
+      m += 1
+
+      i, j, dist_ij = self.__get_matrix_minimum_indices()
+      num_ij = self.__merge_clusters(i, j)
+
+      cluster_i, cluster_j = self.__cluster_map[i], self.__cluster_map[j]
+      z[m - 1] = [int(min(cluster_i, cluster_j)), int(max(cluster_i, cluster_j)), np.float(dist_ij), int(num_ij)]
+
+    # reset number n to length of first dimension (number of genes)
+    self.__n = np.shape(self.__obs)[0]
+
+    self.__tree = self.generate_tree(z)
+    return z.tolist()
+
+  # ------------------------------------------------------------------------------------------------------------------
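+  # The linkage matrix z returned by run() essentially follows the scipy
+  # convention: each row is [cluster_i, cluster_j, dist(i, j), num(i, j)],
+  # with the original observations occupying the ids [0, n - 1] and merged
+  # clusters the ids starting at n.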
+  def generate_tree(self, linkage_matrix):
+    """
+    Computes the dendrogram tree for a given linkage matrix.
+    :param linkage_matrix: linkage matrix as produced by run()
+    :return: dendrogram as a BinaryTree
+    """
+    self.__tree = None
+
+    tree_map = {}
+    num_trees = len(linkage_matrix)
+
+    for ii in range(num_trees):
+      entry = linkage_matrix[ii]
+      current_id = self.__n + ii
+      # entry = [min cluster id, max cluster id, distance, num elements];
+      # the element count entry[3] is not needed to build the tree
+      left_index, right_index, value = int(entry[1]), int(entry[0]), entry[2]
+      left = right = None
+
+      if left_index < self.__n:
+        left = BinaryNode(self.__obs[left_index].tolist(), left_index, 1, None, None)
+      else:
+        left = tree_map[left_index]
+
+      if right_index < self.__n:
+        right = BinaryNode(self.__obs[right_index].tolist(), right_index, 1, None, None)
+      else:
+        right = tree_map[right_index]
+
+      if isinstance(left, BinaryNode) and isinstance(right, BinaryNode):
+        tree_map[current_id] = BinaryTree(left, right, current_id, value)
+      elif isinstance(left, BinaryNode):
+        tree_map[current_id] = right.add_node(left, current_id, value)
+        del tree_map[right_index]
+      elif isinstance(right, BinaryNode):
+        tree_map[current_id] = left.add_node(right, current_id, value)
+        del tree_map[left_index]
+      else:
+        tree_map[current_id] = left.merge(right, current_id, value)
+        del tree_map[right_index]
+        del tree_map[left_index]
+
+    self.__tree = tree_map[num_trees + self.__n - 1]
+    return self.__tree
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+
+########################################################################################################################
+
+
+def get_clusters(k, obs, dendrogram, sorted=True):
+  """
+  Cuts the dendrogram tree into k clusters by choosing the nodes having the greatest node values,
+  i.e. the largest distances to the other node / potential cluster.
+  :param k: number of desired clusters
+  :param obs: set of observations
+  :param dendrogram: dendrogram tree
+  :param sorted: sort each cluster's labels by their distance to the cluster centroid
+  :return: centroids, sorted cluster labels and normal label list
+  """
+  obs = np.nan_to_num(obs)
+  n = obs.shape[0]
+
+  if isinstance(dendrogram, BinaryTree):
+    cluster_labels = dendrogram.cut_tree_by_clusters(k)
+  else:
+    cluster_labels = cut_json_tree_by_clusters(dendrogram, k)
+
+  cluster_centroids = []
+  labels = np.zeros(n, dtype=np.int)
+  cluster_id = 0
+
+  for ii in range(len(cluster_labels)):
+    cluster = cluster_labels[ii]
+    sub_obs = obs[cluster]
+    cluster_centroids.append(np.mean(sub_obs, axis=0).tolist())
+
+    for index in cluster:
+      labels[index] = cluster_id
+
+    # sort labels according to their distance
+    if sorted:
+      cluster_labels[ii], _ = compute_cluster_intern_distances(obs, cluster)
+
+    cluster_id += 1
+
+  return cluster_centroids, cluster_labels, labels.tolist()
+
+
+########################################################################################################################
+
+def _plugin_initialize():
+  """
+  optional initialization method of this module, will be called once
+  :return:
+  """
+  pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, method, distance):
+  """
+  By convention, this module contains a factory called create returning the extension implementation
+  :return:
+  """
+  return Hierarchical(data, method, distance)
+
+
+########################################################################################################################
+def _main():
+  from timeit import default_timer as timer
+  # from scipy.cluster.hierarchy import linkage, leaves_list
+
+  np.random.seed(200)
+  # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
+  data = np.array([1, 1.1, 5, 8, 5.2, 8.3])
+
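+  # Benchmark sketch: average this implementation's runtime over n runs; the
+  # scipy comparison (linkage / leaves_list) is commented out above and below,
+  # so time_theirs currently only measures an empty timer interval.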
+  time_mine = 0
+  time_theirs = 0
+
+  n = 10
+
+  for i in range(n):
+    # data = np.array([np.random.rand(6000) * 4 - 2 for _ in range(249)])
+    # import time
+    s1 = timer()
+    hier = Hierarchical(data, 'complete')
+    # s = time.time()
+    linkage_matrix = hier.run()
+    e1 = timer()
+    print(linkage_matrix)
+    tree = hier.generate_tree(linkage_matrix)
+    # print(tree.get_leaves())
+    # print(tree.jsonify())
+    # print(hier.get_clusters(3))
+    import json
+
+    json_tree = json.loads(tree.jsonify())
+    get_clusters(3, data, json_tree)
+
+    s2 = timer()
+    # linkage_matrix2 = linkage(data, 'complete')
+    # print(leaves_list(linkage_matrix2))
+    e2 = timer()
+
+    time_mine += e1 - s1
+    time_theirs += e2 - s2
+
+  # print(linkage_matrix)
+  # print(linkage_matrix2)
+  print('mine: {}'.format(time_mine / n))
+  print('theirs: {}'.format(time_theirs / n))
+
+if __name__ == '__main__':
+  _main()
diff --git a/phovea_clustering/clustering_kmeans.py b/phovea_clustering/clustering_kmeans.py
new file mode 100644
index 0000000..8cc53b0
--- /dev/null
+++ b/phovea_clustering/clustering_kmeans.py
@@ -0,0 +1,406 @@
+########################################################################################################################
+# libraries
+
+# module to load own configurations
+import phovea_server.config
+
+# numpy important to conduct matrix/vector calculus
+import numpy as np
+# creates random numbers
+import random
+
+# contains utility functions
+from clustering_util import weighted_choice, similarity_measurement
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.2'
+__email__ = 'kernm@in.tum.de'
+
+# request config if needed in the future
+config = phovea_server.config.view('caleydo-clustering')
+
+
+########################################################################################################################
+
+class KMeans:
+  """
+  This is an implementation of the k-means algorithm to cluster genomic data / matrices.
+  It returns the centroids, the labels / stratification of each row belonging to one cluster,
+  and the label lists per cluster (the cluster-cluster distance matrix and the row-centroid
+  distance arrays are currently commented out).
+  Implementation detail:
+  """
+
+  def __init__(self, obs, k, init_mode='kmeans++', distance='sqeuclidean', iters=1000):
+    """
+    Initializes the algorithm with observation, number of k clusters, the initial method and
+    the maximum number of iterations.
+    Initialization method of the random cluster choice can be: forgy, uniform, random, kmeans++
+    :param obs: genomic data / matrix
+    :param k: number of clusters
+    :param init_mode: initialization method
+    :param distance: distance measurement
+    :param iters: number of maximum iterations
+    :return:
+    """
+
+    # number of clusters
+    self.__k = k
+    # observations, can be 1D array or 2D matrix with genes as rows and conditions as columns
+    # remove all NaNs in data
+    self.__obs = np.nan_to_num(obs)
+    # number of observations / genes
+    self.__n = np.shape(obs)[0]
+    # maps the element ids to clusters
+    self.__label_map = np.zeros(self.__n, dtype=np.int)
+    # cluster means and number of elements
+    self.__cluster_means = np.array([obs[0] for _ in range(k)], dtype=np.float)
+    self.__cluster_nums = np.array([0 for _ in range(k)], dtype=np.int)
+    # tells if any cluster has changed or rather if any data item was moved
+    self.__changed = True
+    # number of iterations
+    self.__iters = iters
+    # initialization method
+    self.__init_mode = init_mode
+    # compare function
+    self.__distance = distance
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __call__(self):
+    """
+    Caller function for server API.
+    """
+    return self.run()
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __init(self):
+    """
+    Initialize clustering with random clusters using a user-specified method
+    :return:
+    """
+    # TODO! consider to init k-Means algorithm with Principal Component Analysis (PCA)
+    # TODO! see
+    # init cluster
+    if self.__init_mode == 'forgy':
+      self.__forgy_method()
+    elif self.__init_mode == 'uniform':
+      self.__uniform_method()
+    elif self.__init_mode == 'random':
+      self.__random_method()
+    elif self.__init_mode == 'kmeans++':
+      self.__plusplus_method()
+    else:
+      raise AttributeError('unknown initialization mode: ' + str(self.__init_mode))
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __forgy_method(self):
+    """
+    Initialization method:
+    Randomly choose k observations from the data using a uniform random distribution.
+    :return:
+    """
+    for ii in range(self.__k):
+      self.__cluster_means[ii] = (self.__obs[random.randint(0, self.__n - 1)])
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __uniform_method(self):
+    """
+    Initialization method:
+    Randomly assign each observation to one of the k clusters using a uniform random distribution
+    and compute the centroids of each cluster.
+    :return:
+    """
+    for i in range(self.__n):
+      self.__label_map[i] = random.randint(0, self.__k - 1)
+
+    self.__update()
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __random_method(self):
+    """
+    Initialization method:
+    Randomly choose k observations by estimating the mean and standard deviation of the data and
+    drawing from the corresponding Gaussian distribution.
+    :return:
+    """
+    mean = np.mean(self.__obs, axis=0)
+    std = np.std(self.__obs, axis=0)
+
+    for ii in range(self.__k):
+      self.__cluster_means[ii] = np.random.normal(mean, std)
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __plusplus_method(self):
+    """
+    Initialization method:
+    Chooses k observations by computing probabilities for each observation and using a weighted random distribution.
+    Algorithm: . This method should accelerate the algorithm by finding
+    the appropriate clusters right at the beginning and hence should make it more robust.
+    :return:
+    """
+    # 1) choose a random center out of the data
+    self.__cluster_means[0] = (random.choice(self.__obs))
+
+    max_value = np.max(self.__obs) + 1
+    probs = np.array([max_value for _ in range(self.__n)])
+
+    for i in range(1, self.__k):
+      probs.fill(max_value)
+      # 2) compute new probabilities, choose min of all distances
+      for j in range(0, i):
+        dists = similarity_measurement(self.__obs, self.__cluster_means[j], self.__distance)
+        # collect minimum squared distances to cluster centroids
+        probs = np.minimum(probs, dists)
+
+      # sum all squared distances
+      sum_probs = np.float(np.sum(probs))
+
+      if sum_probs != 0:
+        probs /= sum_probs
+        # 3) choose a new center based on the probabilities
+        self.__cluster_means[i] = (self.__obs[weighted_choice(probs)])
+      else:
+        print('ERROR: cannot find enough cluster centroids for given k = ' + str(self.__k))
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def get_cluster_mean(self, num):
+    """
+    Returns the centroid of the cluster with index num.
+    :param num: cluster index
+    :return: cluster centroid, or None if num is out of range
+    """
+    if num >= self.__k:
+      return None
+    else:
+      return self.__cluster_means[num]
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def get_cluster_of_element(self, index):
+    """
+    :param index: number of element in observation array
+    :return: cluster id of observation with given index, or None if the index is out of range.
+    """
+    if index >= self.__n:
+      return None
+    else:
+      return self.__label_map[index]
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def print_clusters(self):
+    """
+    Print the cluster centroids and the labels.
+    Note: __centroids and __labels are first written in __end(), i.e. after run() has finished.
+    :return:
+    """
+    print('Centroids: ' + str(self.__centroids) + ' | labels: ' + str(self.__labels))
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __assignment(self):
+    """
+    Assignment step:
+    Compute the distance of each observation to each cluster centroid and move the gene to the nearest cluster.
+    :return:
+    """
+    for i in range(self.__n):
+      value = self.__obs[i]
+
+      # compute squared distances to each mean
+      dists = similarity_measurement(self.__cluster_means, value, self.__distance)
+      # nearest cluster
+      nearest_id = np.argmin(dists)
+
+      if self.__label_map[i] != nearest_id:
+        self.__changed = True
+        self.__label_map[i] = nearest_id
+
+  # ------------------------------------------------------------------------------------------------------------------
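+  # run() alternates the two steps below (Lloyd's algorithm): __assignment()
+  # moves every observation to its nearest centroid, __update() recomputes
+  # the centroids, until no observation changes its cluster anymore or the
+  # iteration limit is reached.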
+  def __update(self):
+    """
+    Update step:
+    Compute the new centroids of each cluster after the assignment.
+    :return:
+    """
+    self.__cluster_means.fill(0)
+    self.__cluster_nums.fill(0)
+
+    self.__cluster_labels = [[] for _ in range(self.__k)]
+
+    for ii in range(self.__n):
+      cluster_id = self.__label_map[ii]
+      self.__cluster_labels[cluster_id].append(ii)
+      self.__cluster_nums[cluster_id] += 1
+
+    for ii in range(self.__k):
+      self.__cluster_means[ii] = np.mean(self.__obs[self.__cluster_labels[ii]], axis=0)
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __end(self):
+    """
+    Writes the results to the corresponding member variables.
+    :return:
+    """
+    # returned values | have to be reinitialized in case of sequential runs
+    # centroids
+    self.__centroids = np.array([self.__obs[0] for _ in range(self.__k)], dtype=np.float)
+    # labels of observations
+    self.__labels = np.array([0 for _ in range(self.__n)], dtype=np.int)
+    # distances between centroids
+    # self.__centroid_dist_mat = np.zeros((self.__k, self.__k))
+
+    # no OrderedDict is used here, so fill the label array manually
+    for index in range(self.__n):
+      cluster_id = self.__label_map[index]
+      self.__labels[index] = cluster_id
+
+    # collect centroids
+    for ii in range(self.__k):
+      # self.__centroids.append(self.__cluster_means[ii].tolist())
+      self.__centroids[ii] = self.__cluster_means[ii]
+
+    # compute distances between the centroids
+    # for ii in range(self.__k - 1):
+    #   # compute indices of other clusters
+    #   jj = range(ii + 1, self.__k)
+    #   # select matrix of cluster centroids
+    #   centroid_mat = self.__centroids[jj]
+    #   distances = np.sqrt(self.__compare(centroid_mat, self.__centroids[ii]))
+    #   self.__centroid_dist_mat[ii, jj] = distances
+    #   self.__centroid_dist_mat[jj, ii] = distances
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def run(self):
+    """
+    Runs the k-means algorithm, using the initialization method and the assignment/update steps.
+    Conducts at most iters iterations and terminates if this number is exceeded or no observation
+    was moved to another cluster.
+    :return: centroids, labels and the label lists per cluster
+    """
+    # 1) init algorithm by choosing cluster centroids
+    self.__init()
+
+    max_iters = self.__iters
+    counter = 0
+    # 2) run clustering
+    while self.__changed and counter < max_iters:
+      self.__changed = False
+
+      self.__assignment()
+      self.__update()
+
+      counter += 1
+
+    self.num_iters = counter
+
+    # write results to the class members
+    self.__end()
+    return self.__centroids.tolist(), self.__labels.tolist(), self.__cluster_labels
+    # , self.__centroid_dist_mat.tolist()
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  # def get_dists_per_centroid(self):
+  #   """
+  #   Compute the distances between observations belonging to one cluster and the corresponding cluster centroid.
+  #   Cluster labels are sorted in ascending order using their distances
+  #   :return: array of distance arrays for each cluster and ordered labels
+  #   """
+  #
+  #   # labels per centroid
+  #   # self.__cluster_labels = [[] for _ in range(self.__k)]
+  #   # distances of obs to their cluster
+  #   self.__centroid_dists = [[] for _ in range(self.__k)]
+  #
+  #   for ii in range(self.__k):
+  #     self.__cluster_labels[ii] = np.array(self.__cluster_labels[ii], dtype=np.int)
+  #
+  #   # compute euclidean distances of values to cluster mean
+  #   for ii in range(self.__k):
+  #     mean = self.__cluster_means[ii]
+  #     obs = self.__obs[self.__cluster_labels[ii]]
+  #     dists = similarity_measurement(obs, mean, self.__compare).tolist()
+  #     self.__centroid_dists[ii] = dists
+  #
+  #     # sort indices in ascending order using the distances
+  #     indices = range(len(dists))
+  #     indices.sort(key=dists.__getitem__)
+  #     self.__cluster_labels[ii] = self.__cluster_labels[ii][indices].tolist()
+  #     self.__centroid_dists[ii].sort()
+  #
+  #   return self.__cluster_labels, self.__centroid_dists
+
+
+########################################################################################################################
+
+def _plugin_initialize():
+  """
+  optional initialization method of this module, will be called once
+  :return:
+  """
+  pass
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def create(data, k, init_method, distance):
+  """
+  By convention, this module contains a factory called create returning the extension implementation
+  :return:
+  """
+  return KMeans(data, k, init_method, distance)
+
+
+########################################################################################################################
+
+
+def _main():
+  from timeit import default_timer as timer
+  from scipy.cluster.vq import kmeans2
+  # np.random.seed(datetime.now())
+  # data = np.array([[1,2,3],[5,4,5],[3,2,2],[8,8,7],[9,6,7],[2,3,4]])
+  data = np.array([1, 1.1, 5, 8, 5.2, 8.3])
+
+  # data = np.array([np.random.rand(2) * 5 for _ in range(10)])
+  k = 3
+
+  time_mine = 0
+  time_theirs = 0
+  n = 10
+
+  for i in range(n):
+    s1 = timer()
+    k_means_plus = KMeans(data, k, 'kmeans++', 'sqeuclidean', 10)
+    result1 = k_means_plus.run()
+    # print(result)
+    e1 = timer()
+    # labels = k_means_plus.get_dists_per_centroid()
+    # l, d = compute_cluster_distances(data, labels[0])
+
+    s2 = timer()
+    result2 = kmeans2(data, k)
+    e2 = timer()
+
+    time_mine += e1 - s1
+    time_theirs += e2 - s2
+
+  print(result1)
+  print(result2)
+  print('mine: {}'.format(time_mine / n))
+  print('theirs: {}'.format(time_theirs / n))
+
+
+"""
+This is for testing the algorithm and comparing the results between this and scipy's algorithm
+"""
+if __name__ == '__main__':
+  _main()
diff --git a/phovea_clustering/clustering_service.py b/phovea_clustering/clustering_service.py
new file mode 100644
index 0000000..c789b7b
--- /dev/null
+++ b/phovea_clustering/clustering_service.py
@@ -0,0 +1,158 @@
+import numpy as np
+from clustering_hierarchical import get_clusters
+
+__author__ = 'Michael Kern'
+__version__ = '0.0.1'
+__email__ = 'kernm@in.tum.de'
+
+
+########################################################################################################################
+
+def load_data(dataset_id):
+  """
+  Loads the genomic data with given identifier dataset_id.
+  :param dataset_id: identifier
+  :return: array of the genomic data
+  """
+  import phovea_server.dataset as dt
+  # obtain Caleydo dataset from ID
+  dataset = dt.get(dataset_id)
+  # choose loaded attribute and load raw data in numpy format
+  # a bit of a hack to get a numpy array out of the dataset
+  try:
+    arr = np.array(list(dataset.asnumpy()))
+  except Exception:
+    # re-raise the original error instead of masking it with a bare Exception
+    raise
+  return arr
+
+
+########################################################################################################################
+
+def load_plugin(plugin_id, *args, **kwargs):
+  """
+  Loads the clustering plugin with given arguments.
+  :param plugin_id: identifier of plugin
+  :param *args: additional caller function arguments
+  :param **kwargs: additional arguments
+  :return: plugin
+  """
+  import phovea_server.plugin
+  # obtain all plugins with the 'clustering' extension
+  plugins = phovea_server.plugin.list('clustering')
+  # choose plugin with given ID
+  for plugin in plugins:
+    if plugin.id == plugin_id:
+      # load the implementation of the plugin
+      return plugin.load().factory(*args, **kwargs)
+
+  raise NotImplementedError('clustering plugin not found: ' + plugin_id)
+
+
+########################################################################################################################
+
+def run_kmeans(data, k, init_method, distance):
+  """
+  Runs the k-Means clustering algorithm given the loaded data set, the number of clusters k and the initialization
+  method.
+  :param data: observation matrix
+  :param k: number of clusters
+  :param init_method: initialization method ('forgy', 'uniform', 'random' or 'kmeans++')
+  :param distance: distance measurement
+  :return: result of k-means
+  """
+  kmeans = load_plugin('caleydo-clustering-kmeans', data, k, init_method, distance)
+  # and run the kmeans extension
+  centroids, labels, cluster_labels = kmeans()
+  # cluster_labels, clusterDists = KMeans.getDistsPerCentroid()
+
+  return {'centroids': centroids, 'clusterLabels': cluster_labels}
+
+
+########################################################################################################################
+
+def run_hierarchical(data, k, method, distance):
+  """
+  Runs the hierarchical clustering algorithm given the loaded data set and the type of linkage method.
+  :param data: observation matrix
+  :param k: number of clusters to cut the dendrogram into
+  :param method: linkage method
+  :param distance: distance measurement
+  :return: centroids, cluster labels and the dendrogram as json
+  """
+  hierarchical = load_plugin('caleydo-clustering-hierarchical', data, method, distance)
+  # and use the extension
+  hierarchical()
+  # obtain k-number of clusters
+  centroids, cluster_labels, labels = get_clusters(k, data, hierarchical.tree, False)
+
+  return {'centroids': centroids, 'clusterLabels': cluster_labels, 'dendrogram': hierarchical.tree.json()}
+  # print('\t-> creating dendrogram tree...')
+  # tree = Hierarchical.generateTree(linkage)
+  # print('\t-> creating json string ...')
+  # dendrogram = tree.jsonify()
+  # print('\t-> finished.')
+
+  # return {'dendrogram': dendrogram} --> if needed later
+
+
+########################################################################################################################
+
+def run_affinity_propagation(data, damping, factor, preference, distance):
+  """
+  Runs the affinity propagation algorithm given the loaded dataset, a damping value, a certain factor and
+  a preference method.
+  :param data: observation matrix
+  :param damping: damping value of the algorithm
+  :param factor: factor parameter passed to the affinity plugin
+  :param preference: preference method
+  :param distance: distance measurement
+  :return: centroids and cluster labels
+  """
+  affinity = load_plugin('caleydo-clustering-affinity', data, damping, factor, preference, distance)
+  # use this extension
+  centroids, labels, cluster_labels = affinity()
+
+  return {'centroids': centroids, 'clusterLabels': cluster_labels}
+
+
+########################################################################################################################
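+# Usage sketch for the service functions above, assuming an observation
+# matrix `data` (the argument values are illustrative):
+#   result = run_kmeans(data, 3, 'kmeans++', 'sqeuclidean')
+#   result['centroids']      # k centroid vectors
+#   result['clusterLabels']  # row indices grouped per cluster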
+def run_fuzzy(data, num_clusters, m, threshold, distance):
+  """
+  Runs the fuzzy clustering algorithm given the loaded data set, the number of clusters, the fuzzifier m
+  and a membership threshold.
+  :param data: observation matrix
+  :param num_clusters: number of clusters
+  :param m: fuzzifier
+  :param threshold: membership threshold passed to the fuzzy plugin
+  :param distance: distance measurement
+  :return: centroids, cluster labels, partition matrix and maximum probabilities
+  """
+  fuzzy = load_plugin('caleydo-clustering-fuzzy', data, num_clusters, m, threshold, distance)
+
+  centroids, cluster_labels, partition_matrix, max_prob = fuzzy()
+
+  return {'centroids': centroids, 'clusterLabels': cluster_labels, 'partitionMatrix': partition_matrix,
+          'maxProbability': max_prob}
+
+
+########################################################################################################################
+
+def get_cluster_distances(data, labels, metric, extern_labels=None, sorted=True):
+  """
+  Compute the cluster distances in given data among certain rows (labels)
+  :param data: genomic data
+  :param labels: indices of rows
+  :param metric: distance metric
+  :param extern_labels: label groups of the remaining clusters (optional)
+  :param sorted: sort the labels by their distance to the centroid (default True)
+  :return: labels and distance values sorted in ascending order
+  """
+  from clustering_util import compute_cluster_intern_distances, compute_cluster_extern_distances
+  dist_labels, dist_values = compute_cluster_intern_distances(data, labels, sorted, metric)
+
+  if extern_labels is not None:
+    extern_dists = compute_cluster_extern_distances(data, dist_labels, extern_labels, metric)
+    return {'labels': dist_labels, 'distances': dist_values, 'externDistances': extern_dists}
+  else:
+    return {'labels': dist_labels, 'distances': dist_values}
+
+
+########################################################################################################################
+
+def get_clusters_from_dendrogram(data, dendrogram, num_clusters):
+  """
+  Cuts a given dendrogram into num_clusters clusters and collects the resulting centroids and labels.
+  :param data: observation matrix
+  :param dendrogram: dendrogram tree or its json representation
+  :param num_clusters: number of desired clusters
+  :return: centroids and cluster labels
+  """
+
+  centroids, cluster_labels, _ = get_clusters(num_clusters, data, dendrogram)
+  return {'centroids': centroids, 'clusterLabels': cluster_labels}
diff --git a/phovea_clustering/clustering_util.py b/phovea_clustering/clustering_util.py
new file mode 100644
index 0000000..5470538
--- /dev/null
+++ b/phovea_clustering/clustering_util.py
@@ -0,0 +1,489 @@
+########################################################################################################################
+
+import random
+import numpy as np
+
+# use scipy to compute different distance matrices
+from scipy.spatial.distance import pdist, squareform
+import scipy.stats as stats
+
+__author__ = "Michael Kern"
+__email__ = 'kernm@in.tum.de'
+
+"""
+http://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python
+--> good explanation of how to create weighted choices / random numbers
+"""
+
+
+def weighted_choice(weights):
+  """
+  Draws a random index with probability proportional to its weight.
+  """
+  # compute sum of all weights
+  sum_total = sum(weights)
+  # draw a random number in the range [0, sum_total]
+  rnd = random.random() * sum_total
+
+  for index, weight in enumerate(weights):
+    # subtract current weight from random to find current index
+    rnd -= weight
+    if rnd < 0:
+      return index
+
+  # 20% faster if weights are sorted in descending order
+
+
+########################################################################################################################
+
+"""
+Implementation of a binary tree for hierarchical clustering
+"""
+
+"""
+Node of the tree containing information
+about its id in the data, its children and the value
+"""
+
+
+class BinaryNode:
+  def __init__(self, value, id, size, left_child, right_child):
+    self.value = value
+    self.left = left_child
+    self.right = right_child
+    self.size = size
+    self.id = id
+    self.parent = None
+    self.indices = [id]
+
+    # create json info on the fly
+    self.json = {"id": self.id, "size": self.size, "value": self.value, "indices": [id]}
+    if left_child is not None and right_child is not None:
+      # self.json["value"] = np.mean(self.value)
+      self.json["children"] = [right_child.json, left_child.json]
+      self.indices = [] + right_child.indices + left_child.indices
+      self.json["indices"] = self.indices
+
+  def is_leave(self):
+    return self.left is None and self.right is None
+
+
+########################################################################################################################
+
+"""
+Implementation of a hierarchical binary tree
+"""
+
+
+class BinaryTree:
+  # this tree must not be empty and must have at least two children (leaves)
+  def __init__(self, left_node, right_node, new_id, new_value):
+    self.__create_new_root(left_node, right_node, new_id, new_value)

+  # ------------------------------------------------------------------------------------------------------------------
+
+  def add_node(self, new_node, new_id, new_value):
+    self.__create_new_root(self.root, new_node, new_id, new_value)
+    return self
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def merge(self, tree, new_id, new_value):
+    self.__create_new_root(self.root, tree.root, new_id, new_value)
+    return self
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def jsonify(self):
+    import json
+    return json.dumps(self.root.json)
+    # return self.root.json
+    # return self.__traverseJson(self.root)
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def json(self):
+    return self.root.json
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def cut_tree_by_clusters(self, k):
+    # split the subtrees breadth-first until k potential clusters are collected
+    queue = [self.root]
+
+    while len(queue) < k:
+      node = queue.pop(0)
+      queue.append(node.left)
+      queue.append(node.right)
+
+    def key_func(x):
+      if x.is_leave():
+        return 0
+      else:
+        return -x.value
+
+    queue.sort(key=key_func)
+
+    clusters = []
+
+    for node in queue:
+      clusters.append(node.indices)
+
+    return clusters
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __traverse_json(self, node):
+    json = {"id": node.id, "size": node.size, "value": node.value}
+    if node.left is None and node.right is None:
+      return json
+    else:
+      json["children"] = [] + [self.__traverse_json(node.left)] + [self.__traverse_json(node.right)]
+
+    return json
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def get_leaves(self):
+    return self.root.indices
+    # return self.__traverseIDs(self.root)
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __traverse_ids(self, node):
+
+    if node.left is None and node.right is None:
+      return [node.id]
+    else:
+      return [] + self.__traverse_ids(node.right) + self.__traverse_ids(node.left)
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+  def __create_new_root(self, left_node, right_node, new_id, new_value):
+    new_size = left_node.size + right_node.size
+    self.root = BinaryNode(new_value, new_id, new_size, left_node, right_node)
+    left_node.parent = right_node.parent = self.root
+
+  # ------------------------------------------------------------------------------------------------------------------
+
+
+def cut_json_tree_by_clusters(json_data, k):
+  # import json
+  # tree = json.loads(json_data)
+  queue = [json_data]
+
+  while len(queue) < k:
+    node = queue.pop(0)
+    queue.append(node['children'][0])
+    queue.append(node['children'][1])
+
+  def key_func(x):
+    if 'children' not in x:
+      return 0
+    else:
+      return -x['value']
+
+  queue.sort(key=key_func)
+
+  clusters = []
+
+  for node in queue:
+    clusters.append(node['indices'])
+
+  return clusters
+
+
+########################################################################################################################
+
+def euclidean_distance(matrix, vector, squared=False):
+  """
+  Computes the euclidean distance between a vector and the rows of a matrix in parallel.
+  :param matrix: array of observations or clusters
+  :param vector: cluster centroid or observation
+  :param squared: if True, return the squared distances
+  :return: array of distances
+  """
+
+  # compute the difference between the values in the matrix and the vector
+  dist_mat = matrix - vector
+  num_values = len(matrix)
+  distances = np.array([0.0 for _ in range(num_values)], dtype=np.float)
+
+  for ii in range(num_values):
+    distance = dist_mat[ii]
+    # always try to use np.dot when computing euclidean distances
+    # it's way faster than ** 2 and sum(..., axis=1)
+    distances[ii] = np.dot(distance, distance)
+
+  if squared:
+    return distances
+  else:
+    return np.sqrt(distances)
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def correlation_distance(matrix, vector, method):
+  """
+  Computes the correlation between a vector and the rows of a matrix.
+  Note: the returned values are correlation coefficients, not distances.
+  :param matrix: array of observations or clusters
+  :param vector: cluster centroid or observation
+  :param method: 'pearson', 'spearman' or 'kendall'
+  :return: array of correlation coefficients
+  """
+
+  num_values = len(matrix)
+  distances = np.array([0.0 for _ in range(num_values)], dtype=np.float)
+
+  for ii in range(num_values):
+    value = matrix[ii]
+
+    if method == 'pearson':
+      distances[ii], _ = stats.pearsonr(value, vector)
+    elif method == 'spearman':
+      distances[ii], _ = stats.spearmanr(value, vector)
+    elif method == 'kendall':
+      distances[ii], _ = stats.kendalltau(value, vector)
+    else:
+      raise AttributeError('unknown correlation method: ' + str(method))
+
+  return distances
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+
+def similarity_measurement(matrix, vector, method='euclidean'):
+  from scipy.spatial.distance import cdist
+
+  if method == 'euclidean':
+    return euclidean_distance(matrix, vector)
+
+  if method == 'sqeuclidean':
+    return euclidean_distance(matrix, vector, True)
+
+  spatial_methods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis']
+
+  if method in spatial_methods:
+    return np.nan_to_num(cdist(matrix, np.atleast_2d(vector), method).flatten())
+
+  corr_methods = ['spearman', 'pearson', 'kendall']
+
+  if method in corr_methods:
+    return correlation_distance(matrix, vector, method)
+
+  raise AttributeError('unknown similarity method: ' + str(method))
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def euclidean_distance_matrix(matrix, squared=False):
+  """
+  Compute the euclidean distance matrix required for the algorithm
+  :param matrix:
+  :param squared: if True, return the squared distances
+  :return:
+  """
+
+  n = np.shape(matrix)[0]
+  dist_mat = np.zeros((n, n))
+
+  # use the Gram matrix of pairwise inner products to compute the distances | FASTER than the row-by-row method
+  # Gramian matrix holding the dot products of each pair of elements:
+  gram_mat = np.zeros((n, n))
+  for ii in range(n):
+    for jj in range(ii, n):
+      gram_mat[ii, jj] = np.dot(matrix[ii], matrix[jj])
+
+  # # ! This is slower than computing dot products of rows manually in python
+  # # ! And we only require the upper triangle matrix of the Gram matrix
+  # gram_mat = np.dot(self.__obs, self.__obs.T)
+
+  # make use of formula |a - b|^2 = a^2 - 2ab + b^2
+  for ii in range(n):
+    # self.__d[ii, ii] = self.__maxValue
+    jj = np.arange(ii + 1, n)
+    dist_mat[ii, jj] = gram_mat[ii, ii] - 2 * gram_mat[ii, jj] + gram_mat[jj, jj]
+    dist_mat[jj, ii] = dist_mat[ii, jj]
+
+  # # take square root of distances to compute real euclidean distance
+  # dist_mat = np.sqrt(dist_mat)
+
+  # alternative version --> use scipy's fast euclidean distance implementation: FASTEST
+  # dist_mat = spt.distance.pdist(self.__obs, 'euclidean')
+  # self.__d = spt.distance.squareform(dist_mat)
+  # print(dist_mat)
+
+  if squared:
+    return dist_mat
+  else:
+    return np.sqrt(dist_mat)
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def norm1_distance(matrix, vector):
+  """
+  Computes the norm-1 distance (averaged per component) between a vector and the rows of a matrix in parallel.
+  :param matrix: array of observations or clusters
+  :param vector: cluster centroid or observation
+  :return: array of distances
+  """
+  dist_mat = np.abs(matrix - vector)
+  num_values = len(vector)
+
+  distances = np.sum(dist_mat, axis=1) / num_values
+  return distances
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def pearson_correlation_matrix(matrix):
+  """
+  Computes a distance matrix based on the pearson correlation (1 - corr).
+  :param matrix: observation matrix
+  :return: distance matrix
+  """
+  # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
+  dist_mat = 1 - np.corrcoef(matrix)
+
+  return dist_mat
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def stats_correlation_matrix(matrix, method):
+  if method == 'pearson':
+    return pearson_correlation_matrix(matrix)
+
+  n = np.shape(matrix)[0]
+  dist_mat = np.zeros((n, n))
+
+  for ii in range(n):
+    row_i = matrix[ii]
+    for jj in range(ii + 1, n):
+      row_j = matrix[jj]
+      corr = 0
+
+      if method == 'spearman':
+        corr, _ = stats.spearmanr(row_i, row_j)
+
+      if method == 'kendall':
+        corr, _ = stats.kendalltau(row_i, row_j)
+
+      # TODO! other possibilities like 1 - abs(corr) | sqrt(1 - corr ** 2) | (1 - corr) / 2
+      corr = 1 - corr
+
+      dist_mat[ii, jj] = corr
+      dist_mat[jj, ii] = corr
+
+  return dist_mat
+
+
+# ----------------------------------------------------------------------------------------------------------------------
+
+def similarity_measurement_matrix(matrix, method):
+  """
+  Generic function to determine the similarity measurement for clustering
+  :param matrix: observation matrix
+  :param method: name of the distance / correlation metric
+  :return: n x n distance matrix
+  """
+  if method == 'euclidean':
+    return euclidean_distance_matrix(matrix)
+    # return squareform(pdist(matrix, method))
+
+  if method == 'sqeuclidean':
+    return euclidean_distance_matrix(matrix, True)
+    # return squareform(pdist(matrix, method))
+
+  spatial_methods = ['cityblock', 'chebyshev', 'canberra', 'correlation', 'hamming', 'mahalanobis']
+
+  if method in spatial_methods:
+    return squareform(np.nan_to_num(pdist(matrix, method)))
+
+  corr_methods = ['spearman', 'pearson', 'kendall']
+
+  if method in corr_methods:
+    return stats_correlation_matrix(matrix, method)
+
+  raise AttributeError('unknown similarity method: ' + str(method))
+
+
+########################################################################################################################
+# utility functions to compute distances between rows and cluster centroids
+
+def compute_cluster_intern_distances(matrix, labels, sorted=True, metric='euclidean'):
+  """
+  Computes the distances of each element in one cluster to the cluster's centroid. Returns distance values and labels
+  sorted in ascending order.
+  :param matrix: observation matrix
+  :param labels: row indices belonging to the cluster
+  :param sorted: sort the labels and distances (boolean flag)
+  :param metric: distance metric
+  :return: labels / indices of elements corresponding to distance array, distance values of cluster
+  """
+  cluster_labels = np.array(labels)
+  if len(cluster_labels) == 0:
+    return [], []
+
+  sub_matrix = matrix[cluster_labels]
+  # compute centroid of cluster along the columns (as we want to average each gene separately)
+  centroid = np.mean(sub_matrix, axis=0)
+
+  # compute distances to centroid
+  dists = similarity_measurement(sub_matrix, centroid, metric)
+
+  # sorted is a boolean flag (all callers pass True / False)
+  if sorted:
+    # sort values
+    indices = list(range(len(dists)))
+    indices.sort(key=dists.__getitem__)
+    dists.sort()
+
+    # reverse order if a correlation coefficient is used
+    # (1 means perfect correlation while -1 denotes opposite correlation)
+    corr_metrics = ['pearson', 'spearman', 'kendall']
+    if metric in corr_metrics:
+      indices.reverse()
+      dists = dists[::-1]
+
+    # write back to our arrays
+    dist_labels = cluster_labels[indices].tolist()
+    dist_values = dists.tolist()
+  else:
+    dist_labels = cluster_labels.tolist()
+    dist_values = dists.tolist()
+
+  return dist_labels, dist_values
+
+
+# ----------------------------------------------------------------------------------------------------------------------
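+# Example (sketch): for rows 0, 2 and 5 of an observation matrix `data`,
+#   labels, dists = compute_cluster_intern_distances(data, [0, 2, 5])
+# returns those row indices and their euclidean distances to the cluster
+# centroid, both sorted in ascending order of distance.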
+def compute_cluster_extern_distances(matrix, labels, outer_labels, metric='euclidean'):
+  """
+  Compute the distances of patients in one cluster to the centroids of all other clusters.
+  :param matrix: observation matrix
+  :param labels: row indices of the cluster
+  :param outer_labels: list of label groups of all other clusters
+  :param metric: distance metric
+  :return: list of distance arrays, one per external cluster
+  """
+  extern_dists = []
+  intern_sub_matrix = matrix[labels]
+
+  for extern_labels in outer_labels:
+
+    if len(extern_labels) == 0:
+      extern_dists.append([])
+      # skip empty clusters instead of averaging an empty matrix
+      continue
+
+    # compute centroid of external cluster
+    sub_matrix = matrix[extern_labels]
+    centroid = np.mean(sub_matrix, axis=0)
+
+    dists = similarity_measurement(intern_sub_matrix, centroid, metric)
+    extern_dists.append(dists.tolist())
+
+  return extern_dists
+
+
+########################################################################################################################
+
+if __name__ == '__main__':
+  from scipy.spatial.distance import cdist
+  print(cdist([[1, 1, 1], [3, 3, 3], [5, 5, 5]], np.atleast_2d([2, 2, 2]), 'sqeuclidean').flatten())
+
+  from scipy.stats import spearmanr
+
+  print(spearmanr([1, 2, 3], [2, 4, 1]))
diff --git a/phovea_clustering/config.json b/phovea_clustering/config.json
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/phovea_clustering/config.json
@@ -0,0 +1 @@
+{}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f74d796
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+-e git+https://github.com/phovea/phovea_server.git#egg=phovea_server
\ No newline at end of file
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 0000000..0e2902f
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,4 @@
+flake8==3.0.4
+pep8-naming==0.4.1
+pytest==3.0.3
+pytest-runner==2.9
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..5519f85
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,14 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license
+###############################################################################
+
+[bdist_wheel]
+# This flag says that the code is written to work on both Python 2 and Python
+# 3. If at all possible, it is good practice to do this. If you cannot, you
+# will need to generate wheels for each Python version that you support.
+universal=1
+
+[aliases]
+test=pytest
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..2f7a0a1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,81 @@
+###############################################################################
+# Caleydo - Visualization for Molecular Biology - http://caleydo.org
+# Copyright (c) The Caleydo Team. All rights reserved.
+# Licensed under the new BSD license, available at http://caleydo.org/license +############################################################################### +from __future__ import with_statement, print_function +from setuptools import setup +from codecs import open +from os import path + +here = path.abspath(path.dirname(__file__)) + + +def read_it(name): + with open(path.join(here, name), encoding='utf-8') as f: + return f.read() + + +# read package.json information +with open(path.join(here, 'package.json'), encoding='utf-8') as json_data: + import json + + pkg = json.load(json_data) + + +def packaged(*files): + r = {} + global pkg + r[pkg['name'].encode('ascii')] = list(files) + return r + + +setup( + name=pkg['name'], + version=pkg['version'], + description=pkg['description'], + long_description=read_it('README.md'), + keywords=pkg.get('keywords', ''), + author=pkg['author']['name'], + author_email=pkg['author']['email'], + license=pkg['license'], + zip_safe=False, + + entry_points={ + 'phovea.registry': ['{0} = {0}:phovea'.format(pkg['name'])], + 'phovea.config': ['{0} = {0}:phovea_config'.format(pkg['name'])] + }, + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + 'Intended Audience :: Developers', + 'Operating System :: OS Independent', + # Pick your license as you wish (should match "license" above) + 'License :: OSI Approved :: ' + pkg['license'], + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4' + ], + + # You can just specify the packages manually here if your project is + # simple. Or you can use find_packages(). + py_modules=[pkg['name']], + + # List run-time dependencies here. These will be installed by pip when + # your project is installed. For an analysis of "install_requires" vs pip's + # requirements files see: + # https://packaging.python.org/en/latest/requirements.html + install_requires=[r for r in read_it('requirements.txt').split('\n') if not r.startswith('-e git+https://')], + tests_require=read_it('requirements_dev.txt').split('\n'), + + # If there are data files included in your packages that need to be + # installed, specify them here. If using Python 2.6 or less, then these + # have to be included in MANIFEST.in as well. + package_data=packaged('config.json'), + + # Although 'package_data' is the preferred approach, in some case you may + # need to place data files outside of your packages. See: + # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa + # In this case, 'data_file' will be installed into '/my_data' + data_files=[] # [('my_data', ['data/data_file'])], +) diff --git a/tests/test_dummy.py b/tests/test_dummy.py new file mode 100644 index 0000000..4b8fe97 --- /dev/null +++ b/tests/test_dummy.py @@ -0,0 +1,4 @@ + + +def test_dummy(): + assert 1 == 1 diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..f2734b2 --- /dev/null +++ b/tox.ini @@ -0,0 +1,28 @@ +############################################################################### +# Caleydo - Visualization for Molecular Biology - http://caleydo.org +# Copyright (c) The Caleydo Team. All rights reserved. 
+# Licensed under the new BSD license, available at http://caleydo.org/license +############################################################################### + +[tox] +envlist = py{27,34} + +[testenv] +basepython = + py27: python2.7 + py34: python3.4 +deps = + flake8 + pytest +commands = + check-manifest --ignore tox.ini,tests* + python setup.py check -m -r -s + flake8 . + py.test tests + +[flake8] +ignore=E111,E114,E501 +exclude = .tox,*.egg,build,data,.git,__pycache__,docs,node_modules + +[pytest] +testpaths = tests