diff --git a/.gitignore b/.gitignore index 43091aa..a207628 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,10 @@ ENV/ .mypy_cache/ # IDE settings -.vscode/ \ No newline at end of file +.vscode/ + +# Misc +!/regression_enrichment_surface.egg-info/ +!/docs/_build/ +!/docs/modules.rst +!/docs/regression_enrichment_surface.rst diff --git a/AUTHORS.rst b/AUTHORS.rst index 56a89f5..f93f69a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,4 @@ Development Lead Contributors ------------ -None yet. Why not be the first? +* Xiaotian Duan diff --git a/docs/Makefile b/docs/Makefile index d1cdbf5..0d1bb80 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,7 +14,13 @@ help: .PHONY: help Makefile + +clean: + rm -rf modules.rst $(SPHINXPROJ).rst "$(BUILDDIR)" || true + + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile + sphinx-apidoc -o . ../$(SPHINXPROJ) @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/index.rst b/docs/index.rst index 1132f93..89ea7db 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ Welcome to Regression Enrichment Surface's documentation! -====================================== +========================================================= .. toctree:: :maxdepth: 2 diff --git a/regression_enrichment_surface/__init__.py b/regression_enrichment_surface/__init__.py index 62b2b5c..c5b7170 100644 --- a/regression_enrichment_surface/__init__.py +++ b/regression_enrichment_surface/__init__.py @@ -2,4 +2,4 @@ __author__ = """Austin Clyde""" __email__ = 'aclyde@uchicago.edu' -__version__ = '0.1.0' +__version__ = '0.2.0' diff --git a/regression_enrichment_surface/regression_enrichment_surface.py b/regression_enrichment_surface/regression_enrichment_surface.py index df26082..6e6dd38 100644 --- a/regression_enrichment_surface/regression_enrichment_surface.py +++ b/regression_enrichment_surface/regression_enrichment_surface.py @@ -1,99 +1,358 @@ -"""Main module.""" -import matplotlib.pyplot as plt +"""Regression Enrichment Surface + +This module implements the (RES) regression enrichment surface plot. +Please refer to the paper "Regression Enrichment Surfaces: A Simple Analysis +Technique for Virtual Screening Models" by Austin Clyde for more details. + +Example: + TODO: usage examples for both stratified and non-stratified cases + +TODO: + * Simple examples for demonstration and test purposes + * Performance optimization for the iterative computation of RES + +""" +import warnings import numpy as np +import matplotlib.pyplot as plt -def erf(x_pred, x_true, r, y, indexs_pred, indexs_true): - return len( - set(indexs_pred[:int(r * indexs_pred.shape[0])]).intersection(set(indexs_true[:int(y * indexs_pred.shape[0])]))) +def get_enrichment( + ranked_indexes_true, + ranked_indexes_pred, + cutoff_true, + cutoff_pred, +): + """Calculates the enrichment with true and predicted ranking and cutoffs. -def erfmax(x_pred, x_true, r, y, indexs_pred, indexs_true): - return (int(min(r, y) * indexs_pred.shape[0])) + This function is the implementation of the equation 10 in the paper + "Regression Enrichment Surfaces: A Simple Analysis Technique for Virtual + Screening Models" by Austin Clyde. + Specifically, it returns the number of common members of true and + predicted ranked indexes within two cutoffs separately, normalized by the + theoretically maximum possible common members with given cutoffs. + Roughly speaking, the enrichment represents the similarity between two + rankings with different cutoffs in the scale of [0, 1]. + :param ranked_indexes_true: the true/target ranked list of all the + indexes of drug candidates + :type ranked_indexes_true: list, tuple, or other sequence of int + :param ranked_indexes_pred: the predicted ranked list of all the indexes + of drug candidates + :type ranked_indexes_pred: list, tuple, or other sequence of int + :param cutoff_true: cutoff percentage for the true/target ranked list ( + from the top or index 0) + :type cutoff_true: float + :param cutoff_pred: cutoff percentage for the predicted ranked list ( + from the top or index 0) + :type cutoff_true: float -def nefr(*i): - return erf(*i) / erfmax(*i) + :return: enrichment score indicating the similarity between two ranking + orders with given cutoffs for both + :rtype: float + """ + # might use some sanity check here? Specifically, check if + # * set(ranked_indexes_true) == set(ranked_indexes_pred) + # * 0 <= cutoff_* <= 1 -def nefrcurve(points_, p, t, min_sample=-3, reverse_sort=False): - xs = np.logspace(min_sample, 0, points_, base=10) - ys = np.logspace(min_sample, 0, points_, base=10) + _num_indexes = len(ranked_indexes_true) - indexs_pred = np.argsort(p) - indexs_true = np.argsort(t) - if reverse_sort: - indexs_pred = indexs_pred[::-1] - indexs_true = indexs_true[::-1] + # "included" means in the top percentage according to the ranking + _included_indexes_true = set( + ranked_indexes_true[:int(cutoff_true * _num_indexes)]) + _included_indexes_pred = set( + ranked_indexes_pred[:int(cutoff_pred * _num_indexes)]) - xx, yy = np.meshgrid(xs, ys) - zz = np.zeros(xx.shape) - for i in range(points_): - for j in range(points_): - zz[i, j] = nefr(p, t, xx[i, j], yy[i, j], indexs_pred, indexs_true) + _num_jointly_included_indexes = len( + _included_indexes_true.intersection(_included_indexes_pred)) - return xx, yy, zz + # consider that the predicted ranking is the same as the target/true + # one, then the maximum number of jointly included indexes in both true + # and predicted rankings is min(cutoffs) * num_indexes + _max_num_jointly_included_indexes = int( + min(cutoff_true, cutoff_pred) * _num_indexes) + return _num_jointly_included_indexes / _max_num_jointly_included_indexes + + +def get_enrichment_grid( + y_true, + y_pred, + num_samples_per_axis, + logspace_start=-3, + logspace_stop=0, + logspace_base=10, + descending_ranking_order=False, +): + """Generates a 2D square grid of enrichment scores. + + This function will first rank the true values and the predictions with + reverse option, which will then be used in the calculation of enrichment + scores repeatedly. + A 2D square grid is generated based on exponential sampling, + where both X and Y axes are sampled exponentially, which means that the + grid coordinates on each axis [base**start, ..., base**stop] are evenly + distributed on log scale. + Each point on the square grid has a corresponding enrichment score, + calculated using the X coordinate as as the cutoff for predictions, + and Y coordinate as the cutoff for the true values. + Finally the function will return a tuple of three numpy arrays, + containing the X and Y coordinates, and the enrichment score for all + the points on the square grid. + Note that all three arrays have the shape of + ('num_samples_per_axis', 'num_samples_per_axis') and share the same + indexes. + + :param y_true: the true values or target of the predictive problem + :type y_true: sequence of integer, float or any other comparable type + :param y_pred: the predicted values + :type y_pred: sequence of integer, float or any other comparable type + :param num_samples_per_axis: number of samples per axis in the grid + :type num_samples_per_axis: int + :param logspace_start: inclusive starting point (base**'logspace_start') + for logspace sampling, defaults to -3 + :type logspace_start: int or float + :param logspace_stop: inclusive stopping point (base**'logspace_stop') + for logspace sampling, defaults to 0 + :type logspace_stop: int or float + :param logspace_base: the exponential base of logspace sampling for + grid, defaults to 10 + :type logspace_base: int or float + :param descending_ranking_order: descending ranking order option for + both predictions and true targets, defaults to False + :type descending_ranking_order: bool + + :return: tuple of three numpy arrays, all in the shape of + ('num_samples_per_axis', 'num_samples_per_axis'), containing + the X and Y coordinates, and the enrichment score for all the + points on the grid + :rtype: tuple of three numpy arrays -class RegressionEnrichmentSurface: """ - This is a conceptual class representation of a simple BLE device (GATT Server). It is essentially an extended combination of the :class:`bluepy.btle.Peripheral` and :class:`bluepy.btle.ScanEntry` classes - :param percent_min: sets the axis bounds. Must be reasonable for your data size (i.e. cannot be data size 100 if you set -3) - :type client: int + + # sanity check for + # * len(y_true) == len(y_pred) > 0 + # * num_samples_per_axis > 0 + # * logspace_start < logspace_stop <= 0 + # * logspace_base > 0 + + # obtain the ranked indexes for both target and predictions + _ranked_indexes_true = np.argsort(y_true)[::-1] \ + if descending_ranking_order else np.argsort(y_true) + _ranked_indexes_pred = tuple( + np.argsort(y_pred)[::-1] if descending_ranking_order + else np.argsort(y_pred)) + + # construct the mesh grid + # X and Y axes for positioning, and Z for enrichment score + _sample_positions_on_axis = np.logspace( + start=logspace_start, + stop=logspace_stop, + num=num_samples_per_axis, + base=logspace_base, + ) + + # all X, Y, Z np arrays have shape of (num_samples, num_samples) + _x_coordinates, _y_coordinates = \ + np.meshgrid(_sample_positions_on_axis, _sample_positions_on_axis) + _z_values = np.zeros_like(_x_coordinates) + + # this for-loop could be optimized + for _i in range(num_samples_per_axis): + for _j in range(num_samples_per_axis): + + # assume that X axis represents the predictions, and Y the targets + _z_values[_i, _j] = get_enrichment( + ranked_indexes_true=_ranked_indexes_true, + ranked_indexes_pred=_ranked_indexes_pred, + cutoff_true=_y_coordinates[_i, _j], + cutoff_pred=_x_coordinates[_i, _j], + ) + + return _x_coordinates, _y_coordinates, _z_values + + +class RegressionEnrichmentSurface: + """This class combines the RES computation with the plot function. + + The initialization merely declares an empty variable, which will hold + the enrichment surface grid(s) after 'get_enrichment_grids' is called. + """ - def __init__(self, percent_min=-3): - self.min = percent_min - self.nefr = None - self.stratify = False + def __init__(self): + """Constructor method + + """ + # self.__stratified = None + self.__enrichment_grids = None + + def get_enrichment_grids( + self, + y_true, + y_pred, + num_samples_per_axis=30, + logspace_start=-3, + logspace_stop=0, + logspace_base=10, + descending_ranking_order=False, + stratified_on=None, + ): + """Generates multiple 2D square grid of enrichment scores. + + This function is mostly the same as 'get_enrichment_grid', with one + additional functionality: stratification of the surfaces. If the + argument 'stratified_on' is passed in, the function will seek to + divide the target and predictions into different groups according + to the stratified labels, and therefore generates multiple + enrichment surfaces and returned a tuple of three numpy arrays, + representing X, Y, and Z of all the surfaces. + All three arrays share the shape of ('num_labels', + 'num_samples_per_axis', 'num_samples_per_axis'), where the + 'num_labels' equals to the number of unique labels in the + sequence 'stratified_on', otherwise equals to 1. + + :param y_true: the true values or target of the predictive problem + :type y_true: sequence of integer, float or any other comparable type + :param y_pred: the predicted values + :type y_pred: sequence of integer, float or any other comparable type + :param num_samples_per_axis: number of samples per axis in the grid + :type num_samples_per_axis: int + :param logspace_start: inclusive starting point (base**logspace_start) + for logspace sampling, defaults to -3 + :type logspace_start: int or float + :param logspace_stop: inclusive stopping point (base**logspace_stop) + for logspace sampling, defaults to 0 + :type logspace_stop: int or float + :param logspace_base: the exponential base of logspace sampling for + grid, defaults to 10 + :type logspace_base: int or float + :param descending_ranking_order: descending ranking order option for + both predictions and true targets, defaults to False + :type descending_ranking_order: bool + :param stratified_on: stratified labels for the samples, should be a + sequence of the same length as 'y_true' and 'y_pred', labeling + different predictions for multiple RES grids, defaults to None + :type stratified_on: sequence + + :return: tuple of three numpy arrays, all in the shape of + ('num_labels', 'num_samples_per_axis', 'num_samples_per_axis'), + containing the X and Y coordinates, and the enrichment score + for all the points on the grids. the 'num_labels' equals to the + number of unique labels in the sequence 'stratified_on', + otherwise equals to 1 + :rtype: tuple of three numpy arrays + """ + + _common_enrichment_grid_kwargs = { + 'num_samples_per_axis': num_samples_per_axis, + 'logspace_start': logspace_start, + 'logspace_stop': logspace_stop, + 'logspace_base': logspace_base, + 'descending_ranking_order': descending_ranking_order, + } + + if stratified_on is None: + _x, _y, _z = get_enrichment_grid( + y_true=y_true, + y_pred=y_pred, + **_common_enrichment_grid_kwargs, + ) + self.__enrichment_grids = ([_x, ], [_y, ], [_z, ]) - def compute(self, trues, preds, stratify=None, samples=30): - self.stratify = stratify is not None - if not self.stratify: - self.nefr = nefrcurve(samples, preds, trues, self.min) else: - x, y, z = [], [], [] - u, indices = np.unique(stratify, return_inverse=True) - for i in (range(u.shape[0])): - locs = np.argwhere(indices == i).flatten() - preds_strat = preds[locs] - trues_strat = trues[locs] + _stratified_xs, _stratified_ys, _stratified_zs = [], [], [] + _unique_labels, _label_indexes = \ + np.unique(stratified_on, return_inverse=True) + + for _label in _unique_labels: + + # construct one grid in the scope of each particular label + _curr_y_indexes = np.argwhere( + stratified_on == _label).flatten() + + _curr_y_true = y_true[_curr_y_indexes] + _curr_y_pred = y_pred[_curr_y_indexes] + try: - x_2, y_2, z_2 = nefrcurve(30, preds_strat, trues_strat) + _curr_x, _curr_y, _curr_z = get_enrichment_grid( + y_true=_curr_y_true, + y_pred=_curr_y_pred, + **_common_enrichment_grid_kwargs, + ) + # catch other errors, especially from the sanity check except ZeroDivisionError: + warnings.warn( + f'Insufficient amount ({len(_curr_y_true)}) of ' + f'samples labeled as \'{_label}\' during stratified ' + f'RES. Disregarding label \'{_label}\' ...') continue - x.append(x_2) - y.append(y_2) - z.append(z_2) - self.nefr = (x, y, z) - - return self.nefr - - def plot(self, save_file=None, levels=10, title="RDS", cmap='Blues', figsize=(8, 5)): - """Returns a list of :class:`bluepy.blte.Service` objects representing the services offered by the device. This will perform Bluetooth service discovery if this has not already been done; otherwise it will return a cached list of services immediately.. - - :param save_file: if None uses plt show otherwise saves png to file - :param levels: used for contour - :param title: sets plot title - :param cmap: uses matplotlib color maps - :param figsize: sets figure size - """ - plt.figure(figsize=figsize) - plt.xscale("log") - plt.yscale("log") - plt.xlabel("Screen top x%") - plt.ylabel("True top x%") - plt.contourf(np.stack(self.nefr[0]).mean(0) if self.stratify else self.nefr[0], - np.stack(self.nefr[1]).mean(0) if self.stratify else self.nefr[1], - np.stack(self.nefr[2]).mean(0) if self.stratify else self.nefr[2], - vmin=0, - vmax=1, - cmap=cmap, - levels=levels) - - plt.colorbar() - plt.title(title) - - if save_file is None: - plt.show() + + _stratified_xs.append(_curr_x) + _stratified_ys.append(_curr_y) + _stratified_zs.append(_curr_z) + + self.__enrichment_grids = \ + (_stratified_xs, _stratified_ys, _stratified_zs) + + return self.__enrichment_grids + + def plot_enrichment_grids( + self, + plot_size=(8, 5), + color_map='Blues', + num_contour_levels=10, + plot_title='Regression Enrichment Surface', + plot_file_path=None, + ): + """Visualize the enrichment surface(s) as contour plot. + + This function averages the enrichment grids if there are more than + one grids (resulted from the stratification of + 'get_enrichment_grids) and visualize the averaged surface in a + contour plot. + + :param plot_size: size of the plot, represented by the width and + height in inches, defaults to (8, 5) + :type plot_size: tuple of two int or float + :param color_map: color map for the contour plot, defaults to 'Blues' + :type color_map: string or matplotlib Colormap + :param num_contour_levels: number of contour levels, defaults to 10 + :type num_contour_levels: int + :param plot_title: title for the contour plot, defaults to + 'Regression Enrichment Surface' + :type plot_title: str + :param plot_file_path: filepath for the contour plot. If given, + the plot will be directed saved to the path, otherwise the + plot will be shown. Defaults to None + :type plot_file_path: str + """ + + assert self.__enrichment_grids + + plt.figure(figsize=plot_size) + plt.xscale('log') + plt.yscale('log') + plt.xlabel('Screen Top X%') + plt.ylabel('True Top X%') + plt.title(plot_title) + + _levels = np.linspace(0., 1., num_contour_levels, endpoint=True) + _contour = plt.contourf( + np.stack(self.__enrichment_grids[0]).mean(0), + np.stack(self.__enrichment_grids[1]).mean(0), + np.stack(self.__enrichment_grids[2]).mean(0), + cmap=color_map, + levels=_levels, + ) + plt.colorbar(ticks=_levels) + + if plot_file_path: + plt.savefig( + plot_file_path, + bbox_inches='tight', + dpi=300, + ) else: - plt.savefig(save_file, bbox_inches='tight', dpi=300) + plt.show() diff --git a/setup.py b/setup.py index 363a5b3..a81be29 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,10 @@ with open('HISTORY.rst') as history_file: history = history_file.read() -requirements = [ ] +requirements = [ + 'numpy', + 'matplotlib>=3', +] setup_requirements = [ ] diff --git a/tests/__init__.py b/tests/__init__.py index 6a59862..8db6133 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1 @@ -"""Unit test package for regression_enrichment_surface.""" +"""Unit test package for Regression Enrichment Surface.""" diff --git a/tests/test_enrichment_grids.npy b/tests/test_enrichment_grids.npy new file mode 100644 index 0000000..eff9736 Binary files /dev/null and b/tests/test_enrichment_grids.npy differ diff --git a/tests/test_regression_enrichment_surface.py b/tests/test_regression_enrichment_surface.py index 294bd67..81e77f6 100644 --- a/tests/test_regression_enrichment_surface.py +++ b/tests/test_regression_enrichment_surface.py @@ -1,21 +1,66 @@ -#!/usr/bin/env python +"""Unit Test fir Regression Enrichment Surface -"""Tests for `regression_enrichment_surface` package.""" +This file implements the unit test cases for the functions and classes in +Regression Enrichment Surface module. +Example: + python -m unittest +TODO: + +""" import unittest +import numpy as np +from regression_enrichment_surface.regression_enrichment_surface import \ + RegressionEnrichmentSurface + + +class TestRegressionEnrichmentSurface(unittest.TestCase): + + def test_get_enrichment(self): + self.assertTrue(True) + + def test_get_enrichment_grid(self): + self.assertTrue(True) + + def test_get_enrichment_grids(self): + """ Verify the implementation of get_enrichment_grids method for + RegressionEnrichmentSurface class. + + The verification process will also check the implementation of + get_enrichment and get_enrichment_grid in the module, as they are + being used in the class for computing RES with stratification. + + :return: None + """ + + # fix the random seed and therefore test results + np.random.seed(0) + _size = (100_000,) + + _y_true = np.random.uniform(low=0.0, high=1.0, size=_size) + _y_pred = _y_true + np.random.uniform(low=-0.2, high=0.2, size=_size) + _y_labl = np.random.randint(low=0, high=3, size=_size) -from regression_enrichment_surface import regression_enrichment_surface + _res = RegressionEnrichmentSurface() + _grids = _res.get_enrichment_grids( + y_true=_y_true, + y_pred=_y_pred, + stratified_on=_y_labl, + ) + _grids = np.array(_grids) + # np.save(file='./test_enrichment_grids', arr=_grids) + try: + _ref_grids = np.load(file='./tests/test_enrichment_grids.npy') + except FileNotFoundError: + _ref_grids = np.load(file='./test_enrichment_grids.npy') -class TestRegression_enrichment_surface(unittest.TestCase): - """Tests for `regression_enrichment_surface` package.""" + self.assertTrue((_grids == _ref_grids).all()) - def setUp(self): - """Set up test fixtures, if any.""" + def test_plot_enrichment_grids(self): + self.assertTrue(True) - def tearDown(self): - """Tear down test fixtures, if any.""" - def test_000_something(self): - """Test something.""" +if __name__ == '__main__': + unittest.main()