This repository has been archived by the owner on Oct 22, 2022. It is now read-only.

v0.1.6: Removed hit selection algorithm, added median and quantile normalization
Rico Meinl committed Nov 16, 2021
1 parent f0ea734 commit f2f71c5
Showing 5 changed files with 56 additions and 268 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "talus-utils"
version = "0.1.5"
version = "0.1.6"
description = "Talus Utils"
authors = ["Rico Meinl <[email protected]>"]
license = "MIT"
160 changes: 0 additions & 160 deletions src/talus_utils/algorithms.py
@@ -1,167 +1,7 @@
"""src/talus_utils/algorithms.py module."""
from typing import Tuple, Union

import numpy as np
import pandas as pd

from . import dataframe as df_utils
from .constants import MAX_NAN_VALUES_HIT_SELECTION, MIN_PEPTIDES_HIT_SELECTION


def get_hits_for_proteins(
outlier_peptide_intensities: pd.DataFrame,
peptide_df: pd.DataFrame,
) -> pd.DataFrame:
"""Calculate the percentage of peptides that are a hit for a protein.
Parameters
----------
outlier_peptide_intensities : pd.DataFrame
    A dataframe with the outlier peptide intensities.
peptide_df : pd.DataFrame
A transformed peptide.txt dataframe with columns: ["Peptide", "Protein", "NumPeptides"].
Returns
-------
protein_df
A dataframe with the percentage of peptides that are a hit for a given protein.
"""
protein_df = peptide_df[["Protein"]].drop_duplicates()
# loop over each sample of the outlier peptide intensities and calculate the percentage of peptides that are a hit for a given protein
for column_name in outlier_peptide_intensities.columns:
hits_per_protein = pd.merge(
peptide_df,
outlier_peptide_intensities[column_name],
on="Peptide",
how="left",
)
hits_per_protein = hits_per_protein.groupby("Protein", as_index=False).sum()
        # number of peptide hits / total number of peptides for a given protein
hits_per_protein[column_name] /= hits_per_protein["NumPeptides"]
hits_per_protein = hits_per_protein.drop("NumPeptides", axis=1)
protein_df = pd.merge(protein_df, hits_per_protein, on="Protein")

return protein_df.set_index("Protein")


@df_utils.normalize(how="median_column")
@df_utils.log_scaling(log_function=np.log2, filter_outliers=True)
@df_utils.copy
def get_outlier_peptide_intensities(
peptide_intensities: pd.DataFrame,
max_nan_values: int = MAX_NAN_VALUES_HIT_SELECTION,
split_above_below: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
"""For each sample, finds the peptides that are more than 2 standard deviations above or below the mean.
Parameters
----------
peptide_intensities : pd.DataFrame
A dataframe containing Peptides as index and intensities as values.
max_nan_values : int
The maximum number of NaN values a peptide can have across samples. (Default value = MAX_NAN_VALUES_HIT_SELECTION).
split_above_below : bool
If True, separate between outliers below and above the mean (returns two dataframes). (Default value = False).
Returns
-------
Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]
        A dataframe with the outlier peptides.
"""
# drop peptides with more than MAX_NAN_VALUES_HIT_SELECTION NaN values
peptide_intensities = peptide_intensities.dropna(
thresh=peptide_intensities.shape[1] - max_nan_values, axis=0
)

# calculate mean and std for each peptide across samples
peptide_mean = peptide_intensities.mean(axis=1)
peptide_std = peptide_intensities.std(axis=1)

# calculate lower and upper bound (2 std away from the mean)
lower_bound = (peptide_mean - 2 * peptide_std).values.reshape(-1, 1)
upper_bound = (peptide_mean + 2 * peptide_std).values.reshape(-1, 1)

if split_above_below:
peptide_intensities_above_mean = (peptide_intensities > upper_bound).astype(int)
peptide_intensities_below_mean = (peptide_intensities < lower_bound).astype(int)
return peptide_intensities_above_mean, peptide_intensities_below_mean
else:
peptide_intensities = (
(peptide_intensities > upper_bound) | (peptide_intensities < lower_bound)
).astype(int)
return peptide_intensities


def hit_selection(
peptide_df: pd.DataFrame,
min_peptides: int = MIN_PEPTIDES_HIT_SELECTION,
max_nan_values: int = MAX_NAN_VALUES_HIT_SELECTION,
split_above_below: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
"""Hit Selection algorithm. Takes a peptide intensity dataframe, with the Peptides as index and the intensities as the values.
    Calculates how many peptides are 2 std devs above or below the mean and reports the associated protein.
Parameters
----------
peptide_df : pd.DataFrame
A raw peptide dataframe (peptides.txt).
min_peptides : int
        The minimum number of peptides a protein needs to have to be considered. (Default value = MIN_PEPTIDES_HIT_SELECTION).
max_nan_values : int
The maximum number of NaN values a peptide can have across samples. (Default value = MAX_NAN_VALUES_HIT_SELECTION).
split_above_below : bool
If True, separate between hits below and above the mean (returns two dataframes). (Default value = False).
Returns
-------
Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]
A dataframe with the percentage of peptides that are a hit for a given protein.
"""
peptide_intensities = peptide_df.drop(["Protein"], axis=1)
peptide_intensities = peptide_intensities.drop_duplicates(subset="Peptide")
peptide_intensities = peptide_intensities.set_index(["Peptide"])

# prepare protein dataframe and peptides per protein (both filtered by each protein having at least MIN_PEPTIDES peptides)
peptide_df = peptide_df[["Peptide", "Protein"]]
peptide_df["NumPeptides"] = peptide_df.groupby("Protein").transform("count")
peptide_df = peptide_df[peptide_df["NumPeptides"] >= min_peptides]

if split_above_below:
(
pos_outlier_peptide_intensities,
neg_outlier_peptide_intensities,
) = get_outlier_peptide_intensities(
peptide_intensities=peptide_intensities,
max_nan_values=max_nan_values,
split_above_below=True,
)

protein_df_above_mean = get_hits_for_proteins(
outlier_peptide_intensities=pos_outlier_peptide_intensities,
peptide_df=peptide_df,
)
protein_df_below_mean = get_hits_for_proteins(
outlier_peptide_intensities=neg_outlier_peptide_intensities,
peptide_df=peptide_df,
)

return protein_df_above_mean, protein_df_below_mean
else:
outlier_peptide_intensities = get_outlier_peptide_intensities(
peptide_intensities=peptide_intensities,
max_nan_values=max_nan_values,
split_above_below=False,
)
protein_df = get_hits_for_proteins(
outlier_peptide_intensities=outlier_peptide_intensities,
peptide_df=peptide_df,
)

return protein_df


def subcellular_enrichment_scores(
proteins_with_locations: pd.DataFrame, expected_fractions_of_locations: pd.DataFrame
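
For reference, the outlier rule that the removed get_outlier_peptide_intensities implemented can be illustrated with a short, self-contained sketch. This is not the deleted function itself: it skips the log2 scaling and median-column normalization applied by the decorators, and the toy intensity matrix is hypothetical.

import pandas as pd

# Hypothetical peptide-by-sample intensity matrix (Peptide index, one column per sample).
intensities = pd.DataFrame(
    {
        "sample_1": [10.0, 12.0, 11.0, 9.0],
        "sample_2": [11.0, 50.0, 10.0, 8.0],
        "sample_3": [9.0, 13.0, 12.0, 10.0],
    },
    index=pd.Index(["PEPA", "PEPB", "PEPC", "PEPD"], name="Peptide"),
)

# Per-peptide mean and std across samples (axis=1), as in the removed code.
mean = intensities.mean(axis=1)
std = intensities.std(axis=1)
lower = (mean - 2 * std).values.reshape(-1, 1)
upper = (mean + 2 * std).values.reshape(-1, 1)

# 1 marks a cell that lies more than 2 standard deviations from its peptide's mean.
outliers = ((intensities > upper) | (intensities < lower)).astype(int)
print(outliers)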
39 changes: 37 additions & 2 deletions src/talus_utils/dataframe.py
@@ -194,6 +194,39 @@ def wrapped_func(*args: str, **kwargs: str) -> Any:
return pivot_table_wrap


def median_normalize(df):
"""Apply median normalization to input dataframe.
Parameters
----------
df: pd.DataFrame
Input data frame.
Returns
-------
pd.DataFrame
Transformed output data frame.
"""
return df / df.median()


def quantile_normalize(df):
"""Apply quantile normalization to input dataframe.
Parameters
----------
df: pd.DataFrame
Input data frame.
Returns
-------
pd.DataFrame
Transformed output data frame.
"""
rank_mean = df.stack().groupby(df.rank(method="first").stack().astype(int)).mean()
return df.rank(method="min").stack().astype(int).map(rank_mean).unstack()


def normalize(how: str) -> Callable[..., Any]:
"""Apply a row or column normalization to a pandas DataFrame argument.
@@ -236,8 +269,10 @@ def wrapped_func(*args: str, **kwargs: str) -> Any:
apply_func = lambda df: df.apply(lambda x: x / x.sum(), axis=0)
elif how.lower() in set(["minmax", "min-max", "min_max"]):
apply_func = lambda df: (df - df.min()) / (df.max() - df.min())
elif how.lower() in set(["median_column", "median_col"]):
apply_func = lambda df: df - df.median(axis=0)
elif how.lower() in set(["median", "median_column", "median_col"]):
apply_func = lambda df: median_normalize(df)
elif how.lower() in set(["quantile", "quantile_column", "quantile_col"]):
apply_func = lambda df: quantile_normalize(df)
else:
raise ValueError(
"Invalid input value for 'how'. Needs to be one of {'row', 'colum', 'minmax'}."
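
Both new normalizations are reached through the existing normalize decorator, so callers only pick a mode by name. A minimal usage sketch, assuming the package import path talus_utils.dataframe and a throwaway pass-through function as the wrapped callable (both are assumptions, mirroring how the tests below call the decorator):

import numpy as np
import pandas as pd

from talus_utils import dataframe  # assumed import path for the package


def passthrough(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical stand-in for whatever function consumes the normalized frame.
    return df


raw = pd.DataFrame(np.random.rand(5, 5) * 100)

# Divide every column by its median (median_normalize), so each column's median becomes 1.
median_normalized = dataframe.normalize(how="median_column")(passthrough)(raw)

# Replace every value by the mean of the equally ranked values across columns (quantile_normalize).
quantile_normalized = dataframe.normalize(how="quantile_column")(passthrough)(raw)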
104 changes: 0 additions & 104 deletions tests/test_algorithms.py
@@ -11,110 +11,6 @@
DATA_DIR = Path(__file__).resolve().parent.joinpath("data")


def test_get_outlier_peptide_intensities() -> None:
"""Test the get_outlier_peptide_intensities function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("quant_peptides_outliers.csv"))
df_expected = df_expected.set_index("Peptide")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["Protein", "numFragments"], axis=1)
df_input = df_input.drop_duplicates(subset="Peptide")
df_input = df_input.set_index(["Peptide"])

df_actual = algorithms.get_outlier_peptide_intensities(peptide_intensities=df_input)

assert_frame_equal(df_actual, df_expected)


def test_get_outlier_peptide_intensities_above_below() -> None:
"""Test the get_outlier_peptide_intensities function with split_above_below."""
df_expected_above = pd.read_csv(
DATA_DIR.joinpath("quant_peptides_outliers_above_mean.csv")
)
df_expected_above = df_expected_above.set_index("Peptide")

df_expected_below = pd.read_csv(
DATA_DIR.joinpath("quant_peptides_outliers_below_mean.csv")
)
df_expected_below = df_expected_below.set_index("Peptide")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["Protein", "numFragments"], axis=1)
df_input = df_input.drop_duplicates(subset="Peptide")
df_input = df_input.set_index(["Peptide"])

df_actual_above, df_actual_below = algorithms.get_outlier_peptide_intensities(
peptide_intensities=df_input, split_above_below=True
)

assert_frame_equal(df_actual_above, df_expected_above)
assert_frame_equal(df_actual_below, df_expected_below)


def test_get_hits_for_proteins() -> None:
"""Test the get_hits_for_proteins function."""
# using min_peptides = 1 for testing purposes
min_peptides = 1
df_expected = pd.read_csv(DATA_DIR.joinpath("hits_for_proteins.csv"))
df_expected = df_expected.set_index("Protein")

df_outlier_peptides = pd.read_csv(DATA_DIR.joinpath("quant_peptides_outliers.csv"))
df_outlier_peptides = df_outlier_peptides.set_index("Peptide")

df_peptides = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_peptides = df_peptides[["Peptide", "Protein"]]
df_peptides["NumPeptides"] = df_peptides.groupby("Protein").transform("count")
df_peptides = df_peptides[df_peptides["NumPeptides"] >= min_peptides]

df_actual = algorithms.get_hits_for_proteins(
outlier_peptide_intensities=df_outlier_peptides,
peptide_df=df_peptides,
)

assert_frame_equal(df_actual, df_expected)


def test_hit_selection() -> None:
"""Test the hit_selection function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("hits_for_proteins.csv"))
df_expected = df_expected.set_index("Protein")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["numFragments"], axis=1)

df_actual = algorithms.hit_selection(
peptide_df=df_input,
min_peptides=1,
)

assert_frame_equal(df_actual, df_expected)


def test_hit_selection_above_below() -> None:
"""Test the hit_selection function with split_above_below."""
df_expected_above = pd.read_csv(
DATA_DIR.joinpath("hits_for_proteins_above_mean.csv")
)
df_expected_above = df_expected_above.set_index("Protein")

df_expected_below = pd.read_csv(
DATA_DIR.joinpath("hits_for_proteins_below_mean.csv")
)
df_expected_below = df_expected_below.set_index("Protein")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["numFragments"], axis=1)

df_actual_above, df_actual_below = algorithms.hit_selection(
peptide_df=df_input,
min_peptides=1,
split_above_below=True,
)

assert_frame_equal(df_actual_above, df_expected_above)
assert_frame_equal(df_actual_below, df_expected_below)


def test_subcellular_enrichment_scores() -> None:
"""Test the subcellular_enrichment_scores function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("subcellular_enrichment_scores.csv"))
19 changes: 18 additions & 1 deletion tests/test_dataframe.py
@@ -192,12 +192,29 @@ def test_normalize_column() -> None:
def test_normalize_median_column() -> None:
"""Test the normalize decorator with how='median_column'."""
df_input = pd.DataFrame(np.random.rand(5, 5) * 100)
df_expected = df_input - df_input.median(axis=0)
df_expected = df_input / df_input.median(axis=0)

df_actual = dataframe.normalize(how="median_column")(dummy_function)(df_input)
assert_frame_equal(df_actual, df_expected)


def test_normalize_quantile_column() -> None:
"""Test the normalize decorator with how='quantile_column'."""
df_input = pd.DataFrame(np.random.rand(5, 5) * 100)

rank_mean = (
df_input.stack()
.groupby(df_input.rank(method="first").stack().astype(int))
.mean()
)
df_expected = (
df_input.rank(method="min").stack().astype(int).map(rank_mean).unstack()
)

df_actual = dataframe.normalize(how="quantile_column")(dummy_function)(df_input)
assert_frame_equal(df_actual, df_expected)


def test_sort_row_values_value_error() -> None:
"""Test the sort_by decorator with a value error."""
df_input = pd.DataFrame([{"test": "a", "test2": "b"}, {"test": "c", "test2": "d"}])
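
To make the quantile normalization concrete, here is a small worked example; the toy frame and the import path are assumptions, but the numbers follow the rank-mean recipe added above:

import pandas as pd

from talus_utils.dataframe import quantile_normalize  # assumed import path

df = pd.DataFrame(
    {
        "A": [5.0, 2.0, 3.0, 4.0],
        "B": [4.0, 1.0, 4.0, 2.0],
        "C": [3.0, 4.0, 6.0, 8.0],
    }
)

# Mean of the values at each rank across columns:
#   rank 1 -> (2 + 1 + 3) / 3 = 2.0
#   rank 2 -> (3 + 2 + 4) / 3 = 3.0
#   rank 3 -> (4 + 4 + 6) / 3 ~ 4.67
#   rank 4 -> (5 + 4 + 8) / 3 ~ 5.67
# Every value is replaced by the mean for its rank, so all columns end up with
# the same distribution; the tied 4s in column B both map to the rank-3 mean
# because output ranks are assigned with method="min".
print(quantile_normalize(df))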
