This repository has been archived by the owner on Oct 22, 2022. It is now read-only.

v0.1.6: Removed hit selection algorithm, added median and quantile normalization
Rico Meinl committed Nov 16, 2021
1 parent f0ea734 commit f2f71c5
Showing 5 changed files with 56 additions and 268 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "talus-utils"
version = "0.1.5"
version = "0.1.6"
description = "Talus Utils"
authors = ["Rico Meinl <[email protected]>"]
license = "MIT"
160 changes: 0 additions & 160 deletions src/talus_utils/algorithms.py
@@ -1,167 +1,7 @@
"""src/talus_utils/algorithms.py module."""
from typing import Tuple, Union

import numpy as np
import pandas as pd

from . import dataframe as df_utils
from .constants import MAX_NAN_VALUES_HIT_SELECTION, MIN_PEPTIDES_HIT_SELECTION


def get_hits_for_proteins(
outlier_peptide_intensities: pd.DataFrame,
peptide_df: pd.DataFrame,
) -> pd.DataFrame:
"""Calculate the percentage of peptides that are a hit for a protein.
Parameters
----------
outlier_peptide_intensities : pd.DataFrame
    A dataframe with the outlier peptide intensities.
peptide_df : pd.DataFrame
A transformed peptide.txt dataframe with columns: ["Peptide", "Protein", "NumPeptides"].
Returns
-------
protein_df
A dataframe with the percentage of peptides that are a hit for a given protein.
"""
protein_df = peptide_df[["Protein"]].drop_duplicates()
# loop over each sample of the outlier peptide intensities and calculate the percentage of peptides that are a hit for a given protein
for column_name in outlier_peptide_intensities.columns:
hits_per_protein = pd.merge(
peptide_df,
outlier_peptide_intensities[column_name],
on="Peptide",
how="left",
)
hits_per_protein = hits_per_protein.groupby("Protein", as_index=False).sum()
        # number of peptide hits / total number of peptides for a given protein
hits_per_protein[column_name] /= hits_per_protein["NumPeptides"]
hits_per_protein = hits_per_protein.drop("NumPeptides", axis=1)
protein_df = pd.merge(protein_df, hits_per_protein, on="Protein")

return protein_df.set_index("Protein")


@df_utils.normalize(how="median_column")
@df_utils.log_scaling(log_function=np.log2, filter_outliers=True)
@df_utils.copy
def get_outlier_peptide_intensities(
peptide_intensities: pd.DataFrame,
max_nan_values: int = MAX_NAN_VALUES_HIT_SELECTION,
split_above_below: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
"""For each sample, finds the peptides that are more than 2 standard deviations above or below the mean.
Parameters
----------
peptide_intensities : pd.DataFrame
A dataframe containing Peptides as index and intensities as values.
max_nan_values : int
The maximum number of NaN values a peptide can have across samples. (Default value = MAX_NAN_VALUES_HIT_SELECTION).
split_above_below : bool
If True, separate between outliers below and above the mean (returns two dataframes). (Default value = False).
Returns
-------
Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]
        A dataframe with the outlier peptides.
"""
# drop peptides with more than MAX_NAN_VALUES_HIT_SELECTION NaN values
peptide_intensities = peptide_intensities.dropna(
thresh=peptide_intensities.shape[1] - max_nan_values, axis=0
)

# calculate mean and std for each peptide across samples
peptide_mean = peptide_intensities.mean(axis=1)
peptide_std = peptide_intensities.std(axis=1)

# calculate lower and upper bound (2 std away from the mean)
lower_bound = (peptide_mean - 2 * peptide_std).values.reshape(-1, 1)
upper_bound = (peptide_mean + 2 * peptide_std).values.reshape(-1, 1)

if split_above_below:
peptide_intensities_above_mean = (peptide_intensities > upper_bound).astype(int)
peptide_intensities_below_mean = (peptide_intensities < lower_bound).astype(int)
return peptide_intensities_above_mean, peptide_intensities_below_mean
else:
peptide_intensities = (
(peptide_intensities > upper_bound) | (peptide_intensities < lower_bound)
).astype(int)
return peptide_intensities


def hit_selection(
peptide_df: pd.DataFrame,
min_peptides: int = MIN_PEPTIDES_HIT_SELECTION,
max_nan_values: int = MAX_NAN_VALUES_HIT_SELECTION,
split_above_below: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
"""Hit Selection algorithm. Takes a peptide intensity dataframe, with the Peptides as index and the intensities as the values.
    Calculates how many peptides are 2 std devs above or below the mean and reports the associated protein.
Parameters
----------
peptide_df : pd.DataFrame
A raw peptide dataframe (peptides.txt).
min_peptides : int
        The minimum number of peptides a protein needs to have to be considered. (Default value = MIN_PEPTIDES_HIT_SELECTION).
max_nan_values : int
The maximum number of NaN values a peptide can have across samples. (Default value = MAX_NAN_VALUES_HIT_SELECTION).
split_above_below : bool
If True, separate between hits below and above the mean (returns two dataframes). (Default value = False).
Returns
-------
Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]
A dataframe with the percentage of peptides that are a hit for a given protein.
"""
peptide_intensities = peptide_df.drop(["Protein"], axis=1)
peptide_intensities = peptide_intensities.drop_duplicates(subset="Peptide")
peptide_intensities = peptide_intensities.set_index(["Peptide"])

# prepare protein dataframe and peptides per protein (both filtered by each protein having at least MIN_PEPTIDES peptides)
peptide_df = peptide_df[["Peptide", "Protein"]]
peptide_df["NumPeptides"] = peptide_df.groupby("Protein").transform("count")
peptide_df = peptide_df[peptide_df["NumPeptides"] >= min_peptides]

if split_above_below:
(
pos_outlier_peptide_intensities,
neg_outlier_peptide_intensities,
) = get_outlier_peptide_intensities(
peptide_intensities=peptide_intensities,
max_nan_values=max_nan_values,
split_above_below=True,
)

protein_df_above_mean = get_hits_for_proteins(
outlier_peptide_intensities=pos_outlier_peptide_intensities,
peptide_df=peptide_df,
)
protein_df_below_mean = get_hits_for_proteins(
outlier_peptide_intensities=neg_outlier_peptide_intensities,
peptide_df=peptide_df,
)

return protein_df_above_mean, protein_df_below_mean
else:
outlier_peptide_intensities = get_outlier_peptide_intensities(
peptide_intensities=peptide_intensities,
max_nan_values=max_nan_values,
split_above_below=False,
)
protein_df = get_hits_for_proteins(
outlier_peptide_intensities=outlier_peptide_intensities,
peptide_df=peptide_df,
)

return protein_df


def subcellular_enrichment_scores(
proteins_with_locations: pd.DataFrame, expected_fractions_of_locations: pd.DataFrame
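
For reference, the outlier rule that the removed get_outlier_peptide_intensities implemented can be illustrated with a short, self-contained sketch. This is not the deleted function itself: it skips the log2 scaling and median-column normalization applied by the decorators, and the toy intensity matrix is hypothetical.

import pandas as pd

# Hypothetical peptide-by-sample intensity matrix (Peptide index, one column per sample).
intensities = pd.DataFrame(
    {
        "sample_1": [10.0, 12.0, 11.0, 9.0],
        "sample_2": [11.0, 50.0, 10.0, 8.0],
        "sample_3": [9.0, 13.0, 12.0, 10.0],
    },
    index=pd.Index(["PEPA", "PEPB", "PEPC", "PEPD"], name="Peptide"),
)

# Per-peptide mean and std across samples (axis=1), as in the removed code.
mean = intensities.mean(axis=1)
std = intensities.std(axis=1)
lower = (mean - 2 * std).values.reshape(-1, 1)
upper = (mean + 2 * std).values.reshape(-1, 1)

# 1 marks a cell that lies more than 2 standard deviations from its peptide's mean.
outliers = ((intensities > upper) | (intensities < lower)).astype(int)
print(outliers)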
39 changes: 37 additions & 2 deletions src/talus_utils/dataframe.py
@@ -194,6 +194,39 @@ def wrapped_func(*args: str, **kwargs: str) -> Any:
return pivot_table_wrap


def median_normalize(df):
"""Apply median normalization to input dataframe.
Parameters
----------
df: pd.DataFrame
Input data frame.
Returns
-------
pd.DataFrame
Transformed output data frame.
"""
return df / df.median()


def quantile_normalize(df):
"""Apply quantile normalization to input dataframe.
Parameters
----------
df: pd.DataFrame
Input data frame.
Returns
-------
pd.DataFrame
Transformed output data frame.
"""
rank_mean = df.stack().groupby(df.rank(method="first").stack().astype(int)).mean()
return df.rank(method="min").stack().astype(int).map(rank_mean).unstack()


def normalize(how: str) -> Callable[..., Any]:
"""Apply a row or column normalization to a pandas DataFrame argument.
@@ -236,8 +269,10 @@ def wrapped_func(*args: str, **kwargs: str) -> Any:
apply_func = lambda df: df.apply(lambda x: x / x.sum(), axis=0)
elif how.lower() in set(["minmax", "min-max", "min_max"]):
apply_func = lambda df: (df - df.min()) / (df.max() - df.min())
elif how.lower() in set(["median_column", "median_col"]):
apply_func = lambda df: df - df.median(axis=0)
elif how.lower() in set(["median", "median_column", "median_col"]):
apply_func = lambda df: median_normalize(df)
elif how.lower() in set(["quantile", "quantile_column", "quantile_col"]):
apply_func = lambda df: quantile_normalize(df)
else:
raise ValueError(
"Invalid input value for 'how'. Needs to be one of {'row', 'colum', 'minmax'}."
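
Both new normalizations are reached through the existing normalize decorator, so callers only pick a mode by name. A minimal usage sketch, assuming the package import path talus_utils.dataframe and a throwaway pass-through function as the wrapped callable (both are assumptions, mirroring how the tests below call the decorator):

import numpy as np
import pandas as pd

from talus_utils import dataframe  # assumed import path for the package


def passthrough(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical stand-in for whatever function consumes the normalized frame.
    return df


raw = pd.DataFrame(np.random.rand(5, 5) * 100)

# Divide every column by its median (median_normalize), so each column's median becomes 1.
median_normalized = dataframe.normalize(how="median_column")(passthrough)(raw)

# Replace every value by the mean of the equally ranked values across columns (quantile_normalize).
quantile_normalized = dataframe.normalize(how="quantile_column")(passthrough)(raw)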
104 changes: 0 additions & 104 deletions tests/test_algorithms.py
@@ -11,110 +11,6 @@
DATA_DIR = Path(__file__).resolve().parent.joinpath("data")


def test_get_outlier_peptide_intensities() -> None:
"""Test the get_outlier_peptide_intensities function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("quant_peptides_outliers.csv"))
df_expected = df_expected.set_index("Peptide")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["Protein", "numFragments"], axis=1)
df_input = df_input.drop_duplicates(subset="Peptide")
df_input = df_input.set_index(["Peptide"])

df_actual = algorithms.get_outlier_peptide_intensities(peptide_intensities=df_input)

assert_frame_equal(df_actual, df_expected)


def test_get_outlier_peptide_intensities_above_below() -> None:
"""Test the get_outlier_peptide_intensities function with split_above_below."""
df_expected_above = pd.read_csv(
DATA_DIR.joinpath("quant_peptides_outliers_above_mean.csv")
)
df_expected_above = df_expected_above.set_index("Peptide")

df_expected_below = pd.read_csv(
DATA_DIR.joinpath("quant_peptides_outliers_below_mean.csv")
)
df_expected_below = df_expected_below.set_index("Peptide")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["Protein", "numFragments"], axis=1)
df_input = df_input.drop_duplicates(subset="Peptide")
df_input = df_input.set_index(["Peptide"])

df_actual_above, df_actual_below = algorithms.get_outlier_peptide_intensities(
peptide_intensities=df_input, split_above_below=True
)

assert_frame_equal(df_actual_above, df_expected_above)
assert_frame_equal(df_actual_below, df_expected_below)


def test_get_hits_for_proteins() -> None:
"""Test the get_hits_for_proteins function."""
# using min_peptides = 1 for testing purposes
min_peptides = 1
df_expected = pd.read_csv(DATA_DIR.joinpath("hits_for_proteins.csv"))
df_expected = df_expected.set_index("Protein")

df_outlier_peptides = pd.read_csv(DATA_DIR.joinpath("quant_peptides_outliers.csv"))
df_outlier_peptides = df_outlier_peptides.set_index("Peptide")

df_peptides = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_peptides = df_peptides[["Peptide", "Protein"]]
df_peptides["NumPeptides"] = df_peptides.groupby("Protein").transform("count")
df_peptides = df_peptides[df_peptides["NumPeptides"] >= min_peptides]

df_actual = algorithms.get_hits_for_proteins(
outlier_peptide_intensities=df_outlier_peptides,
peptide_df=df_peptides,
)

assert_frame_equal(df_actual, df_expected)


def test_hit_selection() -> None:
"""Test the hit_selection function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("hits_for_proteins.csv"))
df_expected = df_expected.set_index("Protein")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["numFragments"], axis=1)

df_actual = algorithms.hit_selection(
peptide_df=df_input,
min_peptides=1,
)

assert_frame_equal(df_actual, df_expected)


def test_hit_selection_above_below() -> None:
"""Test the hit_selection function with split_above_below."""
df_expected_above = pd.read_csv(
DATA_DIR.joinpath("hits_for_proteins_above_mean.csv")
)
df_expected_above = df_expected_above.set_index("Protein")

df_expected_below = pd.read_csv(
DATA_DIR.joinpath("hits_for_proteins_below_mean.csv")
)
df_expected_below = df_expected_below.set_index("Protein")

df_input = pd.read_csv(DATA_DIR.joinpath("quant_peptides.txt"), sep="\t")
df_input = df_input.drop(["numFragments"], axis=1)

df_actual_above, df_actual_below = algorithms.hit_selection(
peptide_df=df_input,
min_peptides=1,
split_above_below=True,
)

assert_frame_equal(df_actual_above, df_expected_above)
assert_frame_equal(df_actual_below, df_expected_below)


def test_subcellular_enrichment_scores() -> None:
"""Test the subcellular_enrichment_scores function."""
df_expected = pd.read_csv(DATA_DIR.joinpath("subcellular_enrichment_scores.csv"))
19 changes: 18 additions & 1 deletion tests/test_dataframe.py
@@ -192,12 +192,29 @@ def test_normalize_column() -> None:
def test_normalize_median_column() -> None:
"""Test the normalize decorator with how='median_column'."""
df_input = pd.DataFrame(np.random.rand(5, 5) * 100)
df_expected = df_input - df_input.median(axis=0)
df_expected = df_input / df_input.median(axis=0)

df_actual = dataframe.normalize(how="median_column")(dummy_function)(df_input)
assert_frame_equal(df_actual, df_expected)


def test_normalize_quantile_column() -> None:
"""Test the normalize decorator with how='quantile_column'."""
df_input = pd.DataFrame(np.random.rand(5, 5) * 100)

rank_mean = (
df_input.stack()
.groupby(df_input.rank(method="first").stack().astype(int))
.mean()
)
df_expected = (
df_input.rank(method="min").stack().astype(int).map(rank_mean).unstack()
)

df_actual = dataframe.normalize(how="quantile_column")(dummy_function)(df_input)
assert_frame_equal(df_actual, df_expected)


def test_sort_row_values_value_error() -> None:
"""Test the sort_by decorator with a value error."""
df_input = pd.DataFrame([{"test": "a", "test2": "b"}, {"test": "c", "test2": "d"}])
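
To make the quantile normalization concrete, here is a small worked example; the toy frame and the import path are assumptions, but the numbers follow the rank-mean recipe added above:

import pandas as pd

from talus_utils.dataframe import quantile_normalize  # assumed import path

df = pd.DataFrame(
    {
        "A": [5.0, 2.0, 3.0, 4.0],
        "B": [4.0, 1.0, 4.0, 2.0],
        "C": [3.0, 4.0, 6.0, 8.0],
    }
)

# Mean of the values at each rank across columns:
#   rank 1 -> (2 + 1 + 3) / 3 = 2.0
#   rank 2 -> (3 + 2 + 4) / 3 = 3.0
#   rank 3 -> (4 + 4 + 6) / 3 ~ 4.67
#   rank 4 -> (5 + 4 + 8) / 3 ~ 5.67
# Every value is replaced by the mean for its rank, so all columns end up with
# the same distribution; the tied 4s in column B both map to the rank-3 mean
# because output ranks are assigned with method="min".
print(quantile_normalize(df))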
