diff --git a/CHANGELOG b/CHANGELOG index e71364d..adb019d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,8 @@ +v0.7.0 +- Add support for modified sequence grouping and aggregating +- Fix SyntaxWarning in docstrings +- Expose 'annotators' submodule as part of the public API + v0.6.1 - Add support for other organisms (#10) diff --git a/psite_annotation/__init__.py b/psite_annotation/__init__.py index f5143ad..10db656 100644 --- a/psite_annotation/__init__.py +++ b/psite_annotation/__init__.py @@ -1,6 +1,7 @@ """Convenience functions for annotating a pandas dataframe with a variety of annotations.""" from .functional_annotation import * # noqa: F401,F403 +from . import annotators """Get version from distribution and set copyright.""" __version__ = "0.0.0" diff --git a/psite_annotation/annotators/__init__.py b/psite_annotation/annotators/__init__.py index fda3a13..6014162 100644 --- a/psite_annotation/annotators/__init__.py +++ b/psite_annotation/annotators/__init__.py @@ -25,3 +25,5 @@ from .psp_studies import PSPStudiesAnnotator from .ptm_turnover import PTMTurnoverAnnotator from .site_sequence_context import SiteSequenceContextAnnotator +from .modified_sequence_group import ModifiedSequenceGroupAnnotator +from .modified_sequence_aggregation import ModifiedSequenceAggregatorAnnotator diff --git a/psite_annotation/annotators/clinical_basket.py b/psite_annotation/annotators/clinical_basket.py index 9d5185a..23bcf80 100644 --- a/psite_annotation/annotators/clinical_basket.py +++ b/psite_annotation/annotators/clinical_basket.py @@ -34,7 +34,7 @@ def __init__(self, annotation_file: str): self.basket_df = None def load_annotations(self) -> None: - """Reads in excel file with basket-gene annotations. + r"""Reads in excel file with basket-gene annotations. 
Creates a dataframe `basket_df` with two columns\: @@ -62,7 +62,7 @@ def load_annotations(self) -> None: @check_columns(["Gene names"]) def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame: - """Adds column with baskets the gene names correspond to. + r"""Adds column with baskets the gene names correspond to. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/domain.py b/psite_annotation/annotators/domain.py index e61099d..84b6cc0 100644 --- a/psite_annotation/annotators/domain.py +++ b/psite_annotation/annotators/domain.py @@ -48,7 +48,7 @@ def load_annotations(self) -> None: @check_columns(["Matched proteins", "Start positions", "End positions"]) def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame: - """Adds column with domains the peptide overlaps with. + r"""Adds column with domains the peptide overlaps with. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/in_vitro_kinases.py b/psite_annotation/annotators/in_vitro_kinases.py index bb0caa9..e494af0 100644 --- a/psite_annotation/annotators/in_vitro_kinases.py +++ b/psite_annotation/annotators/in_vitro_kinases.py @@ -44,7 +44,7 @@ def load_annotations(self) -> None: @check_columns(["Site positions"]) def annotate(self, df: pd.DataFrame) -> pd.DataFrame: - """Adds column with phosphorylating kinases. + r"""Adds column with phosphorylating kinases. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/kinase_library.py b/psite_annotation/annotators/kinase_library.py index c014b4a..e1f4778 100644 --- a/psite_annotation/annotators/kinase_library.py +++ b/psite_annotation/annotators/kinase_library.py @@ -77,7 +77,7 @@ def load_annotations(self) -> None: @check_columns(["Site sequence context"]) def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame: - """Adds column with motifs the site sequence context matches with. 
# adapted from phospho_delocalization.py (Florian P. Bayer - 2025)
from typing import Any, Optional

import pandas as pd

from .annotator_base import check_columns


class ModifiedSequenceAggregatorAnnotator:
    """Aggregate a dataframe per modified sequence group and add a representative sequence.

    Example:
        ::

            annotator = ModifiedSequenceAggregatorAnnotator(
                experiment_cols=["Experiment 1", "Experiment 2"]
            )
            df = annotator.annotate(df)
    """

    def __init__(
        self,
        experiment_cols: list[str],
        agg_func: str = "mean",
        agg_cols: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the options for ModifiedSequenceAggregatorAnnotator.

        Args:
            experiment_cols: column names with quantitative values.
            agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.
            agg_cols: optional mapping of additional column name to aggregation
                function; merged into the aggregation spec and overrides the
                entry of an experiment column with the same name.

        """
        # Accept any iterable of names (e.g. a pandas Index) but store a plain list.
        self.experiment_cols = list(experiment_cols)
        self.agg_func = agg_func
        # Copy so later changes to the caller's dict do not leak into the annotator.
        self.agg_cols = dict(agg_cols) if agg_cols else {}

    def load_annotations(self) -> None:
        """Nothing to load; this annotator works on the dataframe alone."""
        pass

    @check_columns(
        ["Modified sequence", "Delocalized sequence", "Modified sequence group"]
    )
    def annotate(self, df: pd.DataFrame) -> pd.DataFrame:
        r"""Aggregate quantitative values per modified sequence group.

        The input rows are grouped by 'Delocalized sequence' and
        'Modified sequence group' and collapsed into one row per group.

        The following columns are added to the dataframe\:

        - 'Modified sequence representative' = A single representative sequence
          selected from the group, i.e. the one observed (non-NaN) in the most
          experiment columns.
        - 'Modified sequence representative degree' = Observation count of the
          representative divided by the summed observation counts of the group.

        All experiment columns are aggregated per group with `agg_func`
        (default 'mean'); columns listed in `agg_cols` use their own function.

        Args:
            df: Input dataframe with 'Modified sequence', 'Delocalized sequence'
                and 'Modified sequence group' columns plus the experiment columns.

        Returns:
            pd.DataFrame: New dataframe with one row per modified sequence group.
        """
        # TODO: implement inplace option. does not work currently because groupby().agg() cannot be done inplace

        # Representative sequence (and its share of the observations) per group.
        df_representative = find_representative_modified_sequence(
            df, self.experiment_cols
        )

        # Later keys win, so agg_cols can override individual experiment columns.
        agg_spec = {
            **{col: self.agg_func for col in self.experiment_cols},
            **self.agg_cols,
        }
        df_agg = (
            df.groupby(["Delocalized sequence", "Modified sequence group"])
            .agg(agg_spec)
            .reset_index()
        )

        # df_representative is indexed by 'Modified sequence group'.
        return df_representative.merge(df_agg, on="Modified sequence group")


def find_representative_modified_sequence(
    df: pd.DataFrame, observation_cols: list[str]
) -> pd.DataFrame:
    """
    Counts in how many observation columns each modified sequence was observed and picks, per
    'Modified sequence group', the sequence with the most observations as representative.
    Missing is indicated as NaN. Any other value is considered an observation.
    Ties are resolved in favour of the row that appears first.

    Parameters
    ----------
    df : pd.DataFrame
        a DataFrame with columns <'Modified sequence', 'Modified sequence group'>, observation_cols
    observation_cols : list of cols
        the names of the columns that are used for counting if a peptide was observed.

    Returns
    -------
    df : pd.DataFrame
        A new DataFrame indexed by 'Modified sequence group' with cols
        ['Modified sequence representative', 'Modified sequence representative degree'].
        The degree is NaN for groups without any observation.

    Raises
    ------
    KeyError
        If a required column is missing from df.
    """
    col = "Modified sequence"
    group_col = "Modified sequence group"
    count_col = "Modified sequence count"
    observation_cols = list(observation_cols)

    # Explicit check instead of assert: asserts vanish under ``python -O``.
    missing = [c for c in [col, group_col, *observation_cols] if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Work on a fresh positional index so the idxmax lookup below stays
    # unambiguous even if the input index has duplicate labels; the input
    # dataframe itself is left untouched.
    work = df[[col, group_col]].reset_index(drop=True)
    work[count_col] = df[observation_cols].notna().sum(axis=1).to_numpy()

    counts_by_group = work.groupby(group_col)[count_col]
    representative_rows = counts_by_group.idxmax()
    # Series division yields NaN (not an error) for groups with zero observations.
    degree = counts_by_group.max() / counts_by_group.sum()

    return pd.DataFrame(
        {
            f"{col} representative": work.loc[representative_rows, col].to_numpy(),
            f"{col} representative degree": degree.to_numpy(),
        },
        index=representative_rows.index,
    )
# adapted from phospho_delocalization.py (Florian P. Bayer - 2025)

import re
import itertools

import pandas as pd
import numpy as np
from scipy.cluster import hierarchy

from .annotator_base import check_columns

PHOSPHORYLATION_PATTERN = re.compile(r"\(ph\)")


class ModifiedSequenceGroupAnnotator:
    """Annotate pandas dataframe with modified sequence groups where localizations are within `match_tolerance` of each other.

    Example:
        ::

            annotator = ModifiedSequenceGroupAnnotator(match_tolerance=2)
            df = annotator.annotate(df)
    """

    def __init__(
        self,
        match_tolerance: int = 2,
    ) -> None:
        """
        Initialize the options for ModifiedSequenceGroupAnnotator.

        Args:
            match_tolerance: group all modifiable positions within n positions of modified sites.

        """
        self.match_tolerance = match_tolerance

    def load_annotations(self) -> None:
        """Nothing to load; this annotator works on the dataframe alone."""
        pass

    @check_columns(["Modified sequence"])
    def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
        r"""Group delocalized phospho-forms.

        Peptide sequences that differ only by the position of their
        phosphorylation (`(ph)`) group are assigned to a common group when
        their phospho-positions lie within `match_tolerance` of each other.

        The following columns are added to the dataframe\:

        - 'Delocalized sequence' = Modified sequence with all `(ph)` removed and
          a `_N` suffix giving the number of phosphorylations.
        - 'Modified sequence group' = All peptide variants belonging to the same
          delocalized group, concatenated with semicolons.

        Args:
            df: Input dataframe with:
                - `"Modified sequence"` column containing peptide strings with `(ph)` annotations
            inplace: add the new columns to df in place

        Returns:
            pd.DataFrame: Dataframe with the added columns; this is `df` itself
            when `inplace` is True, otherwise an annotated copy.
        """
        annotated_df = df if inplace else df.copy()

        # Add delocalized sequences.
        annotated_df["Delocalized sequence"] = delocalize_phospho_sequence(
            annotated_df["Modified sequence"]
        )

        # Add modified sequence group clusters.
        annotated_df["Modified sequence group"] = aggregate_phospho_groups(
            annotated_df, self.match_tolerance
        )

        # Always return the frame (previously None was returned for inplace=True,
        # contradicting the declared return type).
        return annotated_df


def extract_phos_positions(mod_seq: str, pattern: re.Pattern) -> np.ndarray:
    """
    Parses a modified sequence and reports, for every match of the pattern, the index of the
    character directly preceding it in the sequence with all matches removed, as integer numpy array.
    """
    positions = []
    removed = 0  # characters of earlier matches that are absent from the stripped sequence
    for match in pattern.finditer(mod_seq):
        positions.append(match.start() - removed - 1)
        removed += match.end() - match.start()
    return np.array(positions, dtype=int)


def positional_distance(a: np.ndarray, b: np.ndarray) -> int:
    """
    Calculates the positional distance of two position ptm arrays a and b.
    If multiple positions exist, its the maximal distance that defines the distance.
    Two empty arrays (no modification at all) have distance 0.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.size == 0 and b.size == 0:
        # max() of an empty array would raise; identical (empty) site sets are distance 0.
        return 0
    return int(np.max(np.abs(a - b)))


def find_clusters(seqs_pos, max_distance: int) -> np.ndarray:
    """
    Clusters a group of position ptm arrays if they are closer than max_distance.

    Parameters
    ----------
    seqs_pos : array-like [positions of sequence A, positions of sequence B, ...]
        A list of positional sequences
    max_distance : int >= 0
        The maximal distance two ptm positions can be apart to be considered similar.

    Returns
    -------
    cluster_ids : array-like
        a list of cluster integer ids in the same order as the seqs_pos input list.
    """
    n_seqs = len(seqs_pos)
    if n_seqs < 2:
        # Nothing to compare: a single sequence forms cluster 0 on its own.
        return np.zeros(n_seqs, dtype=int)

    # itertools.combinations yields pairs in the order of a condensed distance matrix.
    condensed_distances = np.array(
        [positional_distance(a, b) for a, b in itertools.combinations(seqs_pos, 2)],
        dtype=float,
    )
    linkage_matrix = hierarchy.linkage(condensed_distances, method="single")
    return hierarchy.fcluster(linkage_matrix, t=max_distance, criterion="distance")


def _progress_aware(obj, method_name: str):
    """Return obj's tqdm `progress_<method_name>` if it is registered, else the plain pandas method."""
    return getattr(obj, f"progress_{method_name}", None) or getattr(obj, method_name)


def aggregate_phospho_groups(df: pd.DataFrame, match_tolerance: int) -> pd.Series:
    """
    This function delocalizes ptm-positions in modified sequences by match_tolerance and combines them if they are present in the data.

    Parameters
    ----------
    df : pd.DataFrame
        a DataFrame with columns <'Modified sequence', 'Delocalized sequence'>
    match_tolerance : int > 0
        the matching tolerance that specifies how close two ptm sites can be to be considered the same.

    Returns
    -------
    mod_seqs_clusters : pd.Series()
        For every input row, the sorted unique modified sequences of its group joined by ';'.

    Raises
    ------
    KeyError
        If 'Modified sequence' or 'Delocalized sequence' is missing from df.
    ValueError
        If match_tolerance is not positive.
    """
    # Explicit checks instead of assert: asserts vanish under ``python -O``.
    required = ["Modified sequence", "Delocalized sequence"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")
    if not match_tolerance > 0:
        raise ValueError(f"match_tolerance must be > 0, got {match_tolerance!r}")

    # dont work on input; only the two required columns are needed
    work = df[required].copy()

    # Positional array per sequence (with a tqdm progress bar when available).
    work["Positional array"] = _progress_aware(work["Modified sequence"], "apply")(
        extract_phos_positions, pattern=PHOSPHORYLATION_PATTERN
    )

    # Cluster the positional arrays within each delocalized sequence.
    clusters = _progress_aware(
        work.groupby("Delocalized sequence")["Positional array"], "transform"
    )(find_clusters, max_distance=int(match_tolerance))
    work["Modified sequence group"] = (
        work["Delocalized sequence"] + "_" + clusters.astype(str)
    )

    # Replace the internal cluster key by the list of member sequences.
    mod_seqs_clusters = _progress_aware(
        work.groupby("Modified sequence group")["Modified sequence"], "transform"
    )(lambda seqs: ";".join(sorted(set(seqs))))

    return mod_seqs_clusters


def delocalize_phospho_sequence(mod_seqs: pd.Series) -> pd.Series:
    """
    Removes the phospho position and adds a _N at the end of the sequence to indicate the number of phosphorylations.
    All other modifications remain untouched.
    Columns-wise operation is 10x faster than apply.

    Parameters
    ----------
    mod_seqs : pd.Series()

    Returns
    -------
    mod_seqs : pd.Series()
    """
    ph_regex = PHOSPHORYLATION_PATTERN.pattern
    # Count literal "(ph)"; an unescaped "(ph)" is a regex group that would
    # also count any bare "ph" substring.
    ph_count = mod_seqs.str.count(ph_regex).fillna(0)
    # De-localize
    return (
        mod_seqs.str.replace(ph_regex, "", regex=True)
        + "_"
        + ph_count.astype(int).astype(str)
    )


def make_monophos_versions(
    mod_seq: str, pattern: re.Pattern = PHOSPHORYLATION_PATTERN
) -> list[str]:
    """
    This function returns a list of all mono-phosphorylated peptide versions given the available positions in the input sequence.
    The order of the output is sorted by the modification position.

    Parameters
    ----------
    mod_seq : str
    pattern : re.Pattern
        pattern of the modification; every match is removed and re-inserted one at a time.

    Returns
    -------
    out : list of str, one sequence per match of the pattern
    """
    base_seq = pattern.sub("", mod_seq)
    out = []
    removed = 0  # characters of earlier matches that are absent from base_seq
    for match in pattern.finditer(mod_seq):
        insert_at = match.start() - removed
        out.append(base_seq[:insert_at] + match.group() + base_seq[insert_at:])
        removed += match.end() - match.start()
    return out
Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/psp_regulatory.py b/psite_annotation/annotators/psp_regulatory.py index f1a8abe..8d8bc0c 100644 --- a/psite_annotation/annotators/psp_regulatory.py +++ b/psite_annotation/annotators/psp_regulatory.py @@ -63,7 +63,7 @@ def load_annotations(self) -> None: @check_columns(["Site positions"]) def annotate(self, df: pd.DataFrame) -> pd.DataFrame: - """Adds columns with number of studies. + r"""Adds columns with number of studies. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/psp_studies.py b/psite_annotation/annotators/psp_studies.py index fc564b3..fe496fb 100644 --- a/psite_annotation/annotators/psp_studies.py +++ b/psite_annotation/annotators/psp_studies.py @@ -60,7 +60,7 @@ def load_annotations(self) -> None: @check_columns(["Site positions"]) def annotate(self, df: pd.DataFrame) -> pd.DataFrame: - """Adds columns with number of studies. + r"""Adds columns with number of studies. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/ptm_turnover.py b/psite_annotation/annotators/ptm_turnover.py index d6dcc50..42f2f47 100644 --- a/psite_annotation/annotators/ptm_turnover.py +++ b/psite_annotation/annotators/ptm_turnover.py @@ -58,7 +58,7 @@ def load_annotations(self) -> None: @check_columns(["Modified sequence"]) def annotate(self, df: pd.DataFrame) -> pd.DataFrame: - """Adds column regarding the PTM turnover behavior. + r"""Adds column regarding the PTM turnover behavior. 
Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/annotators/site_sequence_context.py b/psite_annotation/annotators/site_sequence_context.py index d6417cc..982bf56 100644 --- a/psite_annotation/annotators/site_sequence_context.py +++ b/psite_annotation/annotators/site_sequence_context.py @@ -66,7 +66,7 @@ def load_annotations(self) -> None: @check_columns(["Site positions"]) def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame: - """Adds columns regarding the peptide position within the protein to a pandas dataframe. + r"""Adds columns regarding the peptide position within the protein to a pandas dataframe. Adds the following annotation columns to dataframe\: diff --git a/psite_annotation/functional_annotation.py b/psite_annotation/functional_annotation.py index cfeea5b..366ad7a 100755 --- a/psite_annotation/functional_annotation.py +++ b/psite_annotation/functional_annotation.py @@ -1,6 +1,6 @@ import logging import sys -from typing import Dict +from typing import Dict, Any import pandas as pd @@ -30,6 +30,7 @@ "addInVitroKinases", "addTurnoverRates", "addKinaseLibraryAnnotations", + "aggregateModifiedSequenceGroups", ] defaults, user = _getConfigDicts() @@ -64,7 +65,7 @@ def addPeptideAndPsitePositions( return_sorted: bool = False, organism: str = "human", ) -> pd.DataFrame: - """Annotate pandas dataframe with positions of the peptide within the protein sequence based on a fasta file. + r"""Annotate pandas dataframe with positions of the peptide within the protein sequence based on a fasta file. Adds the following annotation columns to dataframe\: @@ -148,7 +149,7 @@ def addSiteSequenceContext( return_sorted: bool = False, organism: str = "human", ) -> pd.DataFrame: - """Annotate pandas dataframe with sequence context of a p-site. + r"""Annotate pandas dataframe with sequence context of a p-site. 
Adds the following annotation columns to dataframe\: @@ -188,7 +189,7 @@ def addSiteSequenceContext( def addTurnoverRates(df: pd.DataFrame, turnoverFile: str) -> pd.DataFrame: - """Annotate pandas dataframe with PTM turnover behavior. + r"""Annotate pandas dataframe with PTM turnover behavior. Adds column regarding the PTM turnover behavior. @@ -222,7 +223,7 @@ def addTurnoverRates(df: pd.DataFrame, turnoverFile: str) -> pd.DataFrame: def addPSPAnnotations( df: pd.DataFrame, phosphoSitePlusFile: str, organism: str = "human" ) -> pd.DataFrame: - """Annotate pandas dataframe with number of high and low-throughput studies according to PhosphositePlus. + r"""Annotate pandas dataframe with number of high and low-throughput studies according to PhosphositePlus. Adds the following annotation columns to dataframe\: @@ -257,7 +258,7 @@ def addPSPAnnotations( def addPSPRegulatoryAnnotations( df: pd.DataFrame, phosphoSitePlusRegulatoryFile: str, organism: str = "human" ) -> pd.DataFrame: - """Annotate pandas dataframe with regulatory functions according to PhosphositePlus. + r"""Annotate pandas dataframe with regulatory functions according to PhosphositePlus. Adds the following annotation columns to dataframe\: @@ -299,7 +300,7 @@ def addPSPKinaseSubstrateAnnotations( gene_name: bool = False, organism: str = "human", ) -> pd.DataFrame: - """Annotate pandas dataframe with upstream kinases according to PhosphositePlus. + r"""Annotate pandas dataframe with upstream kinases according to PhosphositePlus. Adds the following annotation columns to dataframe\: @@ -335,7 +336,7 @@ def addPSPKinaseSubstrateAnnotations( def addDomains(df: pd.DataFrame, domainMappingFile: str) -> pd.DataFrame: - """Adds column with domains the peptide overlaps with. + r"""Adds column with domains the peptide overlaps with. 
def aggregateModifiedSequenceGroups(
    df: pd.DataFrame,
    experiment_cols: list[str],
    agg_cols: dict[str, Any] = None,
    match_tolerance: int = 2,
    agg_func: str = "mean",
) -> pd.DataFrame:
    r"""Group modified sequences with nearby localizations and aggregate each group.

    First runs the ModifiedSequenceGroupAnnotator on the dataframe, then passes
    the result to the ModifiedSequenceAggregatorAnnotator.

    Adds the following annotation columns to dataframe\:

    - 'Delocalized sequence' = Canonical unmodified backbone with an index
      suffix to distinguish the number of modifications.
    - 'Modified sequence group' = All peptide variants belonging to the same
      delocalized group, concatenated with semicolons.
    - 'Modified sequence representative' = A single representative sequence
      selected from the group, i.e. the most frequently measured across experiments.
    - 'Modified sequence representative degree' = Fraction of summed observation
      frequency contributed by the representative peptide.

    The experiment columns are aggregated per group with `agg_func`.

    Example:
        ::

            df = pa.aggregateModifiedSequenceGroups(
                df, experiment_cols=["Experiment 1", "Experiment 2"]
            )

    Required columns:
        :code:`Modified sequence`

    Args:
        df: pandas dataframe with 'Modified sequence' column
        experiment_cols: list of column names with quantitative values.
        agg_cols: optional extra aggregation options, forwarded unchanged to
            the ModifiedSequenceAggregatorAnnotator.
        match_tolerance: group all modifiable positions within n positions of modified sites.
        agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.

    Returns:
        pd.DataFrame: annotated and aggregated dataframe

    """
    # Step 1: assign every modified sequence to its group.
    grouped_df = annotators.ModifiedSequenceGroupAnnotator(
        match_tolerance=match_tolerance
    ).annotate(df)

    # Step 2: aggregate the grouped dataframe.
    aggregator = annotators.ModifiedSequenceAggregatorAnnotator(
        experiment_cols=experiment_cols,
        agg_func=agg_func,
        agg_cols=agg_cols,
    )
    return aggregator.annotate(grouped_df)
import pytest
import pandas as pd
import numpy as np

from psite_annotation.annotators import modified_sequence_aggregation

# Backbone shared by all test peptides (phospho groups removed).
BACKBONE = "(ac)ASNSWNASSSPGEAREDGPEGLDK"

# The three modified sequence groups present in the test data.
GROUP_MONO_S2_S4 = "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK"
GROUP_DOUBLE = "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK"
GROUP_MONO_S8_S10 = "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK"

N_EXPERIMENTS = 20


def _experiment_columns(df: pd.DataFrame) -> list:
    """Return the names of the quantitative 'Experiment N' columns as a list."""
    return [c for c in df.columns if c.startswith("Experiment")]


def _expected_representatives() -> dict:
    """Expected representative columns, ordered by sorted group name."""
    return {
        "Modified sequence group": [
            GROUP_MONO_S2_S4,
            GROUP_DOUBLE,
            GROUP_MONO_S8_S10,
        ],
        "Modified sequence representative": [
            "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
            "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
            "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
        ],
        "Modified sequence representative degree": [
            0.9,
            0.6666666666666666,
            0.5909090909090909,
        ],
    }


@pytest.fixture
def modified_sequence_df() -> pd.DataFrame:
    """Eight phospho-forms of one peptide quantified in 20 experiments (NaN = not observed)."""
    return pd.DataFrame(
        {
            "Modified sequence": [
                "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
                "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK",
                "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK",
                "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
                "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK",
                "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK",
                "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
                "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK",
            ],
            "Experiment 1": [10.0, np.nan, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan],
            "Experiment 2": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 3": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0],
            "Experiment 4": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 5": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 6": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 7": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan],
            "Experiment 8": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 9": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan],
            "Experiment 10": [np.nan, 10.0, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan],
            "Experiment 11": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 12": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan],
            "Experiment 13": [np.nan, 10.0, np.nan, 10.0, 10.0, np.nan, 5.0, np.nan],
            "Experiment 14": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan],
            "Experiment 15": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 16": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0],
            "Experiment 17": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 18": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 19": [np.nan, np.nan, 10.0, 10.0, np.nan, np.nan, 5.0, np.nan],
            "Experiment 20": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
        }
    )


@pytest.fixture
def delocalized_sequence_series() -> pd.Series:
    """'Delocalized sequence' value for every row of modified_sequence_df."""
    return pd.Series(
        [f"{BACKBONE}_1"] * 3 + [f"{BACKBONE}_2"] * 3 + [f"{BACKBONE}_1"] * 2,
        name="Modified sequence",
    )


@pytest.fixture
def delocalized_sequence_df(
    modified_sequence_df: pd.DataFrame, delocalized_sequence_series: pd.Series
) -> pd.DataFrame:
    """Input data plus the 'Delocalized sequence' column."""
    # Copy so the upstream fixture's frame is not mutated.
    df = modified_sequence_df.copy()
    df.insert(loc=1, column="Delocalized sequence", value=delocalized_sequence_series)
    return df


@pytest.fixture
def sequence_groups_series() -> pd.Series:
    """'Modified sequence group' value for every row of modified_sequence_df."""
    return pd.Series(
        [GROUP_MONO_S8_S10] * 3 + [GROUP_DOUBLE] * 3 + [GROUP_MONO_S2_S4] * 2,
        name="Modified sequence",
    )


@pytest.fixture
def sequence_groups_df(
    delocalized_sequence_df: pd.DataFrame, sequence_groups_series: pd.Series
) -> pd.DataFrame:
    """Input data plus 'Delocalized sequence' and 'Modified sequence group' columns."""
    # Copy so the upstream fixture's frame is not mutated.
    df = delocalized_sequence_df.copy()
    df.insert(loc=2, column="Modified sequence group", value=sequence_groups_series)
    return df


@pytest.fixture
def representative_sequence_df() -> pd.DataFrame:
    """Expected output of find_representative_modified_sequence (indexed by group)."""
    return pd.DataFrame(_expected_representatives()).set_index(
        "Modified sequence group"
    )


@pytest.fixture
def annotated_expected_df() -> pd.DataFrame:
    """Expected output of the aggregator with the default 'mean' aggregation."""
    expected = _expected_representatives()
    expected["Delocalized sequence"] = [
        f"{BACKBONE}_1",
        f"{BACKBONE}_2",
        f"{BACKBONE}_1",
    ]
    # Every experiment has the same per-group mean in the test data.
    expected.update(
        {f"Experiment {i}": [5.0, 10.0, 10.0] for i in range(1, N_EXPERIMENTS + 1)}
    )
    return pd.DataFrame(expected)


class TestFindRepresentativeModifiedSequence:
    def test_find_representative_modified_sequence(
        self, sequence_groups_df: pd.DataFrame, representative_sequence_df: pd.DataFrame
    ):
        df_representative = (
            modified_sequence_aggregation.find_representative_modified_sequence(
                sequence_groups_df, _experiment_columns(sequence_groups_df)
            )
        )
        pd.testing.assert_frame_equal(df_representative, representative_sequence_df)


class TestModifiedSequenceAggregatorAnnotator:
    def test_annotate_aggregates_groups(
        self, sequence_groups_df: pd.DataFrame, annotated_expected_df: pd.DataFrame
    ):
        annotator = modified_sequence_aggregation.ModifiedSequenceAggregatorAnnotator(
            experiment_cols=_experiment_columns(sequence_groups_df)
        )
        annotated_df = annotator.annotate(sequence_groups_df)
        # check_like: column order of the merged result is not part of the contract.
        pd.testing.assert_frame_equal(
            annotated_df, annotated_expected_df, check_like=True
        )
b/tests/unit_tests/annotators/test_modified_sequence_group.py @@ -0,0 +1,161 @@ +import pytest +import pandas as pd +import numpy as np + +from psite_annotation.annotators import modified_sequence_group + + +@pytest.fixture +def modified_sequence_df() -> pd.DataFrame: + df = pd.DataFrame( + { + "Modified sequence": [ + "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK", + ], + "Experiment 1": [10.0, np.nan, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 2": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 3": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + "Experiment 4": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 5": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 6": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 7": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 8": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 9": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 10": [np.nan, 10.0, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 11": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 12": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 13": [np.nan, 10.0, np.nan, 10.0, 10.0, np.nan, 5.0, np.nan], + "Experiment 14": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 15": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 16": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + "Experiment 17": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 18": [10.0, np.nan, np.nan, 
10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 19": [np.nan, np.nan, 10.0, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 20": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + } + ) + return df + + +@pytest.fixture +def delocalized_sequence_series(): + deloc_seqs = pd.Series( + [ + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + ], + name="Modified sequence", + ) + return deloc_seqs + + +@pytest.fixture +def delocalized_sequence_df( + modified_sequence_df: pd.DataFrame, delocalized_sequence_series: pd.Series +): + modified_sequence_df.insert( + loc=1, column="Delocalized sequence", value=delocalized_sequence_series + ) + return modified_sequence_df + + +@pytest.fixture +def sequence_groups_series(): + delocalized_sequence_groups = pd.Series( + [ + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + ], + name="Modified sequence", + ) + return delocalized_sequence_groups + + +@pytest.fixture +def sequence_groups_df( + delocalized_sequence_df: pd.DataFrame, sequence_groups_series: 
pd.Series +): + delocalized_sequence_df.insert( + loc=2, column="Modified sequence group", value=sequence_groups_series + ) + return delocalized_sequence_df + + +class TestDelocalizeSequence: + def test_delocalize_sequence( + self, modified_sequence_df: pd.DataFrame, delocalized_sequence_series: pd.Series + ): + deloc_seqs = modified_sequence_group.delocalize_phospho_sequence( + modified_sequence_df["Modified sequence"] + ) + pd.testing.assert_series_equal(deloc_seqs, delocalized_sequence_series) + + +class TestAggregateGroups: + def test_aggregate_ptm_groups( + self, delocalized_sequence_df: pd.DataFrame, sequence_groups_series: pd.Series + ): + sequence_groups = modified_sequence_group.aggregate_phospho_groups( + delocalized_sequence_df, match_tolerance=2 + ) + pd.testing.assert_series_equal(sequence_groups, sequence_groups_series) + + +class TestDelocalizationAnnotator: + def test_delocalization_annotator( + self, modified_sequence_df: pd.DataFrame, sequence_groups_df: pd.DataFrame + ): + annotator = modified_sequence_group.ModifiedSequenceGroupAnnotator( + match_tolerance=2 + ) + annotated_df = annotator.annotate(modified_sequence_df) + pd.testing.assert_frame_equal(annotated_df, sequence_groups_df) + + +class TestMakeMonophosVersions: + """Unit tests for make_monophos_versions function.""" + + def test_multiple_sites(self): + mod_seq = "(ph)ACDE(ph)FGHIK" + expected = [ + "(ph)ACDEFGHIK", # phosphorylation at position 0 + "ACDE(ph)FGHIK", # phosphorylation at position 4 + ] + assert modified_sequence_group.make_monophos_versions(mod_seq) == expected + + def test_no_sites(self): + mod_seq = "ACDEFGHIK" + assert modified_sequence_group.make_monophos_versions(mod_seq) == [] + + def test_single_site(self): + mod_seq = "ACDE(ph)FGHIK" + expected = ["ACDE(ph)FGHIK"] + assert modified_sequence_group.make_monophos_versions(mod_seq) == expected + + def test_ordering(self): + mod_seq = "A(ph)C(ph)D(ph)E" + expected = [ + "A(ph)CDE", # position 1 + "AC(ph)DE", # position 
2 + "ACD(ph)E", # position 3 + ] + assert modified_sequence_group.make_monophos_versions(mod_seq) == expected diff --git a/tests/unit_tests/test_functional_annotation.py b/tests/unit_tests/test_functional_annotation.py index d0e0213..4522788 100644 --- a/tests/unit_tests/test_functional_annotation.py +++ b/tests/unit_tests/test_functional_annotation.py @@ -1,6 +1,9 @@ import unittest import unittest.mock + +import pytest import pandas as pd +import numpy as np import psite_annotation.functional_annotation as pa @@ -35,9 +38,140 @@ class TestAddSiteSequenceContext: create=True, ) def test_add_site_sequence_context(self): - df = pd.DataFrame({"Site positions": ["Q9Y3C8_T6", "Q9Y3C8_Q167", "Q9Y3C8_Q168"]}) + df = pd.DataFrame( + {"Site positions": ["Q9Y3C8_T6", "Q9Y3C8_Q167", "Q9Y3C8_Q168"]} + ) result_df = pa.addSiteSequenceContext(df, "mock_input_file_psp", pspInput=True) - - assert result_df["Site sequence context"].iloc[0] == "__________MADEAtRRVVSEIPVLKTNAG" - assert result_df["Site sequence context"].iloc[1] == "DLIQKGVIQHKEKCNq_______________" + + assert ( + result_df["Site sequence context"].iloc[0] + == "__________MADEAtRRVVSEIPVLKTNAG" + ) + assert ( + result_df["Site sequence context"].iloc[1] + == "DLIQKGVIQHKEKCNq_______________" + ) assert result_df["Site sequence context"].iloc[2] == "" + + +@pytest.fixture +def modified_sequence_df() -> pd.DataFrame: + df = pd.DataFrame( + { + "Modified sequence": [ + "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK", + ], + "Experiment 1": [10.0, np.nan, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 2": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 3": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + 
"Experiment 4": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 5": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 6": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 7": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 8": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 9": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 10": [np.nan, 10.0, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 11": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 12": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 13": [np.nan, 10.0, np.nan, 10.0, 10.0, np.nan, 5.0, np.nan], + "Experiment 14": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 15": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 16": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + "Experiment 17": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 18": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 19": [np.nan, np.nan, 10.0, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 20": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + } + ) + return df + + +@pytest.fixture +def annotated_expected_df(): + return pd.DataFrame( + { + "Modified sequence group": [ + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + ], + "Modified sequence representative": [ + "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + ], + "Modified sequence representative degree": [ + 0.9, + 0.6666666666666666, + 0.5909090909090909, + 
], + "Delocalized sequence": [ + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + ], + "Experiment 1": [5.0, 10.0, 10.0], + "Experiment 2": [5.0, 10.0, 10.0], + "Experiment 3": [5.0, 10.0, 10.0], + "Experiment 4": [5.0, 10.0, 10.0], + "Experiment 5": [5.0, 10.0, 10.0], + "Experiment 6": [5.0, 10.0, 10.0], + "Experiment 7": [5.0, 10.0, 10.0], + "Experiment 8": [5.0, 10.0, 10.0], + "Experiment 9": [5.0, 10.0, 10.0], + "Experiment 10": [5.0, 10.0, 10.0], + "Experiment 11": [5.0, 10.0, 10.0], + "Experiment 12": [5.0, 10.0, 10.0], + "Experiment 13": [5.0, 10.0, 10.0], + "Experiment 14": [5.0, 10.0, 10.0], + "Experiment 15": [5.0, 10.0, 10.0], + "Experiment 16": [5.0, 10.0, 10.0], + "Experiment 17": [5.0, 10.0, 10.0], + "Experiment 18": [5.0, 10.0, 10.0], + "Experiment 19": [5.0, 10.0, 10.0], + "Experiment 20": [5.0, 10.0, 10.0], + } + ) + + +class TestAggregateModifiedSequenceGroups: + def test_aggregate_modified_sequence_groups( + self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame + ): + annotated_df = pa.aggregateModifiedSequenceGroups( + modified_sequence_df, + experiment_cols=modified_sequence_df.columns[ + modified_sequence_df.columns.str.startswith("Experiment") + ], + ) + pd.testing.assert_frame_equal(annotated_df, annotated_expected_df) + + def test_aggregate_modified_sequence_groups_extra_columns( + self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame + ): + modified_sequence_df["Gene Names"] = [ + "GeneA", + "GeneA;GeneB", + "GeneB", + "GeneB", + "GeneA;GeneB", + "GeneB", + "GeneC", + "GeneA;GeneB", + ] + annotated_df = pa.aggregateModifiedSequenceGroups( + modified_sequence_df, + experiment_cols=modified_sequence_df.columns[ + modified_sequence_df.columns.str.startswith("Experiment") + ], + agg_cols={"Gene Names": "first"}, + ) + annotated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"] + pd.testing.assert_frame_equal(annotated_df, 
annotated_expected_df)