Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
v0.7.0
- Add support for modified sequence grouping and aggregating
- Fix SyntaxWarning in docstrings
- Expose 'annotators' submodule as part of the public API

v0.6.1
- Add support for other organisms (#10)

Expand Down
1 change: 1 addition & 0 deletions psite_annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Convenience functions for annotating a pandas dataframe with a variety of annotations."""

from .functional_annotation import * # noqa: F401,F403
from . import annotators

"""Get version from distribution and set copyright."""
__version__ = "0.0.0"
Expand Down
2 changes: 2 additions & 0 deletions psite_annotation/annotators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@
from .psp_studies import PSPStudiesAnnotator
from .ptm_turnover import PTMTurnoverAnnotator
from .site_sequence_context import SiteSequenceContextAnnotator
from .modified_sequence_group import ModifiedSequenceGroupAnnotator
from .modified_sequence_aggregation import ModifiedSequenceAggregatorAnnotator
4 changes: 2 additions & 2 deletions psite_annotation/annotators/clinical_basket.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, annotation_file: str):
self.basket_df = None

def load_annotations(self) -> None:
"""Reads in excel file with basket-gene annotations.
r"""Reads in excel file with basket-gene annotations.

Creates a dataframe `basket_df` with two columns\:

Expand Down Expand Up @@ -62,7 +62,7 @@ def load_annotations(self) -> None:

@check_columns(["Gene names"])
def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
"""Adds column with baskets the gene names correspond to.
r"""Adds column with baskets the gene names correspond to.

Adds the following annotation columns to dataframe\:

Expand Down
2 changes: 1 addition & 1 deletion psite_annotation/annotators/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def load_annotations(self) -> None:

@check_columns(["Matched proteins", "Start positions", "End positions"])
def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
"""Adds column with domains the peptide overlaps with.
r"""Adds column with domains the peptide overlaps with.

Adds the following annotation columns to dataframe\:

Expand Down
2 changes: 1 addition & 1 deletion psite_annotation/annotators/in_vitro_kinases.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def load_annotations(self) -> None:

@check_columns(["Site positions"])
def annotate(self, df: pd.DataFrame) -> pd.DataFrame:
"""Adds column with phosphorylating kinases.
r"""Adds column with phosphorylating kinases.

Adds the following annotation columns to dataframe\:

Expand Down
2 changes: 1 addition & 1 deletion psite_annotation/annotators/kinase_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def load_annotations(self) -> None:

@check_columns(["Site sequence context"])
def annotate(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
"""Adds column with motifs the site sequence context matches with.
r"""Adds column with motifs the site sequence context matches with.

Adds the following annotation columns to dataframe\:

Expand Down
137 changes: 137 additions & 0 deletions psite_annotation/annotators/modified_sequence_aggregation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# adapted from phospho_delocalization.py (Florian P. Bayer - 2025)
from typing import Any

import pandas as pd
import numpy as np

from .annotator_base import check_columns


class ModifiedSequenceAggregatorAnnotator:
"""Annotate and aggregate pandas dataframe with representative modified sequence from a modified sequence group.

Example:
::

annotator = ReprentativeModifiedSequenceAnnotator()
df = annotator.annotate(df)
"""

def __init__(
self,
experiment_cols: list[str],
agg_func: str = "mean",
agg_cols: dict[str, Any] = None,
) -> None:
"""
Initialize the options for ReprentativeModifiedSequenceAnnotator.

Args:
experiment_cols: list of column names with quantitative values.
agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.

"""
self.experiment_cols = experiment_cols
self.agg_func = agg_func
self.agg_cols = {}
if agg_cols:
self.agg_cols = agg_cols

def load_annotations(self) -> None:
pass

@check_columns(["Modified sequence", "Delocalized sequence", "Modified sequence group"])
def annotate(self, df: pd.DataFrame, ) -> pd.DataFrame:
r"""Group delocalized phospho-forms and aggregate their quantitative values.

This function identifies peptide sequences that differ only by the position
of their phosphorylation (`(ph)`) group and collapses them into
"delocalized" groups. Each group contains all modified sequence variants
that represent the same underlying peptide backbone.

The following columns are added to the dataframe\:

- 'Modified sequence representative' = A single representative sequence
selected from the group, i.e. the most frequently measured across experiments.
- 'Modified sequence representative degree' = Fraction of summed observation
frequency contributed by the representative peptide.

All experiment columns (e.g. `"Experiment 1"`, `"Experiment 2"`, …) are aggregated
per group by summing the intensities of member sequences.

Args:
df: Input dataframe with:
- `"Modified sequence"` column containing peptide strings with `(ph)` annotations
- 'Delocalized sequence' = Canonical unmodified backbone with an index
suffix to distinguish the number of modifications.
- 'Modified sequence group' = All peptide variants belonging to the same
delocalized group, concatenated with semicolons.

Returns:
pd.DataFrame: Dataframe with grouped phospho-forms and aggregated intensities.
"""
# TODO: implement inplace option. does not work currently because groupby().agg() cannot be done inplace
annotated_df = df

# Determine representative sequence for each cluster based on the observations in the experiments.
# The Modified sequence representative degree gives the proportion of the representative relative to all observations.
df_representative = find_representative_modified_sequence(
annotated_df, self.experiment_cols
)

# Aggregate experiments per group.
df_agg = (
annotated_df.groupby(["Delocalized sequence", "Modified sequence group"])
.agg({x: self.agg_func for x in self.experiment_cols} | self.agg_cols)
.reset_index()
)

df_agg = df_representative.merge(df_agg, on="Modified sequence group")

return df_agg


def find_representative_modified_sequence(
df: pd.DataFrame, observation_cols: list[str]
) -> pd.DataFrame:
"""
This function counts the number of observations of a specific modified sequence and defines the most representative sequence as
the one with most observations. Missing is indicated as NaN. Any other value is considered an observation.

Parameters
----------
df : pd.DataFrame
a DataFrame with columns <'Modified sequence', 'Delocalized sequence'>, observation_cols
observation_cols : list of cols
the names of the columns that are used for counting if a peptide was observed.

Returns
-------
df : pd.DataFrame
A new DataFrame with cols ['Modified sequence representative', 'Modified sequence representative degree']
"""
col = "Modified sequence"
group_col = "Modified sequence group"
count_col = "Modified sequence count"
assert (
(col in df)
and (group_col in df)
and all(c in df.columns for c in observation_cols)
)

# Copy so the original df is not modified
df = df[[col, group_col] + list(observation_cols)].copy()

# Do the grouping and counting
df[count_col] = df[observation_cols].isna().apply(np.logical_not).sum(axis=1)
reprentitive_idx = (
df.groupby(group_col)[count_col].transform(lambda x: x.idxmax()).values
)
df[f"{col} representative"] = df.loc[reprentitive_idx, col].values
df[f"{col} representative degree"] = df.groupby(group_col)[count_col].transform(
lambda x: max(x) / sum(x)
)
out = df.groupby(group_col)[
[f"{col} representative", f"{col} representative degree"]
].first()
return out
Loading