Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
v0.7.1
- Add support for modified sequence grouping without aggregating

v0.7.0
- Add support for modified sequence grouping and aggregating
- Fix SyntaxWarning in docstrings
Expand Down
50 changes: 46 additions & 4 deletions psite_annotation/functional_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,14 +496,53 @@ def addKinaseLibraryAnnotations(
return df


def addModifiedSequenceGroups(
    df: pd.DataFrame,
    match_tolerance: int = 2,
) -> pd.DataFrame:
    r"""Annotate DataFrame with modified sequence groups, without aggregating.

    Requires "Modified sequence" column in the dataframe to be present.

    Adds the following annotation columns to dataframe\:

    - 'Delocalized sequence' = Canonical unmodified backbone with an index
      suffix to distinguish the number of modifications.
    - 'Modified sequence group' = All peptide variants belonging to the same
      delocalized group, concatenated with semicolons.

    Example:
        ::

            df = pa.addModifiedSequenceGroups(df)

    Required columns:
        :code:`Modified sequence`

    Args:
        df: pandas dataframe with 'Modified sequence' column
        match_tolerance: group all modifiable positions within n positions of modified sites.

    Returns:
        pd.DataFrame: annotated dataframe

    """
    # Thin functional wrapper: the grouping itself is delegated to the annotator.
    annotator = annotators.ModifiedSequenceGroupAnnotator(
        match_tolerance=match_tolerance
    )
    return annotator.annotate(df)


def aggregateModifiedSequenceGroups(
df: pd.DataFrame,
experiment_cols: list[str],
agg_cols: dict[str, Any] = None,
match_tolerance: int = 2,
agg_cols: dict[str, Any] = None,
agg_func: str = "mean",
) -> pd.DataFrame:
r"""Annotate DataFrame with representative sequences from grouped localizations.
r"""Annotate and aggregate DataFrame with representative sequences from grouped localizations.

Requires "Modified sequence" column in the dataframe to be present.

Expand All @@ -520,6 +559,7 @@ def aggregateModifiedSequenceGroups(

All experiment columns (e.g. `"Experiment 1"`, `"Experiment 2"`, …) are aggregated
per group by applying `agg_func` (default `"mean"`) to the intensities of member sequences.

Example:
::

Expand All @@ -529,9 +569,11 @@ def aggregateModifiedSequenceGroups(
:code:`Modified sequence`

Args:
df: pandas dataframe with 'Modified sequence' column
df: pandas dataframe with 'Modified sequence' column.
experiment_cols: list of column names with quantitative values.
match_tolerance: group all modifiable positions within n positions of modified sites.
agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.
agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.
agg_cols: dictionary for non-quantitative columns of {column name: aggregation function}.

Returns:
pd.DataFrame: annotated and aggregated dataframe
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "psite-annotation"
version = "0.7.0"
version = "0.7.1"
description = "Module for annotating p-sites based on resources such as PhosphoSitePlus"
authors = ["Matthew The <[email protected]>", "Amirhossein Sakhteman <[email protected]>", "Florian P. Bayer <[email protected]>"]
license= "Apache-2.0"
Expand Down
81 changes: 75 additions & 6 deletions tests/unit_tests/test_functional_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,66 @@ def modified_sequence_df() -> pd.DataFrame:


@pytest.fixture
def annotated_expected_df():
def annotated_expected_df() -> pd.DataFrame:
    """Build the expected annotated (non-aggregated) frame.

    The frame has one row per modified peptide (eight rows), twenty
    ``Experiment N`` intensity columns, and the two annotation columns
    ``Delocalized sequence`` and ``Modified sequence group``.
    """
    nan = np.nan

    modified_sequences = [
        "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
        "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK",
        "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK",
        "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
        "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK",
        "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK",
        "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
        "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK",
    ]

    # Row order of every list below matches `modified_sequences`.
    intensities = {
        "Experiment 1": [10.0, nan, nan, nan, 10.0, nan, 5.0, nan],
        "Experiment 2": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 3": [nan, 10.0, nan, 10.0, nan, nan, nan, 5.0],
        "Experiment 4": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 5": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 6": [nan, 10.0, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 7": [nan, nan, 10.0, nan, 10.0, nan, 5.0, nan],
        "Experiment 8": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 9": [10.0, nan, nan, nan, nan, 10.0, 5.0, nan],
        "Experiment 10": [nan, 10.0, nan, nan, 10.0, nan, 5.0, nan],
        "Experiment 11": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 12": [10.0, nan, nan, nan, nan, 10.0, 5.0, nan],
        "Experiment 13": [nan, 10.0, nan, 10.0, 10.0, nan, 5.0, nan],
        "Experiment 14": [nan, nan, 10.0, nan, 10.0, nan, 5.0, nan],
        "Experiment 15": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 16": [10.0, 10.0, nan, 10.0, nan, nan, nan, 5.0],
        "Experiment 17": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 18": [10.0, nan, nan, 10.0, nan, nan, 5.0, nan],
        "Experiment 19": [nan, nan, 10.0, 10.0, nan, nan, 5.0, nan],
        "Experiment 20": [10.0, 10.0, nan, 10.0, nan, nan, 5.0, nan],
    }

    # Rows 0-2, 3-5 and 6-7 each share one delocalized label and one group string.
    delocalized_one = "(ac)ASNSWNASSSPGEAREDGPEGLDK_1"
    delocalized_two = "(ac)ASNSWNASSSPGEAREDGPEGLDK_2"
    delocalized = [delocalized_one] * 3 + [delocalized_two] * 3 + [delocalized_one] * 2

    group_rows_0_to_2 = "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK"
    group_rows_3_to_5 = "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK"
    group_rows_6_to_7 = "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK"
    groups = [group_rows_0_to_2] * 3 + [group_rows_3_to_5] * 3 + [group_rows_6_to_7] * 2

    # Dict insertion order fixes the column order: sequence, experiments, annotations.
    return pd.DataFrame(
        {
            "Modified sequence": modified_sequences,
            **intensities,
            "Delocalized sequence": delocalized,
            "Modified sequence group": groups,
        }
    )


@pytest.fixture
def aggregated_expected_df():
return pd.DataFrame(
{
"Modified sequence group": [
Expand Down Expand Up @@ -141,20 +200,30 @@ def annotated_expected_df():
)


class TestAggregateModifiedSequenceGroups:
class TestAddModifiedSequenceGroups:
    """Tests for ``pa.addModifiedSequenceGroups`` (annotation without aggregation)."""

    def test_add_modified_sequence_groups(
        self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame
    ):
        """Annotating with default arguments matches the expected annotated frame."""
        annotated_df = pa.addModifiedSequenceGroups(
            modified_sequence_df,
        )
        pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)


class TestAggregateModifiedSequenceGroups:
def test_aggregate_modified_sequence_groups(
self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame
):
annotated_df = pa.aggregateModifiedSequenceGroups(
modified_sequence_df,
experiment_cols=modified_sequence_df.columns[
modified_sequence_df.columns.str.startswith("Experiment")
],
)
pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)
pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df)

def test_aggregate_modified_sequence_groups_extra_columns(
self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame
self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame
):
modified_sequence_df["Gene Names"] = [
"GeneA",
Expand All @@ -173,5 +242,5 @@ def test_aggregate_modified_sequence_groups_extra_columns(
],
agg_cols={"Gene Names": "first"},
)
annotated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"]
pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)
aggregated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"]
pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df)