diff --git a/CHANGELOG b/CHANGELOG index adb019d..c509655 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,6 @@ +v0.7.1 +- Add support for modified sequence grouping without aggregating + v0.7.0 - Add support for modified sequence grouping and aggregating - Fix SyntaxWarning in docstrings diff --git a/psite_annotation/functional_annotation.py b/psite_annotation/functional_annotation.py index 366ad7a..43903fa 100755 --- a/psite_annotation/functional_annotation.py +++ b/psite_annotation/functional_annotation.py @@ -496,14 +496,53 @@ def addKinaseLibraryAnnotations( return df +def addModifiedSequenceGroups( + df: pd.DataFrame, + match_tolerance: int = 2, +) -> pd.DataFrame: + r"""Annotate DataFrame with representative sequences from grouped localizations. + + Requires "Modified sequence" column in the dataframe to be present. + + Adds the following annotation columns to dataframe\: + + - 'Delocalized sequence' = Canonical unmodified backbone with an index + suffix to distinguish the number of modifications. + - 'Modified sequence group' = All peptide variants belonging to the same + delocalized group, concatenated with semicolons. + + Example: + :: + + df = pa.addModifiedSequenceGroups(df) + + Required columns: + :code:`Modified sequence` + + Args: + df: pandas dataframe with 'Modified sequence' column + match_tolerance: group all modifiable positions within n positions of modified sites. + + Returns: + pd.DataFrame: annotated dataframe + + """ + annotator = annotators.ModifiedSequenceGroupAnnotator( + match_tolerance=match_tolerance + ) + df = annotator.annotate(df) + + return df + + def aggregateModifiedSequenceGroups( df: pd.DataFrame, experiment_cols: list[str], - agg_cols: dict[str, Any] = None, match_tolerance: int = 2, + agg_cols: dict[str, Any] = None, agg_func: str = "mean", ) -> pd.DataFrame: - r"""Annotate DataFrame with representative sequences from grouped localizations. 
+ r"""Annotate and aggregate DataFrame with representative sequences from grouped localizations. Requires "Modified sequence" column in the dataframe to be present. @@ -520,6 +559,7 @@ def aggregateModifiedSequenceGroups( All experiment columns (e.g. `"Experiment 1"`, `"Experiment 2"`, …) are aggregated per group by summing the intensities of member sequences. + Example: :: @@ -529,9 +569,11 @@ def aggregateModifiedSequenceGroups( :code:`Modified sequence` Args: - df: pandas dataframe with 'Modified sequence' column + df: pandas dataframe with 'Modified sequence' column. + experiment_cols: list of column names with quantitative values. match_tolerance: group all modifiable positions within n positions of modified sites. - agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc. + agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc. + agg_cols: dictionary for non-quantitative columns of {column name: aggregation function}. Returns: pd.DataFrame: annotated and aggregated dataframe diff --git a/pyproject.toml b/pyproject.toml index d282fe3..65c10ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "psite-annotation" -version = "0.7.0" +version = "0.7.1" description = "Module for annotating p-sites based on resources such as PhosphoSitePlus" authors = ["Matthew The ", "Amirhossein Sakhteman ", "Florian P. 
Bayer "] license= "Apache-2.0" diff --git a/tests/unit_tests/test_functional_annotation.py b/tests/unit_tests/test_functional_annotation.py index 4522788..38751ef 100644 --- a/tests/unit_tests/test_functional_annotation.py +++ b/tests/unit_tests/test_functional_annotation.py @@ -94,7 +94,66 @@ def modified_sequence_df() -> pd.DataFrame: @pytest.fixture -def annotated_expected_df(): +def annotated_expected_df() -> pd.DataFrame: + df = pd.DataFrame( + { + "Modified sequence": [ + "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK", + "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK", + ], + "Experiment 1": [10.0, np.nan, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 2": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 3": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + "Experiment 4": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 5": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 6": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 7": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 8": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 9": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 10": [np.nan, 10.0, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 11": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 12": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan], + "Experiment 13": [np.nan, 10.0, np.nan, 10.0, 10.0, np.nan, 5.0, np.nan], + "Experiment 14": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan], + "Experiment 15": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 16": [10.0, 10.0, 
np.nan, 10.0, np.nan, np.nan, np.nan, 5.0], + "Experiment 17": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 18": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 19": [np.nan, np.nan, 10.0, 10.0, np.nan, np.nan, 5.0, np.nan], + "Experiment 20": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan], + "Delocalized sequence": [ + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_2", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + "(ac)ASNSWNASSSPGEAREDGPEGLDK_1", + ], + "Modified sequence group": [ + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK", + ], + } + ) + return df + + +@pytest.fixture +def aggregated_expected_df(): return pd.DataFrame( { "Modified sequence group": [ @@ -141,9 +200,19 @@ def annotated_expected_df(): ) -class TestAggregateModifiedSequenceGroups: +class TestAddModifiedSequenceGroups: def test_aggregate_modified_sequence_groups( self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame + ): + annotated_df = pa.addModifiedSequenceGroups( + modified_sequence_df, + ) + 
pd.testing.assert_frame_equal(annotated_df, annotated_expected_df) + + +class TestAggregateModifiedSequenceGroups: + def test_aggregate_modified_sequence_groups( + self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame ): annotated_df = pa.aggregateModifiedSequenceGroups( modified_sequence_df, @@ -151,10 +220,10 @@ def test_aggregate_modified_sequence_groups( modified_sequence_df.columns.str.startswith("Experiment") ], ) - pd.testing.assert_frame_equal(annotated_df, annotated_expected_df) + pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df) def test_aggregate_modified_sequence_groups_extra_columns( - self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame + self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame ): modified_sequence_df["Gene Names"] = [ "GeneA", @@ -173,5 +242,5 @@ def test_aggregate_modified_sequence_groups_extra_columns( ], agg_cols={"Gene Names": "first"}, ) - annotated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"] - pd.testing.assert_frame_equal(annotated_df, annotated_expected_df) + aggregated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"] + pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df)