kusterlab · MatthewThe · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,6 @@
+v0.7.1
+- Add support for modified sequence grouping without aggregating
+
 v0.7.0
 - Add support for modified sequence grouping and aggregating
 - Fix SyntaxWarning in docstrings

diff --git a/psite_annotation/functional_annotation.py b/psite_annotation/functional_annotation.py
@@ -496,14 +496,53 @@ def addKinaseLibraryAnnotations(
     return df
 
 
+def addModifiedSequenceGroups(
+    df: pd.DataFrame,
+    match_tolerance: int = 2,    
+) -> pd.DataFrame:
+    r"""Annotate DataFrame with representative sequences from grouped localizations.
+
+    Requires "Modified sequence" column in the dataframe to be present.
+
+    Adds the following annotation columns to dataframe\:
+
+    - 'Delocalized sequence' = Canonical unmodified backbone with an index
+      suffix to distinguish the number of modifications.
+    - 'Modified sequence group' = All peptide variants belonging to the same
+      delocalized group, concatenated with semicolons.
+
+    Example:
+        ::
+
+            df = pa.addModifiedSequenceGroups(df)
+
+    Required columns:
+        :code:`Modified sequence`
+
+    Args:
+        df: pandas dataframe with 'Modified sequence' column
+        match_tolerance: group all modifiable positions within n positions of modified sites.
+
+    Returns:
+        pd.DataFrame: annotated dataframe
+
+    """
+    annotator = annotators.ModifiedSequenceGroupAnnotator(
+        match_tolerance=match_tolerance
+    )
+    df = annotator.annotate(df)
+
+    return df
+
+
 def aggregateModifiedSequenceGroups(
     df: pd.DataFrame,
     experiment_cols: list[str],
-    agg_cols: dict[str, Any] = None,
     match_tolerance: int = 2,
+    agg_cols: dict[str, Any] = None,
     agg_func: str = "mean",
 ) -> pd.DataFrame:
-    r"""Annotate DataFrame with representative sequences from grouped localizations.
+    r"""Annotate and aggregate DataFrame with representative sequences from grouped localizations.
 
     Requires "Modified sequence" column in the dataframe to be present.
 
@@ -520,6 +559,7 @@ def aggregateModifiedSequenceGroups(
 
     All experiment columns (e.g. `"Experiment 1"`, `"Experiment 2"`, …) are aggregated
     per group by summing the intensities of member sequences.
+
     Example:
         ::
 
@@ -529,9 +569,11 @@ def aggregateModifiedSequenceGroups(
         :code:`Modified sequence`
 
     Args:
-        df: pandas dataframe with 'Modified sequence' column
+        df: pandas dataframe with 'Modified sequence' column.
+        experiment_cols: list of column names with quantitative values.
         match_tolerance: group all modifiable positions within n positions of modified sites.
-        agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.        
+        agg_func: function to aggregate quantitative values within each group, e.g. 'mean', 'sum', etc.
+        agg_cols: dictionary for non-quantitative columns of {column name: aggregation function}.
 
     Returns:
         pd.DataFrame: annotated and aggregated dataframe

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "psite-annotation"
-version = "0.7.0"
+version = "0.7.1"
 description = "Module for annotating p-sites based on resources such as PhosphoSitePlus"
 authors = ["Matthew The <[email protected]>", "Amirhossein Sakhteman <[email protected]>", "Florian P. Bayer <[email protected]>"]
 license= "Apache-2.0"

diff --git a/tests/unit_tests/test_functional_annotation.py b/tests/unit_tests/test_functional_annotation.py
@@ -94,7 +94,66 @@ def modified_sequence_df() -> pd.DataFrame:
 
 
 @pytest.fixture
-def annotated_expected_df():
+def annotated_expected_df() -> pd.DataFrame:
+    df = pd.DataFrame(
+        {
+            "Modified sequence": [
+                "(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK",
+                "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
+                "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK",
+            ],
+            "Experiment 1": [10.0, np.nan, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan],
+            "Experiment 2": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 3": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0],
+            "Experiment 4": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 5": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 6": [np.nan, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 7": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan],
+            "Experiment 8": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 9": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan],
+            "Experiment 10": [np.nan, 10.0, np.nan, np.nan, 10.0, np.nan, 5.0, np.nan],
+            "Experiment 11": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 12": [10.0, np.nan, np.nan, np.nan, np.nan, 10.0, 5.0, np.nan],
+            "Experiment 13": [np.nan, 10.0, np.nan, 10.0, 10.0, np.nan, 5.0, np.nan],
+            "Experiment 14": [np.nan, np.nan, 10.0, np.nan, 10.0, np.nan, 5.0, np.nan],
+            "Experiment 15": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 16": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, np.nan, 5.0],
+            "Experiment 17": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 18": [10.0, np.nan, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 19": [np.nan, np.nan, 10.0, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Experiment 20": [10.0, 10.0, np.nan, 10.0, np.nan, np.nan, 5.0, np.nan],
+            "Delocalized sequence": [
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_1",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_1",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_1",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_2",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_2",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_2",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_1",
+                "(ac)ASNSWNASSSPGEAREDGPEGLDK_1",
+            ],
+            "Modified sequence group": [
+                "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNSWNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNSWNASS(ph)SPGEAREDGPEGLDK;(ac)ASNSWNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)ASNS(ph)WNAS(ph)SSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASS(ph)SPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSS(ph)PGEAREDGPEGLDK",
+                "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
+                "(ac)AS(ph)NSWNASSSPGEAREDGPEGLDK;(ac)ASNS(ph)WNASSSPGEAREDGPEGLDK",
+            ],
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def aggregated_expected_df():
     return pd.DataFrame(
         {
             "Modified sequence group": [
@@ -141,20 +200,30 @@ def annotated_expected_df():
     )
 
 
-class TestAggregateModifiedSequenceGroups:
+class TestAddModifiedSequenceGroups:
     def test_aggregate_modified_sequence_groups(
         self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame
+    ):
+        annotated_df = pa.addModifiedSequenceGroups(
+            modified_sequence_df,
+        )
+        pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)
+
+
+class TestAggregateModifiedSequenceGroups:
+    def test_aggregate_modified_sequence_groups(
+        self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame
     ):
         annotated_df = pa.aggregateModifiedSequenceGroups(
             modified_sequence_df,
             experiment_cols=modified_sequence_df.columns[
                 modified_sequence_df.columns.str.startswith("Experiment")
             ],
         )
-        pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)
+        pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df)
 
     def test_aggregate_modified_sequence_groups_extra_columns(
-        self, modified_sequence_df: pd.DataFrame, annotated_expected_df: pd.DataFrame
+        self, modified_sequence_df: pd.DataFrame, aggregated_expected_df: pd.DataFrame
     ):
         modified_sequence_df["Gene Names"] = [
             "GeneA",
@@ -173,5 +242,5 @@ def test_aggregate_modified_sequence_groups_extra_columns(
             ],
             agg_cols={"Gene Names": "first"},
         )
-        annotated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"]
-        pd.testing.assert_frame_equal(annotated_df, annotated_expected_df)
+        aggregated_expected_df["Gene Names"] = ["GeneC", "GeneB", "GeneA"]
+        pd.testing.assert_frame_equal(annotated_df, aggregated_expected_df)