From 11e02a16725248bca59f429af9f3b37aa3c92dc6 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Tue, 7 Apr 2026 16:54:10 +0200 Subject: [PATCH 1/9] Added distance matrix to beta diversity output, required information to visualise sample clustering, a likely use-case --- glycowork/motif/analysis.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index 4c942f77..bbb659bb 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -7,6 +7,9 @@ import networkx as nx import statsmodels.api as sm import matplotlib.pyplot as plt +from numpy import ndarray +from pandas import DataFrame + plt.rcParams.update({ 'font.size': 11, 'axes.labelsize': 12, 'axes.titlesize': 13, 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'axes.linewidth': 0.8, @@ -16,7 +19,7 @@ 'axes.prop_cycle': plt.cycler('color', ['#2D6A9F', '#C84B55', '#3A9268', '#E8863A', '#7B5EA7', '#C4843A', '#4AADA8']) }) from collections import Counter -from typing import Any +from typing import Any, List from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway, spearmanr from statsmodels.formula.api import ols from statsmodels.stats.multitest import multipletests @@ -1015,13 +1018,14 @@ def get_biodiversity( gamma: float = 0.1, # Uncertainty parameter for CLR transform custom_scale: float | dict = 0, # Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate) random_state: int | np.random.Generator | None = None # optional random state for reproducibility - ) -> pd.DataFrame: # DataFrame with diversity indices and test statistics + ) -> list[DataFrame | ndarray[Any, Any] | list[Any]]: # DataFrame with diversity indices and test statistics "Calculates alpha (Shannon/Simpson) and beta (ANOSIM/PERMANOVA) diversity measures from glycomics data" experiment = "diff" if group2 else "anova" df, df_org, group1, group2 = 
preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = False, transform = transform, feature_set = feature_set, paired = paired, gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs, random_state = random_state) shopping_cart = [] + distance_matrix = [] group_sizes = group1 if not group2 else len(group1)*[1]+len(group2)*[2] group_counts = Counter(group_sizes) # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment @@ -1066,8 +1070,8 @@ def get_biodiversity( bc_diversity[index_1, index_2] = bc_pair b_df_out = pd.DataFrame.from_dict(bc_diversity, orient = 'index') out_len = int(np.sqrt(len(b_df_out))) - b_df_out_values = b_df_out.values.reshape(out_len, out_len) - beta_df_out = pd.DataFrame(data = b_df_out_values, index = range(out_len), columns = range(out_len)) + distance_matrix = b_df_out.values.reshape(out_len, out_len) + beta_df_out = pd.DataFrame(data = distance_matrix, index = range(out_len), columns = range(out_len)) if all(count > 1 for count in group_counts.values()): r, p = anosim(beta_df_out, group_sizes, permutations) b_test_stats = pd.DataFrame({'Metric': 'Beta diversity (ANOSIM)', 'p-val': p, 'Effect size': r}, index = [0]) @@ -1079,7 +1083,7 @@ def get_biodiversity( corrpvals, significance = correct_multiple_testing(df_out['p-val'], alpha) df_out["corr p-val"] = corrpvals df_out["significant"] = significance - return df_out.sort_values(by = 'p-val').sort_values(by = 'corr p-val').reset_index(drop = True) + return [df_out.sort_values(by = 'p-val').sort_values(by = 'corr p-val').reset_index(drop = True), distance_matrix] def get_SparCC( From f5eb35a3cbe42d2cc1428bc5546a00cec6d6ba73 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Tue, 7 Apr 2026 16:57:42 +0200 Subject: [PATCH 2/9] Changelog update --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f894ecf3..5a98ab75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ #### analysis 
##### Fixed 🐛 - Fixed column names slipping into column values when `motifs = True` combined with `transform = ALR` in `get_pca` (e802da1) +##### Changed 🔄 +- `get_biodiversity` now returns the beta-diversity distance matrix alongside the diversity statistics DataFrame (needed to visualise sample clustering) #### draw ##### Changed 🔄 From b0f72e7e20692e331ee8894f0f2160b6cdc709e7 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Thu, 9 Apr 2026 17:24:21 +0200 Subject: [PATCH 3/9] Beta diversity update: fixed testing --- glycowork/motif/analysis.py | 9 ++++----- tests/test_core_functions.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index bbb659bb..f1a10c02 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -8,7 +8,6 @@ import statsmodels.api as sm import matplotlib.pyplot as plt from numpy import ndarray -from pandas import DataFrame plt.rcParams.update({ 'font.size': 11, 'axes.labelsize': 12, 'axes.titlesize': 13, @@ -19,7 +18,7 @@ 'axes.prop_cycle': plt.cycler('color', ['#2D6A9F', '#C84B55', '#3A9268', '#E8863A', '#7B5EA7', '#C4843A', '#4AADA8']) }) from collections import Counter -from typing import Any, List +from typing import Any from scipy.stats import ttest_ind, ttest_rel, norm, levene, f_oneway, spearmanr from statsmodels.formula.api import ols from statsmodels.stats.multitest import multipletests @@ -1018,14 +1017,14 @@ def get_biodiversity( gamma: float = 0.1, # Uncertainty parameter for CLR transform custom_scale: float | dict = 0, # Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate) random_state: int | np.random.Generator | None = None # optional random state for reproducibility - ) -> list[DataFrame | ndarray[Any, Any] | list[Any]]: # DataFrame with diversity indices and test statistics + ) -> tuple[pd.DataFrame, pd.DataFrame ]: # First dataFrame with diversity indices and test statistics, second with beta-diversity distance matrix "Calculates alpha
(Shannon/Simpson) and beta (ANOSIM/PERMANOVA) diversity measures from glycomics data" experiment = "diff" if group2 else "anova" df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = False, transform = transform, feature_set = feature_set, paired = paired, gamma = gamma, custom_scale = custom_scale, custom_motifs = custom_motifs, random_state = random_state) shopping_cart = [] - distance_matrix = [] + distance_matrix = pd.DataFrame() group_sizes = group1 if not group2 else len(group1)*[1]+len(group2)*[2] group_counts = Counter(group_sizes) # Sample-size aware alpha via Bayesian-Adaptive Alpha Adjustment @@ -1083,7 +1082,7 @@ def get_biodiversity( corrpvals, significance = correct_multiple_testing(df_out['p-val'], alpha) df_out["corr p-val"] = corrpvals df_out["significant"] = significance - return [df_out.sort_values(by = 'p-val').sort_values(by = 'corr p-val').reset_index(drop = True), distance_matrix] + return df_out.sort_values(by = 'p-val').sort_values(by = 'corr p-val').reset_index(drop = True), distance_matrix def get_SparCC( diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index 26b299f5..deb51a1c 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4006,7 +4006,7 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): # Run biodiversity analysis results = get_biodiversity(df, group1, group2, metrics=['alpha', 'beta']) # Basic assertions - assert isinstance(results, pd.DataFrame), "Results should be a DataFrame" + assert isinstance(results, tuple(pd.DataFrame, pd.DataFrame)), "Results should be a tuple of two DataFrames" assert 'Metric' in results.columns, "Results should have a Metric column" assert 'p-val' in results.columns, "Results should have a p-val column" # Additional assertions to verify realistic results From 956111f20d24e089977c4b5cc0e561038a1ff6cf Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 
14:30:34 +0200 Subject: [PATCH 4/9] Beta diversity update: fixed testing --- glycowork/motif/analysis.py | 2 +- tests/test_core_functions.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index f1a10c02..ca930553 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -1017,7 +1017,7 @@ def get_biodiversity( gamma: float = 0.1, # Uncertainty parameter for CLR transform custom_scale: float | dict = 0, # Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate) random_state: int | np.random.Generator | None = None # optional random state for reproducibility - ) -> tuple[pd.DataFrame, pd.DataFrame ]: # First dataFrame with diversity indices and test statistics, second with beta-diversity distance matrix + ) -> tuple(pd.DataFrame, pd.DataFrame): # First dataFrame with diversity indices and test statistics, second with beta-diversity distance matrix "Calculates alpha (Shannon/Simpson) and beta (ANOSIM/PERMANOVA) diversity measures from glycomics data" experiment = "diff" if group2 else "anova" df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = False, diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index deb51a1c..af7b803f 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4006,7 +4006,8 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): # Run biodiversity analysis results = get_biodiversity(df, group1, group2, metrics=['alpha', 'beta']) # Basic assertions - assert isinstance(results, tuple(pd.DataFrame, pd.DataFrame)), "Results should be a tuple of two DataFrames" + assert isinstance(results, tuple), "Results should be a tuple" + assert len(results) ==2, "Results should be consist of two DataFrames" assert 'Metric' in results.columns, "Results 
should have a Metric column" assert 'p-val' in results.columns, "Results should have a p-val column" # Additional assertions to verify realistic results From ad028875e0b9af8a62b398f6ff2746ade8306d35 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 14:37:33 +0200 Subject: [PATCH 5/9] Beta diversity update: fixed testing --- glycowork/motif/analysis.py | 2 +- tests/test_core_functions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py index ca930553..6287a1d9 100644 --- a/glycowork/motif/analysis.py +++ b/glycowork/motif/analysis.py @@ -1017,7 +1017,7 @@ def get_biodiversity( gamma: float = 0.1, # Uncertainty parameter for CLR transform custom_scale: float | dict = 0, # Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate) random_state: int | np.random.Generator | None = None # optional random state for reproducibility - ) -> tuple(pd.DataFrame, pd.DataFrame): # First dataFrame with diversity indices and test statistics, second with beta-diversity distance matrix + ) -> tuple(): # First dataFrame with diversity indices and test statistics, second with beta-diversity distance matrix "Calculates alpha (Shannon/Simpson) and beta (ANOSIM/PERMANOVA) diversity measures from glycomics data" experiment = "diff" if group2 else "anova" df, df_org, group1, group2 = preprocess_data(df, group1, group2, experiment = experiment, motifs = motifs, impute = False, diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index af7b803f..fab7a9c4 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4007,7 +4007,7 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): results = get_biodiversity(df, group1, group2, metrics=['alpha', 'beta']) # Basic assertions assert isinstance(results, tuple), "Results should be a tuple" - assert len(results) 
==2, "Results should be consist of two DataFrames" + assert len(results) == 2, "Results should be consist of two DataFrames" assert 'Metric' in results.columns, "Results should have a Metric column" assert 'p-val' in results.columns, "Results should have a p-val column" # Additional assertions to verify realistic results From 888852e7eb289065bf783e43f00763ad48671fac Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 15:26:57 +0200 Subject: [PATCH 6/9] Beta diversity update: fixed testing --- tests/test_core_functions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index fab7a9c4..3892ee17 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4008,7 +4008,10 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): # Basic assertions assert isinstance(results, tuple), "Results should be a tuple" assert len(results) == 2, "Results should be consist of two DataFrames" - assert 'Metric' in results.columns, "Results should have a Metric column" + stats, dist_matrix = results + assert isinstance(stats, pd.DataFrame) + assert isinstance(dist_matrix, pd.DataFrame) + assert 'Metric' in stats.columns, "Stats results should have a Metric column" assert 'p-val' in results.columns, "Results should have a p-val column" # Additional assertions to verify realistic results assert len(results) >= 2, "Should have at least alpha and beta diversity results" From 3988ae127bca5c7f652f27e1563fe7f160efd36d Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 15:43:52 +0200 Subject: [PATCH 7/9] Beta diversity update: fixed testing --- tests/test_core_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index 3892ee17..690aed50 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4010,7 +4010,7 @@ def 
generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): assert len(results) == 2, "Results should be consist of two DataFrames" stats, dist_matrix = results assert isinstance(stats, pd.DataFrame) - assert isinstance(dist_matrix, pd.DataFrame) + assert isinstance(dist_matrix, np.ndarray) assert 'Metric' in stats.columns, "Stats results should have a Metric column" assert 'p-val' in results.columns, "Results should have a p-val column" # Additional assertions to verify realistic results From e2f43f4c2a5909002ab4511f0fae6b6ca52be405 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 15:52:55 +0200 Subject: [PATCH 8/9] Beta diversity update: fixed testing --- tests/test_core_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index 690aed50..7a1749e2 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4012,13 +4012,13 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): assert isinstance(stats, pd.DataFrame) assert isinstance(dist_matrix, np.ndarray) assert 'Metric' in stats.columns, "Stats results should have a Metric column" - assert 'p-val' in results.columns, "Results should have a p-val column" + assert 'p-val' in stats.columns, "Results should have a p-val column" # Additional assertions to verify realistic results - assert len(results) >= 2, "Should have at least alpha and beta diversity results" - assert all(0 <= p <= 1 for p in results['p-val']), "p-values should be between 0 and 1" + assert len(stats) >= 2, "Should have at least alpha and beta diversity results" + assert all(0 <= p <= 1 for p in stats['p-val']), "p-values should be between 0 and 1" # Optional: Check if the differences are detectable # The groups are designed to be different, so p-values should be < 0.05 - assert any(p < 0.05 for p in results['p-val']), "Should detect differences between groups" + assert any(p < 0.05 
for p in stats['p-val']), "Should detect differences between groups" results = get_biodiversity(df, group1, group2, metrics=['alpha', 'beta'], paired=True) df3 = df[['glycan'] + group1 + group2].copy() for i in range(3): From 34c1666b52b840b2541a263429fa9ed29c8cedc9 Mon Sep 17 00:00:00 2001 From: AlexBennett Date: Fri, 10 Apr 2026 16:02:13 +0200 Subject: [PATCH 9/9] Beta diversity update: fixed testing --- tests/test_core_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core_functions.py b/tests/test_core_functions.py index 7a1749e2..adfdba20 100644 --- a/tests/test_core_functions.py +++ b/tests/test_core_functions.py @@ -4024,9 +4024,9 @@ def generate_group_data(base_proportions, n_samples=3, noise_scale=0.15): for i in range(3): df3[f'sample3_{i + 1}'] = group2_data[i] * 1.2 results = get_biodiversity(df3, [1, 1, 1, 2, 2, 2, 3, 3, 3], [], metrics = ['alpha']) - assert isinstance(results, pd.DataFrame) + assert isinstance(results, tuple) results = get_biodiversity(df, group1, group2, metrics = ['beta'], motifs = True) - assert isinstance(results, pd.DataFrame) + assert isinstance(results, tuple) @pytest.fixture