diff --git a/meta_individual_column.csv b/meta_individual_column.csv new file mode 100644 index 00000000..82053971 --- /dev/null +++ b/meta_individual_column.csv @@ -0,0 +1,144 @@ +Sample,Group,Batch,Individual +AD01_C1_INSOLUBLE_01,AD,C1,AD01_ +AD01_C1_INSOLUBLE_02,AD,C1,AD01_ +AD01_C1_INSOLUBLE_03,AD,C1,AD01_ +AD01_C2_INSOLUBLE_01,AD,C2,AD01_ +AD02_C1_INSOLUBLE_01,AD,C1,AD02_ +AD02_C1_INSOLUBLE_02,AD,C1,AD02_ +AD02_C2_INSOLUBLE_01,AD,C2,AD02_ +AD03_C1_INSOLUBLE_01,AD,C1,AD03_ +AD03_C1_INSOLUBLE_02,AD,C1,AD03_ +AD03_C1_INSOLUBLE_03,AD,C1,AD03_ +AD03_C2_INSOLUBLE_01,AD,C2,AD03_ +AD04_C1_INSOLUBLE_01,AD,C1,AD04_ +AD04_C2_INSOLUBLE_01,AD,C2,AD04_ +AD05_C2_INSOLUBLE_01,AD,C2,AD05_ +AD06_C1_INSOLUBLE_01,AD,C1,AD06_ +AD07_C1_INSOLUBLE_01,AD,C1,AD07_ +AD07_C1_INSOLUBLE_02,AD,C1,AD07_ +AD07_C1_INSOLUBLE_03,AD,C1,AD07_ +AD07_C2_INSOLUBLE_01,AD,C2,AD07_ +AD08_C2_INSOLUBLE_01,AD,C2,AD08_ +AD09_C1_INSOLUBLE_01,AD,C1,AD09_ +AD10_C1_INSOLUBLE_01,AD,C1,AD10_ +AD10_C2_INSOLUBLE_01,AD,C2,AD10_ +AD11_C2_INSOLUBLE_01,AD,C2,AD11_ +AD12_C2_INSOLUBLE_01,AD,C2,AD12_ +AD13_C2_INSOLUBLE_01,AD,C2,AD13_ +AD14_C2_INSOLUBLE_01,AD,C2,AD14_ +AD15_C2_INSOLUBLE_01,AD,C2,AD15_ +AD16_C2_INSOLUBLE_01,AD,C2,AD16_ +AD17_C2_INSOLUBLE_01,AD,C2,AD17_ +AD18_C2_INSOLUBLE_01,AD,C2,AD18_ +AD19_C2_INSOLUBLE_01,AD,C2,AD19_ +AD20_C1_INSOLUBLE_01,AD,C1,AD20_ +AD21_C1_INSOLUBLE_01,AD,C1,AD21_ +AD21_C2_INSOLUBLE_01,AD,C2,AD21_ +AD22_C1_INSOLUBLE_01,AD,C1,AD22_ +AD23_C1_INSOLUBLE_01,AD,C1,AD23_ +AD23_C1_INSOLUBLE_02,AD,C1,AD23_ +AD23_C2_INSOLUBLE_01,AD,C2,AD23_ +AD24_C1_INSOLUBLE_01,AD,C1,AD24_ +AD24_C1_INSOLUBLE_02,AD,C1,AD24_ +AD25_C1_INSOLUBLE_01,AD,C1,AD25_ +AD26_C1_INSOLUBLE_01,AD,C1,AD26_ +AD27_C1_INSOLUBLE_01,AD,C1,AD27_ +AD27_C1_INSOLUBLE_02,AD,C1,AD27_ +AD28_C2_INSOLUBLE_01,AD,C2,AD28_ +AD29_C1_INSOLUBLE_01,AD,C1,AD29_ +AD30_C1_INSOLUBLE_01,AD,C1,AD30_ +AD30_C1_INSOLUBLE_02,AD,C1,AD30_ +AD30_C2_INSOLUBLE_01,AD,C2,AD30_ +AD31_C2_INSOLUBLE_01,AD,C2,AD31_ +AD32_C2_INSOLUBLE_01,AD,C2,AD32_ 
+AD33_C2_INSOLUBLE_01,AD,C2,AD33_ +AD34_C1_INSOLUBLE_01,AD,C1,AD34_ +AD34_C1_INSOLUBLE_02,AD,C1,AD34_ +AD35_C1_INSOLUBLE_01,AD,C1,AD35_ +AD35_C1_INSOLUBLE_02,AD,C1,AD35_ +AD36_C1_INSOLUBLE_01,AD,C1,AD36_ +AD37_C1_INSOLUBLE_01,AD,C1,AD37_ +AD37_C2_INSOLUBLE_01,AD,C2,AD37_ +AD38_C1_INSOLUBLE_01,AD,C1,AD38_ +AD38_C1_INSOLUBLE_02,AD,C1,AD38_ +AD38_C1_INSOLUBLE_03,AD,C1,AD38_ +AD39_C2_INSOLUBLE_01,AD,C2,AD39_ +AD40_C2_INSOLUBLE_01,AD,C2,AD40_ +AD41_C2_INSOLUBLE_01,AD,C2,AD41_ +AD42_C2_INSOLUBLE_01,AD,C2,AD42_ +AD43_C1_INSOLUBLE_01,AD,C1,AD43_ +AD44_C1_INSOLUBLE_01,AD,C1,AD44_ +AD44_C1_INSOLUBLE_02,AD,C1,AD44_ +AD44_C1_INSOLUBLE_03,AD,C1,AD44_ +AD44_C1_INSOLUBLE_04,AD,C1,AD44_ +AD45_C1_INSOLUBLE_01,AD,C1,AD45_ +AD45_C1_INSOLUBLE_02,AD,C1,AD45_ +AD46_C1_INSOLUBLE_01,AD,C1,AD46_ +AD46_C1_INSOLUBLE_02,AD,C1,AD46_ +AD46_C1_INSOLUBLE_03,AD,C1,AD46_ +AD46_C2_INSOLUBLE_01,AD,C2,AD46_ +AD47_C1_INSOLUBLE_01,AD,C1,AD47_ +AD48_C2_INSOLUBLE_01,AD,C2,AD48_ +AD49_C2_INSOLUBLE_01,AD,C2,AD49_ +CTR01_C1_INSOLUBLE_01,CTR,C1,CTR01 +CTR02_C1_INSOLUBLE_01,CTR,C1,CTR02 +CTR03_C1_INSOLUBLE_01,CTR,C1,CTR03 +CTR04_C1_INSOLUBLE_01,CTR,C1,CTR04 +CTR05_C2_INSOLUBLE_01,CTR,C2,CTR05 +CTR06_C2_INSOLUBLE_01,CTR,C2,CTR06 +CTR07_C1_INSOLUBLE_01,CTR,C1,CTR07 +CTR08_C1_INSOLUBLE_01,CTR,C1,CTR08 +CTR08_C2_INSOLUBLE_01,CTR,C2,CTR08 +CTR09_C2_INSOLUBLE_01,CTR,C2,CTR09 +CTR10_C1_INSOLUBLE_01,CTR,C1,CTR10 +CTR10_C2_INSOLUBLE_01,CTR,C2,CTR10 +CTR11_C2_INSOLUBLE_01,CTR,C2,CTR11 +CTR12_C2_INSOLUBLE_01,CTR,C2,CTR12 +CTR13_C2_INSOLUBLE_01,CTR,C2,CTR13 +CTR14_C2_INSOLUBLE_01,CTR,C2,CTR14 +CTR15_C2_INSOLUBLE_01,CTR,C2,CTR15 +CTR16_C2_INSOLUBLE_01,CTR,C2,CTR16 +CTR17_C2_INSOLUBLE_01,CTR,C2,CTR17 +CTR18_C2_INSOLUBLE_01,CTR,C2,CTR18 +CTR19_C1_INSOLUBLE_01,CTR,C1,CTR19 +CTR20_C1_INSOLUBLE_01,CTR,C1,CTR20 +CTR21_C2_INSOLUBLE_01,CTR,C2,CTR21 +CTR22_C2_INSOLUBLE_01,CTR,C2,CTR22 +CTR23_C2_INSOLUBLE_01,CTR,C2,CTR23 +CTR24_C1_INSOLUBLE_01,CTR,C1,CTR24 +CTR25_C1_INSOLUBLE_01,CTR,C1,CTR25 +CTR26_C2_INSOLUBLE_01,CTR,C2,CTR26 
+CTR27_C1_INSOLUBLE_01,CTR,C1,CTR27 +CTR28_C1_INSOLUBLE_01,CTR,C1,CTR28 +CTR28_C1_INSOLUBLE_02,CTR,C1,CTR28 +CTR28_C2_INSOLUBLE_01,CTR,C2,CTR28 +CTR29_C1_INSOLUBLE_01,CTR,C1,CTR29 +CTR29_C1_INSOLUBLE_02,CTR,C1,CTR29 +CTR29_C1_INSOLUBLE_03,CTR,C1,CTR29 +CTR30_C1_INSOLUBLE_01,CTR,C1,CTR30 +CTR30_C1_INSOLUBLE_02,CTR,C1,CTR30 +CTR30_C2_INSOLUBLE_01,CTR,C2,CTR30 +CTR31_C1_INSOLUBLE_01,CTR,C1,CTR31 +CTR31_C2_INSOLUBLE_01,CTR,C2,CTR31 +CTR32_C1_INSOLUBLE_01,CTR,C1,CTR32 +CTR32_C2_INSOLUBLE_01,CTR,C2,CTR32 +CTR33_C1_INSOLUBLE_01,CTR,C1,CTR33 +CTR34_C1_INSOLUBLE_01,CTR,C1,CTR34 +CTR34_C2_INSOLUBLE_01,CTR,C2,CTR34 +CTR35_C1_INSOLUBLE_01,CTR,C1,CTR35 +CTR36_C1_INSOLUBLE_01,CTR,C1,CTR36 +CTR36_C1_INSOLUBLE_02,CTR,C1,CTR36 +CTR37_C1_INSOLUBLE_01,CTR,C1,CTR37 +CTR38_C1_INSOLUBLE_01,CTR,C1,CTR38 +CTR39_C1_INSOLUBLE_01,CTR,C1,CTR39 +CTR40_C1_INSOLUBLE_01,CTR,C1,CTR40 +CTR40_C1_INSOLUBLE_02,CTR,C1,CTR40 +CTR40_C1_INSOLUBLE_03,CTR,C1,CTR40 +CTR41_C1_INSOLUBLE_01,CTR,C1,CTR41 +CTR41_C1_INSOLUBLE_02,CTR,C1,CTR41 +CTR41_C1_INSOLUBLE_03,CTR,C1,CTR41 +CTR42_C1_INSOLUBLE_01,CTR,C1,CTR42 +CTR42_C1_INSOLUBLE_02,CTR,C1,CTR42 +CTR42_C1_INSOLUBLE_03,CTR,C1,CTR42 +CTR43_C2_INSOLUBLE_01,CTR,C2,CTR43 +CTR44_C1_INSOLUBLE_01,CTR,C1,CTR44 diff --git a/protzilla/data_analysis/differential_expression_mann_whitney.py b/protzilla/data_analysis/differential_expression_mann_whitney.py index a7a06c2e..93506724 100644 --- a/protzilla/data_analysis/differential_expression_mann_whitney.py +++ b/protzilla/data_analysis/differential_expression_mann_whitney.py @@ -4,21 +4,24 @@ import pandas as pd from scipy import stats -from protzilla.data_analysis.differential_expression_helper import _map_log_base, apply_multiple_testing_correction, \ - merge_differential_expression_and_significant_df, normalize_ptm_df +from protzilla.data_analysis.differential_expression_helper import ( + _map_log_base, + apply_multiple_testing_correction, + normalize_ptm_df, +) from protzilla.utilities.transform_dfs import long_to_wide def 
mann_whitney_test_on_intensity_data( - protein_df: pd.DataFrame, - metadata_df: pd.DataFrame, - grouping: str, - group1: str, - group2: str, - log_base: str = None, - alpha=0.05, - multiple_testing_correction_method: str = "Benjamini-Hochberg", - p_value_calculation_method: str = "auto" + protein_df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + log_base: str = None, + alpha=0.05, + multiple_testing_correction_method: str = "Benjamini-Hochberg", + p_value_calculation_method: str = "auto", ) -> dict: """ Perform Mann-Whitney U test on all proteins in the given intensity data frame. @@ -57,15 +60,26 @@ def mann_whitney_test_on_intensity_data( alpha=alpha, multiple_testing_correction_method=multiple_testing_correction_method, columns_name="Protein ID", - p_value_calculation_method=p_value_calculation_method + p_value_calculation_method=p_value_calculation_method, + ) + differentially_expressed_proteins_df = pd.merge( + protein_df, + outputs["differential_expressed_columns_df"], + on="Protein ID", + how="left", ) - differentially_expressed_proteins_df = pd.merge(protein_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left") differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[ - differentially_expressed_proteins_df["Protein ID"].isin(outputs["differential_expressed_columns_df"]["Protein ID"]) + differentially_expressed_proteins_df["Protein ID"].isin( + outputs["differential_expressed_columns_df"]["Protein ID"] + ) ] - significant_proteins_df = pd.merge(protein_df, outputs["significant_columns_df"], on="Protein ID", how="left") + significant_proteins_df = pd.merge( + protein_df, outputs["significant_columns_df"], on="Protein ID", how="left" + ) significant_proteins_df = significant_proteins_df.loc[ - significant_proteins_df["Protein ID"].isin(outputs["significant_columns_df"]["Protein ID"]) + significant_proteins_df["Protein ID"].isin( + 
outputs["significant_columns_df"]["Protein ID"] + ) ] return dict( @@ -80,14 +94,14 @@ def mann_whitney_test_on_intensity_data( def mann_whitney_test_on_ptm_data( - ptm_df: pd.DataFrame, - metadata_df: pd.DataFrame, - grouping: str, - group1: str, - group2: str, - alpha=0.05, - multiple_testing_correction_method: str = "Benjamini-Hochberg", - p_value_calculation_method: str = "auto" + ptm_df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + alpha=0.05, + multiple_testing_correction_method: str = "Benjamini-Hochberg", + p_value_calculation_method: str = "auto", ) -> dict: """ Perform Mann-Whitney U test on all PTMs in the given PTM data frame. @@ -126,7 +140,7 @@ def mann_whitney_test_on_ptm_data( alpha=alpha, multiple_testing_correction_method=multiple_testing_correction_method, columns_name="PTM", - p_value_calculation_method=p_value_calculation_method + p_value_calculation_method=p_value_calculation_method, ) return dict( @@ -141,16 +155,16 @@ def mann_whitney_test_on_ptm_data( def mann_whitney_test_on_columns( - df: pd.DataFrame, - metadata_df: pd.DataFrame, - grouping: str, - group1: str, - group2: str, - log_base: str = None, - alpha=0.05, - multiple_testing_correction_method: str = "Benjamini-Hochberg", - columns_name: str = "Protein ID", - p_value_calculation_method: str = "auto" + df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + log_base: str = None, + alpha=0.05, + multiple_testing_correction_method: str = "Benjamini-Hochberg", + columns_name: str = "Protein ID", + p_value_calculation_method: str = "auto", ) -> dict: """ Perform Mann-Whitney U test on all columns of the data frame. 
import math

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from scipy import stats

import protzilla.constants.colors as colorscheme
from protzilla.utilities import default_intensity_column


def _validate_selected_protein_group(
    selected_protein_group: str,
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
) -> None:
    """Raise a ValueError if the protein group appears in neither t-test output."""
    if (
        selected_protein_group not in significant_proteins_df["Protein ID"].values
        and selected_protein_group
        not in differentially_expressed_proteins_df["Protein ID"].values
    ):
        raise ValueError("Please select a valid protein group.")


def _mean_intensity_per_individual(
    filtered_protein_group_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    individual_column: str,
    intensity_name: str,
) -> pd.DataFrame:
    """
    Average replicate intensities per individual.

    Merges the individual ID from the metadata onto the intensity rows and
    returns one mean intensity per (protein, group, individual), so that
    technical replicates of the same individual do not inflate the variance.
    """
    merged_df = pd.merge(
        filtered_protein_group_df,
        metadata_df[["Sample", individual_column]],
        on="Sample",
    )
    return (
        merged_df.groupby(["Protein ID", "Group", individual_column])[intensity_name]
        .mean()
        .reset_index()
    )


def _select_protein_groups(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    significant_proteins_only: str,
    select_all_proteins: bool,
    selected_protein_groups: list,
):
    """
    Return the protein groups a per-protein calculation should run on:
    all differentially expressed proteins, only the significant ones, or an
    explicit user selection, depending on the flags.
    """
    if select_all_proteins and significant_proteins_only == "No":
        return differentially_expressed_proteins_df["Protein ID"].unique()
    if select_all_proteins and significant_proteins_only == "Yes":
        return significant_proteins_df["Protein ID"].unique()
    return selected_protein_groups


def _violin_plot(values: list, title: str, yaxis_title: str, span: list) -> go.Figure:
    """Build a violin plot of *values* with a mean line and an inner box plot."""
    colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
    fig = go.Figure(
        go.Violin(
            name="",
            y=values,
            line_color=colors[1],
            meanline_visible=True,
            box_visible=True,
            scalemode="width",
            spanmode="hard",
            span=span,
            hoverinfo="y",
        )
    )
    fig.update_layout(title=title, yaxis_title=yaxis_title, showlegend=False)
    return fig


def variance_protein_group_calculation_max(
    intensity_df: pd.DataFrame,
    protein_id: str,
    group1: str,
    group2: str,
    intensity_name: str = None,
) -> float:
    """
    Calculate the sample variance of a protein group's intensities for each of
    the two groups and return the larger of the two.

    :param intensity_df: The dataframe containing the protein group intensities.
    :param protein_id: The protein ID.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param intensity_name: The name of the column containing the protein group
        intensities; resolved via ``default_intensity_column`` when ``None``.
    :return: The maximum of the two group variances.
    """
    intensity_name = default_intensity_column(intensity_df, intensity_name)
    protein_group = intensity_df[intensity_df["Protein ID"] == protein_id]

    group1_intensities = protein_group[protein_group["Group"] == group1][
        intensity_name
    ].values
    group2_intensities = protein_group[protein_group["Group"] == group2][
        intensity_name
    ].values

    # ddof=1: unbiased sample variance — each group is a sample, not the
    # full population.
    variance_group1 = np.var(group1_intensities, ddof=1)
    variance_group2 = np.var(group2_intensities, ddof=1)

    return max(variance_group1, variance_group2)


def sample_size_calculation(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    fc_threshold: float,
    alpha: float,
    power: float,
    group1: str,
    group2: str,
    selected_protein_group: str,
    individual_column: str,
    intensity_name: str = None,
) -> dict:
    """
    Calculate the required sample size for a selected protein to achieve the
    desired statistical power.

    If metadata_df contains a column that identifies individuals, the function
    first calculates the mean intensity for each individual (based on
    replicates) within the dataset. These individual means are used to
    determine the variance for the sample size calculation formula.

    :param differentially_expressed_proteins_df: The dataframe containing the
        differentially expressed proteins from t-test output.
    :param significant_proteins_df: The dataframe containing the significant
        proteins from t-test output.
    :param metadata_df: The dataframe containing the clinical data.
    :param fc_threshold: The fold change threshold.
    :param alpha: The significance level. The value for alpha is taken from the
        t-test by default.
    :param power: The power of the test.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param selected_protein_group: The selected protein group for which the
        required sample size is to be calculated.
    :param individual_column: The name of the column in metadata_df containing
        the individual ID ("None" disables individual aggregation).
    :param intensity_name: The name of the column containing the protein group
        intensities.
    :return: dict with ``required_sample_size`` (ceil'd int) and
        ``variance_protein_group`` (the variance used in the formula).
    :raises ValueError: If the protein group is not in either t-test output.
    """
    _validate_selected_protein_group(
        selected_protein_group,
        differentially_expressed_proteins_df,
        significant_proteins_df,
    )
    protein_group = selected_protein_group
    # Two-sided test: the significance level is split over both tails.
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    intensity_name = default_intensity_column(
        differentially_expressed_proteins_df, intensity_name
    )
    filtered_protein_group_df = differentially_expressed_proteins_df[
        differentially_expressed_proteins_df["Protein ID"] == protein_group
    ]

    if individual_column != "None" and individual_column in metadata_df.columns:
        filtered_protein_group_df = _mean_intensity_per_individual(
            filtered_protein_group_df, metadata_df, individual_column, intensity_name
        )

    variance_protein_group = variance_protein_group_calculation_max(
        intensity_df=filtered_protein_group_df,
        protein_id=protein_group,
        group1=group1,
        group2=group2,
        intensity_name=intensity_name,
    )

    # Equation (1) in Cairns, David A., et al., 2008, "Sample size
    # determination in clinical proteomic profiling experiments using mass
    # spectrometry for class comparison".
    required_sample_size = math.ceil(
        2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group
    )

    return dict(
        required_sample_size=required_sample_size,
        # Returned alongside so batch callers can reuse the variance without
        # recomputing it.
        variance_protein_group=variance_protein_group,
    )


def power_calculation(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    alpha: float,
    fc_threshold: float,
    group1: str,
    group2: str,
    selected_protein_group: str,
    individual_column: str,
    intensity_name: str = None,
) -> dict:
    """
    Calculate the power of the t-test for a selected protein group.

    If metadata_df contains a column that identifies individuals, the function
    first calculates the mean intensity for each individual (based on
    replicates) within the dataset. These individual means are used to
    determine the variance for the power calculation formula.
    If both groups have different numbers of samples, the sample size for the
    power formula is calculated according to equation 2.3.1 from Cohen 1988,
    Statistical Power Analysis for the Behavioral Sciences.

    :param differentially_expressed_proteins_df: The dataframe containing the
        differentially expressed proteins from t-test output.
    :param significant_proteins_df: The dataframe containing the significant
        proteins from t-test output.
    :param metadata_df: The dataframe containing the clinical data.
    :param alpha: The significance level. The value for alpha is taken from the
        t-test by default.
    :param fc_threshold: The fold change threshold.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param selected_protein_group: The selected protein group for which the
        power is to be calculated.
    :param individual_column: The name of the column in metadata_df containing
        the individual ID ("None" disables individual aggregation).
    :param intensity_name: The name of the column containing the protein group
        intensities.
    :return: dict with ``power`` (float, rounded to two decimals).
    :raises ValueError: If the protein group is not in either t-test output.
    """
    _validate_selected_protein_group(
        selected_protein_group,
        differentially_expressed_proteins_df,
        significant_proteins_df,
    )
    protein_group = selected_protein_group
    z_alpha = stats.norm.ppf(1 - alpha / 2)

    intensity_name = default_intensity_column(
        differentially_expressed_proteins_df, intensity_name
    )
    filtered_protein_group_df = differentially_expressed_proteins_df[
        differentially_expressed_proteins_df["Protein ID"] == protein_group
    ]
    if individual_column != "None" and individual_column in metadata_df.columns:
        # Rename the individual column to "Sample" so the per-group sample
        # count below counts one individual as one sample.
        filtered_protein_group_df = _mean_intensity_per_individual(
            filtered_protein_group_df, metadata_df, individual_column, intensity_name
        ).rename(columns={individual_column: "Sample"})

    variance_protein_group = variance_protein_group_calculation_max(
        intensity_df=filtered_protein_group_df,
        protein_id=protein_group,
        group1=group1,
        group2=group2,
        intensity_name=intensity_name,
    )

    group_count_df = filtered_protein_group_df.groupby(["Group", "Protein ID"])[
        "Sample"
    ].count()
    # .iloc[0]: positional access into the remaining ("Protein ID") index
    # level — plain [0] would be a label lookup and fail for string IDs.
    sample_size_group1 = group_count_df[group1].iloc[0]
    sample_size_group2 = group_count_df[group2].iloc[0]
    # Harmonic-mean adjustment for unequal group sizes; equation 2.3.1 from
    # Cohen 1988, Statistical Power Analysis for the Behavioral Sciences.
    sample_size = (2 * sample_size_group1 * sample_size_group2) / (
        sample_size_group1 + sample_size_group2
    )
    z_beta = (
        fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
    )
    power = float(round(stats.norm.cdf(z_beta), 2))

    return dict(power=power)


def sample_size_calculation_for_all_proteins(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    significant_proteins_only: str,
    metadata_df: pd.DataFrame,
    fc_threshold: float,
    alpha: float,
    power: float,
    group1: str,
    group2: str,
    individual_column: str,
    select_all_proteins: bool,
    selected_protein_groups: list,
    intensity_name: str = None,
) -> dict:
    """
    Calculate the required sample size for all proteins in the dataset to
    achieve the required power.

    :param differentially_expressed_proteins_df: The dataframe containing the
        differentially expressed proteins from t-test output.
    :param significant_proteins_df: The dataframe containing the significant
        proteins from t-test output.
    :param significant_proteins_only: "Yes"/"No" — whether only significant
        proteins should be considered.
    :param metadata_df: The dataframe containing the clinical data.
    :param fc_threshold: The fold change threshold.
    :param alpha: The significance level. The value for alpha is taken from the
        t-test by default.
    :param power: The power of the test.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param individual_column: The name of the column in metadata_df containing
        the individual ID.
    :param select_all_proteins: Whether all proteins should be considered.
    :param selected_protein_groups: A list of selected protein groups, if not
        all proteins should be considered.
    :param intensity_name: The name of the column containing the protein group
        intensities.
    :return:
        - required_sample_size_for_all_proteins: the maximum required sample
          size over all proteins,
        - a violin plot of the sample-size distribution,
        - differentially_expressed_proteins_df / significant_proteins_df with
          an added "Sample Size" column (depending on the selection flags),
        - sample_size_dataframe with the sample size per protein.
    :raises ValueError: If the selection yields no protein groups.
    """
    protein_groups_for_calculation = _select_protein_groups(
        differentially_expressed_proteins_df,
        significant_proteins_df,
        significant_proteins_only,
        select_all_proteins,
        selected_protein_groups,
    )
    if len(protein_groups_for_calculation) == 0:
        raise ValueError("No protein groups selected for the calculation.")

    # One calculation per protein — the per-protein result dict is discarded
    # except for the sample size.
    required_sample_sizes = [
        sample_size_calculation(
            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
            significant_proteins_df=significant_proteins_df,
            metadata_df=metadata_df,
            fc_threshold=fc_threshold,
            alpha=alpha,
            power=power,
            group1=group1,
            group2=group2,
            selected_protein_group=protein_group,
            individual_column=individual_column,
            intensity_name=intensity_name,
        )["required_sample_size"]
        for protein_group in protein_groups_for_calculation
    ]

    # The study must be powered for the worst case, i.e. the largest
    # per-protein requirement.
    required_sample_size_for_all_proteins = max(required_sample_sizes)

    fig = _violin_plot(
        required_sample_sizes,
        title="Distribution of Required Sample Sizes for All Proteins",
        yaxis_title="Required Sample Size",
        span=[0, required_sample_size_for_all_proteins],
    )

    sample_size_dataframe = pd.DataFrame({"Protein ID": protein_groups_for_calculation})
    sample_size_dataframe["Sample Size"] = required_sample_sizes

    if select_all_proteins and significant_proteins_only == "No":
        differentially_expressed_proteins_df = pd.merge(
            differentially_expressed_proteins_df,
            sample_size_dataframe,
            on="Protein ID",
        )
    elif select_all_proteins and significant_proteins_only == "Yes":
        significant_proteins_df = pd.merge(
            significant_proteins_df,
            sample_size_dataframe,
            on="Protein ID",
        )

    return dict(
        required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
        plots=[fig],
        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
        significant_proteins_df=significant_proteins_df,
        sample_size_dataframe=sample_size_dataframe,
    )


def power_calculation_for_all_proteins(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    significant_proteins_only: str,
    metadata_df: pd.DataFrame,
    fc_threshold: float,
    alpha: float,
    group1: str,
    group2: str,
    individual_column: str,
    select_all_proteins: bool,
    selected_protein_groups: list,
    intensity_name: str = None,
) -> dict:
    """
    Calculate the power of the t-test for all proteins in the dataset.

    :param differentially_expressed_proteins_df: The dataframe containing the
        differentially expressed proteins from t-test output.
    :param significant_proteins_df: The dataframe containing the significant
        proteins from t-test output.
    :param significant_proteins_only: "Yes"/"No" — whether only significant
        proteins should be considered.
    :param metadata_df: The dataframe containing the clinical data.
    :param fc_threshold: The fold change threshold.
    :param alpha: The significance level. The value for alpha is taken from the
        t-test by default.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param individual_column: The name of the column in metadata_df containing
        the individual ID.
    :param select_all_proteins: Whether all proteins should be considered.
    :param selected_protein_groups: A list of selected protein groups, if not
        all proteins should be considered.
    :param intensity_name: The name of the column containing the protein group
        intensities.
    :return:
        - power_for_all_proteins: the minimum power over all proteins,
        - a violin plot of the power distribution,
        - differentially_expressed_proteins_df / significant_proteins_df with
          an added "Power" column (depending on the selection flags),
        - power_dataframe with the power per protein.
    :raises ValueError: If the selection yields no protein groups.
    """
    protein_groups_for_calculation = _select_protein_groups(
        differentially_expressed_proteins_df,
        significant_proteins_df,
        significant_proteins_only,
        select_all_proteins,
        selected_protein_groups,
    )
    if len(protein_groups_for_calculation) == 0:
        raise ValueError("No protein groups selected for the calculation.")

    power_list = [
        power_calculation(
            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
            significant_proteins_df=significant_proteins_df,
            metadata_df=metadata_df,
            fc_threshold=fc_threshold,
            alpha=alpha,
            group1=group1,
            group2=group2,
            selected_protein_group=protein_group,
            individual_column=individual_column,
            intensity_name=intensity_name,
        )["power"]
        for protein_group in protein_groups_for_calculation
    ]

    # The weakest protein bounds the study's overall power.
    power_for_all_proteins = min(power_list)

    fig = _violin_plot(
        power_list,
        title="Distribution of Power for All Proteins",
        yaxis_title="Power",
        span=[power_for_all_proteins, 1],
    )

    power_dataframe = pd.DataFrame({"Protein ID": protein_groups_for_calculation})
    power_dataframe["Power"] = power_list

    if select_all_proteins and significant_proteins_only == "No":
        differentially_expressed_proteins_df = pd.merge(
            differentially_expressed_proteins_df,
            power_dataframe,
            on="Protein ID",
        )
    elif select_all_proteins and significant_proteins_only == "Yes":
        significant_proteins_df = pd.merge(
            significant_proteins_df,
            power_dataframe,
            on="Protein ID",
        )

    return dict(
        power_for_all_proteins=power_for_all_proteins,
        plots=[fig],
        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
        significant_proteins_df=significant_proteins_df,
        power_dataframe=power_dataframe,
    )
peptide_df=filtered_peptides, - messages=[{ - "level": logging.INFO if len(filtered_peptides) > 0 else logging.WARNING, - "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe." - }], + messages=[ + { + "level": logging.INFO + if len(filtered_peptides) > 0 + else logging.WARNING, + "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe.", + } + ], ) @@ -47,7 +53,9 @@ def ptms_per_sample(peptide_df: pd.DataFrame) -> dict: modification_df = aggregate_ptms(peptide_df, ["Sample"]) - modification_df["Total Amount of Peptides"] = peptide_df.groupby("Sample").size().reset_index()[0] + modification_df["Total Amount of Peptides"] = ( + peptide_df.groupby("Sample").size().reset_index()[0] + ) return dict(ptm_df=modification_df) @@ -65,25 +73,28 @@ def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict: modification_df = aggregate_ptms(peptide_df, ["Sample", "Protein ID"]) - modi = ( - modification_df.drop(["Sample", "Protein ID"], axis=1).apply(lambda x: ('(' + x.astype(str) + ') ' + x.name + ", "))) + modi = modification_df.drop(["Sample", "Protein ID"], axis=1).apply( + lambda x: ("(" + x.astype(str) + ") " + x.name + ", ") + ) for column, data in modi.iteritems(): modi[column] = np.where(modification_df[column] > 0, modi[column], "") - modification_df["Modifications"] = modi.apply(''.join, axis=1) - modification_df = modification_df[['Sample', 'Protein ID', 'Modifications']] + modification_df["Modifications"] = modi.apply("".join, axis=1) + modification_df = modification_df[["Sample", "Protein ID", "Modifications"]] - modification_df = long_to_wide(modification_df, "Modifications").fillna("").reset_index() + modification_df = ( + long_to_wide(modification_df, "Modifications").fillna("").reset_index() + ) return dict(ptm_df=modification_df) def aggregate_ptms(peptide_df: pd.DataFrame, group_by: list[str]): - modification_df = pd.concat( - [peptide_df[group_by], - 
(peptide_df['Modifications'].str.get_dummies(sep=","))], axis=1) + [peptide_df[group_by], (peptide_df["Modifications"].str.get_dummies(sep=","))], + axis=1, + ) for column, data in modification_df.iteritems(): amount, name = from_string(column) @@ -109,9 +120,9 @@ def from_string(mod_string: str) -> tuple[int, str]: :return: tuple containing the amount and name of the modification """ - re_search = re.search(r'\d+', mod_string) + re_search = re.search(r"\d+", mod_string) amount = int(re_search.group()) if re_search else 1 - name = re.search(r'\D+', mod_string).group() + name = re.search(r"\D+", mod_string).group() name = name[1:] if name[0] == " " else name return amount, name diff --git a/protzilla/data_integration/di_plots.py b/protzilla/data_integration/di_plots.py index be600dc0..ce5d9cde 100644 --- a/protzilla/data_integration/di_plots.py +++ b/protzilla/data_integration/di_plots.py @@ -7,8 +7,6 @@ from protzilla.constants.protzilla_logging import logger from protzilla.utilities.utilities import fig_to_base64 -from protzilla.constants.colors import PLOT_COLOR_SEQUENCE - def GO_enrichment_bar_plot( input_df, diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index fa3a955c..9f30e9c8 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -5,7 +5,9 @@ from protzilla.utilities import default_intensity_column -def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict: +def by_log( + protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10" +) -> dict: """ This function log-transforms intensity DataFrames. 
Supports log-transformation to the base diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index c3d9136f..8dce747b 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -11,7 +11,10 @@ def max_quant_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"] try: @@ -34,15 +37,28 @@ def max_quant_import( c[len(intensity_name) + 1 :] for c in intensity_df.columns ] intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def ms_fragger_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in [ "Intensity", @@ -87,13 +103,25 @@ def ms_fragger_import( ) intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. 
Please provide a valid MS Fragger file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) -def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict: +def diann_import( + file_path, map_to_uniprot=False, aggregation_method: str = "Sum" +) -> dict: try: df = pd.read_csv( file_path, @@ -117,14 +145,27 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum" intensity_name = "Intensity" - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def transform_and_clean( - df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum" + df: pd.DataFrame, + intensity_name: str, + map_to_uniprot: bool, + aggregation_method: str = "Sum", ) -> dict: """ Transforms a dataframe that is read from a file in wide format into long format, @@ -158,7 +199,9 @@ def transform_and_clean( # applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise aggregation_method = aggregation_method.lower() agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}} - df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method]) + df = df.groupby("Protein ID", as_index=False).agg( + 
aggregation_method, **agg_kwargs[aggregation_method] + ) df = df.assign(Gene=lambda _: np.nan) # add deprecated genes column @@ -230,7 +273,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): all_ids_of_group.extend(new_ids) else: all_ids_of_group.append(old_id) - new_groups.append(all_ids_of_group[0] if all_ids_of_group else '') + new_groups.append(all_ids_of_group[0] if all_ids_of_group else "") return new_groups, removed_protein_ids diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 2d42e7b1..13d01fd2 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -7,15 +7,17 @@ k_means, ) from protzilla.data_analysis.differential_expression_anova import anova -from protzilla.data_analysis.differential_expression_kruskal_wallis import kruskal_wallis_test_on_ptm_data, \ - kruskal_wallis_test_on_intensity_data +from protzilla.data_analysis.differential_expression_kruskal_wallis import ( + kruskal_wallis_test_on_intensity_data, + kruskal_wallis_test_on_ptm_data, +) from protzilla.data_analysis.differential_expression_linear_model import linear_model from protzilla.data_analysis.differential_expression_mann_whitney import ( - mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data) + mann_whitney_test_on_intensity_data, + mann_whitney_test_on_ptm_data, +) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \ - ptms_per_protein_and_sample, select_peptides_of_protein from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -23,11 +25,17 @@ prot_quant_plot, scatter_plot, ) +from protzilla.data_analysis.power_analysis import ( + power_calculation, + power_calculation_for_all_proteins, + sample_size_calculation, + 
sample_size_calculation_for_all_proteins, +) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( - select_peptides_of_protein, ptms_per_protein_and_sample, ptms_per_sample, + select_peptides_of_protein, ) from protzilla.data_analysis.ptm_quantification import flexiquant_lf from protzilla.methods.data_preprocessing import TransformationLog @@ -113,8 +121,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "differential_expression" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) output_keys = [ "differentially_expressed_proteins_df", @@ -129,7 +139,9 @@ class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep): def insert_dataframes(self, steps: StepManager, inputs) -> dict: if steps.get_step_output(Step, "protein_df", inputs["protein_df"]) is not None: - inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"]) + inputs["protein_df"] = steps.get_step_output( + Step, "protein_df", inputs["protein_df"] + ) inputs["metadata_df"] = steps.metadata_df inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs @@ -138,8 +150,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "Peptide analysis" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." 
- "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) output_keys = [ "differentially_expressed_ptm_df", @@ -161,8 +175,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "differential_expression" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) output_keys = [ "differentially_expressed_proteins_df", @@ -182,8 +198,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "differential_expression" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." 
+ ) output_keys = [ "differentially_expressed_proteins_df", @@ -195,7 +213,9 @@ class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): calc_method = staticmethod(kruskal_wallis_test_on_intensity_data) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"]) + inputs["protein_df"] = steps.get_step_output( + Step, "protein_df", inputs["protein_df"] + ) inputs["metadata_df"] = steps.metadata_df inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs @@ -204,8 +224,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnPTM(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "Peptide analysis" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) output_keys = [ "differentially_expressed_ptm_df", @@ -225,10 +247,12 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PlotVolcano(DataAnalysisStep): display_name = "Volcano Plot" operation = "plot" - method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows " - "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user " - "can define a fold change threshold and an alpha level to highlight significant items.") - + method_description = ( + "Plots the results of a differential expression analysis in a volcano plot. The x-axis shows " + "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user " + "can define a fold change threshold and an alpha level to highlight significant items." 
+ ) + output_keys = [] plot_method = staticmethod(create_volcano_plot) @@ -258,7 +282,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PlotScatterPlot(DataAnalysisStep): display_name = "Scatter Plot" - operation = "plot" + operation = "plot" method_description = "Creates a scatter plot from data. This requires a dimension reduction method to be run first, as the input dataframe should contain only 2 or 3 columns." plot_method = staticmethod(scatter_plot) @@ -565,17 +589,23 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["metadata_df"] = steps.metadata_df if inputs["auto_select"]: - significant_proteins = ( - steps.get_step_output(DataAnalysisStep, "significant_proteins_df", inputs["protein_list"])) - index_of_most_significant_protein = significant_proteins['corrected_p_value'].idxmin() - most_significant_protein = significant_proteins.loc[index_of_most_significant_protein] + significant_proteins = steps.get_step_output( + DataAnalysisStep, "significant_proteins_df", inputs["protein_list"] + ) + index_of_most_significant_protein = significant_proteins[ + "corrected_p_value" + ].idxmin() + most_significant_protein = significant_proteins.loc[ + index_of_most_significant_protein + ] inputs["protein_id"] = [most_significant_protein["Protein ID"]] - self.messages.append({ - "level": logging.INFO, - "msg": - f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " - f"from {inputs['protein_list']}" - }) + self.messages.append( + { + "level": logging.INFO, + "msg": f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " + f"from {inputs['protein_list']}", + } + ) return inputs @@ -583,8 +613,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of a single protein of 
interest. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of a single protein of interest. " + "This function requires a peptide dataframe with PTM information." + ) output_keys = [ "ptm_df", @@ -602,8 +634,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsProteinAndPerSample(DataAnalysisStep): display_name = "PTMs per Sample and Protein" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of all Proteins. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of all Proteins. " + "This function requires a peptide dataframe with PTM information." + ) output_keys = [ "ptm_df", @@ -615,4 +649,143 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["peptide_df"] = steps.get_step_output( Step, "peptide_df", inputs["peptide_df"] ) - return inputs \ No newline at end of file + return inputs + + +class PowerAnalysisPowerCalculation(DataAnalysisStep): + display_name = "Power Calculation" + operation = "Power Analysis" + method_description = "Calculates power of the test for given protein groups" + + output_keys = ["power"] + + calc_method = staticmethod(power_calculation) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + 
return inputs + + def handle_calc_outputs(self, outputs : dict): + super().handle_calc_outputs(outputs) + self.display_output["power"] = f"Power of the test: {outputs['power']}" + + +class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): + display_name = "Sample Size Calculation" + operation = "Power Analysis" + method_description = "Calculates sample size for given protein groups" + + output_keys = [ + "required_sample_size", + "variance_protein_group", # TODO: remove this line before merging into main + ] + + calc_method = staticmethod(sample_size_calculation) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_calc_outputs(self, outputs: dict): + super().handle_calc_outputs(outputs) + self.display_output[ + "required_sample_size" + ] = f"Required Sample Size: {outputs['required_sample_size']}" + + +class PowerAnalysisSampleSizeCalculationForAllProteins(Step): + display_name = "Sample Size Calculation for All Proteins" + operation = "Power Analysis" + method_description = "Calculates sample size for a selected group of proteins and returns the maximum required sample size." 
+ + output_keys = [ + "required_sample_size_for_all_proteins", + "differentially_expressed_proteins_df", + "sample_size_dataframe", + "significant_proteins_df", + ] + + plot_method = staticmethod(sample_size_calculation_for_all_proteins) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_plot_outputs(self, outputs: dict): + super().handle_plot_outputs(outputs) + self.display_output[ + "required_sample_size_for_all_proteins" + ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" + + +class PowerAnalysisPowerCalculationForAllProteins(): + display_name = "Power Calculation for All Proteins" + operation = "Power Analysis" + method_description = "Calculates power for a selected group of proteins and returns the minimum power." 
+ + output_keys = [ + "power_for_all_proteins", + "differentially_expressed_proteins_df", + "power_dataframe", + "significant_proteins_df", + ] + + plot_method = staticmethod(power_calculation_for_all_proteins) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_plot_outputs(self, outputs: dict): + super().handle_plot_outputs(outputs) + self.display_output[ + "power_for_all_proteins" + ] = f"Power for all Proteins: {outputs['power_for_all_proteins']}" diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 5efb24b3..cea01192 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -10,7 +10,7 @@ max_quant_import, ms_fragger_import, ) -from protzilla.importing.peptide_import import peptide_import, evidence_import +from protzilla.importing.peptide_import import evidence_import, peptide_import from protzilla.steps import Step, StepManager @@ -47,7 +47,9 @@ class DiannImport(ImportingStep): class MsFraggerImport(ImportingStep): display_name = "MS Fragger Combined Protein Import" operation = "Protein Data Import" - method_description = "Import the combined_protein.tsv file form output of MS Fragger" + method_description = ( + "Import the combined_protein.tsv file form output of MS Fragger" + ) output_keys = ["protein_df"] @@ -118,4 +120,4 @@ class EvidenceImport(ImportingStep): output_keys = ["peptide_df"] - calc_method = 
staticmethod(evidence_import) \ No newline at end of file + calc_method = staticmethod(evidence_import) diff --git a/protzilla/steps.py b/protzilla/steps.py index 9448d26e..1fedcab7 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -10,8 +10,8 @@ from typing import Literal import pandas as pd -import plotly.io as pio import plotly.graph_objects as go +import plotly.io as pio from PIL import Image from protzilla.utilities import format_trace @@ -30,7 +30,9 @@ class Step: operation: str = None method_description: str = None output_keys: list[str] = [] - calculation_status: Literal["complete", "outdated", "incomplete", "failed"] = "incomplete" + calculation_status: Literal[ + "complete", "outdated", "incomplete", "failed" + ] = "incomplete" def __init__(self, instance_identifier: str | None = None): self.form_inputs: dict = {} @@ -38,6 +40,7 @@ def __init__(self, instance_identifier: str | None = None): self.output: Output = Output() self.plots: Plots = Plots() self.messages: Messages = Messages([]) + self.display_output: DisplayOutput = DisplayOutput() self.instance_identifier = instance_identifier if self.instance_identifier is None: @@ -69,16 +72,15 @@ def calculate(self, steps: StepManager, inputs: dict) -> bool: :return: None """ stepIndex = steps.all_steps.index(self) - previousStep = steps.all_steps[stepIndex-1] - - if (previousStep.calculation_status == "outdated" ): - if not previousStep.calculate(steps,inputs): + previousStep = steps.all_steps[stepIndex - 1] + + if previousStep.calculation_status == "outdated": + if not previousStep.calculate(steps, inputs): return False - if (steps.current_step_index == stepIndex): + if steps.current_step_index == stepIndex: self.updateInputs(inputs) self.messages.clear() - try: self.insert_dataframes(steps, self.inputs) @@ -88,9 +90,9 @@ def calculate(self, steps: StepManager, inputs: dict) -> bool: self.validate_outputs() self.calculation_status = "complete" - if (steps.failed_step_index == stepIndex): + if 
steps.failed_step_index == stepIndex: steps.failed_step_index = -1 - + if self.plot_method: plot_output = self.plot_method(**self.plot_input) self.handle_plot_outputs(plot_output) @@ -130,7 +132,7 @@ def calculate(self, steps: StepManager, inputs: dict) -> bool: trace=format_trace(traceback.format_exception(e)), ) ) - + if self.calculation_status != "complete": self.calculation_status = "failed" steps.failed_step_index = stepIndex @@ -157,7 +159,7 @@ def handle_calc_outputs(self, outputs: dict) -> None: self.handle_messages(outputs) - def handle_plot_outputs(self, outputs: dict|list) -> None: + def handle_plot_outputs(self, outputs: dict | list) -> None: """ Handles the dictionary from the plot method and creates a Plots object from it. Responsible for clearing and setting the plots attribute of the class. @@ -167,14 +169,14 @@ def handle_plot_outputs(self, outputs: dict|list) -> None: if not isinstance(outputs, dict) and not isinstance(outputs, list): raise TypeError("Output of plot method is not a dictionary or a list.") - + if isinstance(outputs, dict): plots = outputs.pop("plots", []) self.output.output.update(outputs) self.handle_messages(outputs) else: plots = outputs - + self.plots = Plots(plots) def handle_messages(self, outputs: dict) -> None: @@ -188,7 +190,7 @@ def handle_messages(self, outputs: dict) -> None: self.messages.extend(messages) calc_method = None - plot_method = None # if the plot method uses the output of the calculation method, it should be prefixed with "output_" + plot_method = None # if the plot method uses the output of the calculation method, it should be prefixed with "output_" @property def calculation_input(self) -> dict: @@ -239,14 +241,13 @@ def validate_outputs(self, soft_check: bool = False) -> bool: :return: True if the outputs are valid, False otherwise :raises ValueError: If a required key is missing in the outputs """ - + print("Val0.0") for key in self.output_keys: print("Val0.5") if key not in self.output or 
self.output[key] is None: print("Val0.7") if not soft_check: - print("val1.0") raise ValueError( f"Output validation failed: missing output {key} in outputs." @@ -257,7 +258,6 @@ def validate_outputs(self, soft_check: bool = False) -> bool: class Output: - def __init__(self, output: dict = {}): if output is None: output = {} @@ -332,9 +332,10 @@ def export(self, settings: dict) -> list: :return: List of all exported plots. """ from ui.settings.plot_template import get_scale_factor + exports = [] format_ = settings["file_format"] - + for plot in self.plots: scale_factor = get_scale_factor(plot, settings) # For Plotly GO Figure @@ -372,6 +373,31 @@ def export(self, settings: dict) -> list: return exports +class DisplayOutput: + def __init__(self, display_output: dict = None): + if display_output is None: + display_output = {} + self.display_output = display_output + + def __iter__(self): + return iter(self.display_output) + + def __repr__(self): + return f"DisplayOutput: {self.display_output}" + + def __contains__(self, key): + return key in self.display_output + + def __getitem__(self, key): + return self.display_output[key] + + def __setitem__(self, key, value): + self.display_output[key] = value + + def is_empty(self) -> bool: + return len(self.display_output) == 0 + + class StepManager: def __repr__(self): return f"IMP: {self.importing} PRE: {self.data_preprocessing} ANA: {self.data_analysis} INT: {self.data_integration}" @@ -529,19 +555,19 @@ def all_steps_in_section(self, section: str) -> list[Step]: return self.sections[section] else: raise ValueError(f"Unknown section {section}") - + def set_steps_outdated(self, offset: int) -> None: count = 0 for step in self.following_steps[offset:]: - if (step.calculation_status == "complete"): + if step.calculation_status == "complete": step.calculation_status = "outdated" - count+=1 + count += 1 return count @property def previous_steps(self) -> list[Step]: return self.all_steps[: self.current_step_index] - + @property 
def following_steps(self) -> list[Step]: return self.all_steps[self.current_step_index :] @@ -587,7 +613,7 @@ def preprocessed_output(self) -> Output: if self.current_section == "data_preprocessing": return ( self.current_step.output - if self.current_step.calculation_status!="incomplete" + if self.current_step.calculation_status != "incomplete" else self.previous_steps[-1].output ) return self.data_preprocessing[-1].output diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index 59a83259..0ca48973 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -17,7 +17,9 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): packages such as sklearn :rtype: pd.DataFrame """ - values_name = default_intensity_column(intensity_df) if value_name is None else value_name + values_name = ( + default_intensity_column(intensity_df) if value_name is None else value_name + ) return pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name ) diff --git a/tests/conftest.py b/tests/conftest.py index 7762bd7b..526b48b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -207,30 +207,270 @@ def peptides_df(): def evidence_peptide_df(): df = pd.DataFrame( ( - ["Sample1", "Protein1", "SEQA", 1000000, "Unmodified", "_SEQA_", 1, 0.00001, "Raw_File_1"], - ["Sample1", "Protein2", "SEQB", 2000000, "Unmodified", "_SEQB_", None, 0.00002, "Raw_File_1"], - ["Sample1", "Protein2", "SEQC", 3000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQC_", None, 0.00003, "Raw_File_1"], - ["Sample1", "Protein2", "SEQD", 4000000, "Acetyl (Protein N-term),Oxidation (M)", "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_", None, 0.00004, "Raw_File_1"], - ["Sample1", "Protein3", "SEQE", 5000000, "Unmodified", "_SEQE_", None, 0.00005, "Raw_File_1"], - ["Sample1", "Protein3", "SEQF", 6000000, "Unmodified", "_SEQF_", None, 0.00006, "Raw_File_1"], - ["Sample1", "Protein3", 
"SEQG", 7000000, "Unmodified", "_SEQG_", None, 0.00007, "Raw_File_1"], - ["Sample1", "Protein3", "SEQGG", 7000000, "Unmodified", "_SEQGG_", None, 0.00007, "Raw_File_1"], - ["Sample1", "Protein4", "SEQH", 8000000, "Unmodified", "_SEQH_", None, 0.00008, "Raw_File_1"], - ["Sample1", "Protein5", "SEQI", 9000000, "Unmodified", "_SEQI_", None, 0.00009, "Raw_File_1"], - ["Sample2", "Protein1", "SEQJ", 10000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQJ_", None, 0.0001, "Raw_File_2"], - ["Sample2", "Protein2", "SEQK", 11000000, "Unmodified", "_SEQK_", None, 0.00011, "Raw_File_2"], - ["Sample2", "Protein3", "SEQL", 12000000, "Unmodified", "_SEQL_", None, 0.00012, "Raw_File_2"], - ["Sample2", "Protein4", "SEQM", 13000000, "Unmodified", "_SEQM_", None, 0.00013, "Raw_File_2"], - ["Sample2", "Protein5", "SEQN", 14000000, "Unmodified", "_SEQN_", None, 0.00014, "Raw_File_2"], - ["Sample3", "Protein1", "SEQO", 15000000, "Unmodified", "_SEQO_", None, 0.00015, "Raw_File_3"], - ["Sample3", "Protein2", "SEQP", 16000000, "Unmodified", "_SEQP_", None, 0.00016, "Raw_File_3"], - ["Sample3", "Protein3", "SEQQ", 17000000, "Unmodified", "_SEQQ_", None, 0.00017, "Raw_File_3"], - ["Sample3", "Protein4", "SEQR", 18000000, "Unmodified", "_SEQR_", None, 0.00018, "Raw_File_3"], - ["Sample3", "Protein5", "SEQS", 19000000, "Unmodified", "_SEQS_", None, 0.00019, "Raw_File_3"], - ["Sample4", "Protein1", "SEQT", 20000000, "Unmodified", "_SEQT_", None, 0.0002, "Raw_File_4"], - ["Sample4", "Protein2", "SEQU", 21000000, "Unmodified", "_SEQU_", None, 0.00021, "Raw_File_4"], - ["Sample4", "Protein3", "SEQV", 22000000, "Unmodified", "_SEQV_", None, 0.00022, "Raw_File_4"], - ["Sample4", "Protein4", "SEQW", 23000000, "Unmodified", "_SEQW_", None, 0.00023, "Raw_File_4"], + [ + "Sample1", + "Protein1", + "SEQA", + 1000000, + "Unmodified", + "_SEQA_", + 1, + 0.00001, + "Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQB", + 2000000, + "Unmodified", + "_SEQB_", + None, + 0.00002, + 
"Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQC", + 3000000, + "Acetyl (Protein N-term)", + "_(Acetyl (Protein N-term))SEQC_", + None, + 0.00003, + "Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQD", + 4000000, + "Acetyl (Protein N-term),Oxidation (M)", + "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_", + None, + 0.00004, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQE", + 5000000, + "Unmodified", + "_SEQE_", + None, + 0.00005, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQF", + 6000000, + "Unmodified", + "_SEQF_", + None, + 0.00006, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQG", + 7000000, + "Unmodified", + "_SEQG_", + None, + 0.00007, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQGG", + 7000000, + "Unmodified", + "_SEQGG_", + None, + 0.00007, + "Raw_File_1", + ], + [ + "Sample1", + "Protein4", + "SEQH", + 8000000, + "Unmodified", + "_SEQH_", + None, + 0.00008, + "Raw_File_1", + ], + [ + "Sample1", + "Protein5", + "SEQI", + 9000000, + "Unmodified", + "_SEQI_", + None, + 0.00009, + "Raw_File_1", + ], + [ + "Sample2", + "Protein1", + "SEQJ", + 10000000, + "Acetyl (Protein N-term)", + "_(Acetyl (Protein N-term))SEQJ_", + None, + 0.0001, + "Raw_File_2", + ], + [ + "Sample2", + "Protein2", + "SEQK", + 11000000, + "Unmodified", + "_SEQK_", + None, + 0.00011, + "Raw_File_2", + ], + [ + "Sample2", + "Protein3", + "SEQL", + 12000000, + "Unmodified", + "_SEQL_", + None, + 0.00012, + "Raw_File_2", + ], + [ + "Sample2", + "Protein4", + "SEQM", + 13000000, + "Unmodified", + "_SEQM_", + None, + 0.00013, + "Raw_File_2", + ], + [ + "Sample2", + "Protein5", + "SEQN", + 14000000, + "Unmodified", + "_SEQN_", + None, + 0.00014, + "Raw_File_2", + ], + [ + "Sample3", + "Protein1", + "SEQO", + 15000000, + "Unmodified", + "_SEQO_", + None, + 0.00015, + "Raw_File_3", + ], + [ + "Sample3", + "Protein2", + "SEQP", + 16000000, + "Unmodified", + "_SEQP_", + None, + 0.00016, + "Raw_File_3", + ], + [ + "Sample3", + 
"Protein3", + "SEQQ", + 17000000, + "Unmodified", + "_SEQQ_", + None, + 0.00017, + "Raw_File_3", + ], + [ + "Sample3", + "Protein4", + "SEQR", + 18000000, + "Unmodified", + "_SEQR_", + None, + 0.00018, + "Raw_File_3", + ], + [ + "Sample3", + "Protein5", + "SEQS", + 19000000, + "Unmodified", + "_SEQS_", + None, + 0.00019, + "Raw_File_3", + ], + [ + "Sample4", + "Protein1", + "SEQT", + 20000000, + "Unmodified", + "_SEQT_", + None, + 0.0002, + "Raw_File_4", + ], + [ + "Sample4", + "Protein2", + "SEQU", + 21000000, + "Unmodified", + "_SEQU_", + None, + 0.00021, + "Raw_File_4", + ], + [ + "Sample4", + "Protein3", + "SEQV", + 22000000, + "Unmodified", + "_SEQV_", + None, + 0.00022, + "Raw_File_4", + ], + [ + "Sample4", + "Protein4", + "SEQW", + 23000000, + "Unmodified", + "_SEQW_", + None, + 0.00023, + "Raw_File_4", + ], ), columns=[ "Sample", @@ -242,7 +482,7 @@ def evidence_peptide_df(): "Missed cleavages", "PEP", "Raw file", - ] + ], ) return df diff --git a/tests/protzilla/data_analysis/power_analysis_validation.py b/tests/protzilla/data_analysis/power_analysis_validation.py new file mode 100644 index 00000000..09586de8 --- /dev/null +++ b/tests/protzilla/data_analysis/power_analysis_validation.py @@ -0,0 +1,193 @@ +import math + +import numpy as np +import pandas as pd +from scipy import stats +from statsmodels.stats.power import TTestIndPower + + +def check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None, +) -> dict: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. 
+ :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): + raise ValueError("Please select a valid protein group.") + + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = np.log2( + protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + ) + group2_intensities = np.log2( + protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + ) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) + abs(group1_intensities.mean() - group2_intensities.mean()) + effect_size = (group1_intensities.mean() - group2_intensities.mean()) / sd_pooled + + obj = TTestIndPower() + required_sample_size = obj.solve_power( + effect_size=effect_size, + alpha=alpha, + power=power, + nobs1=None, + ratio=1.0, + alternative="two-sided", + ) + print(required_sample_size) + + required_sample_size = math.ceil(required_sample_size) + + return dict(required_sample_size=required_sample_size) + # required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 + + # impl: 
required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 + + +def check_sample_size_calculation_protzilla( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None, +) -> dict: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. 
+ """ + + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = protein_group[protein_group["Group"] == group1][ + "Normalised iBAQ" + ].values + group2_intensities = protein_group[protein_group["Group"] == group2][ + "Normalised iBAQ" + ].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = ( + 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance + ) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) + + +def check_sample_size_calculation_protzilla_without_log( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None, +) -> dict: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. 
+ :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = protein_group[protein_group["Group"] == group1][ + "Normalised iBAQ" + ].values + group2_intensities = protein_group[protein_group["Group"] == group2][ + "Normalised iBAQ" + ].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = ( + 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance + ) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) diff --git a/tests/protzilla/data_analysis/test_analysis_plots.py b/tests/protzilla/data_analysis/test_analysis_plots.py index eca37a85..d5ef673f 100644 --- a/tests/protzilla/data_analysis/test_analysis_plots.py +++ b/tests/protzilla/data_analysis/test_analysis_plots.py @@ -83,7 +83,9 @@ def test_plots_volcano_plot_no_annotation(ttest_input, ttest_output, show_figure fig.show() -def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, 
show_figures): +def test_plots_volcano_plot_multiple_annotations( + ttest_input, ttest_output, show_figures +): fig = create_volcano_plot( p_values=ttest_output["corrected_p_values_df"], log2_fc=ttest_output["log2_fold_change_df"], diff --git a/tests/protzilla/data_analysis/test_differential_expression.py b/tests/protzilla/data_analysis/test_differential_expression.py index 3ecf569e..b9e92246 100644 --- a/tests/protzilla/data_analysis/test_differential_expression.py +++ b/tests/protzilla/data_analysis/test_differential_expression.py @@ -6,12 +6,12 @@ from protzilla.data_analysis.differential_expression import ( anova, + kruskal_wallis_test_on_intensity_data, + kruskal_wallis_test_on_ptm_data, linear_model, - t_test, mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data, - kruskal_wallis_test_on_intensity_data, - kruskal_wallis_test_on_ptm_data + t_test, ) from protzilla.data_analysis.plots import create_volcano_plot @@ -72,8 +72,8 @@ def diff_expr_test_data(): def test_differential_expression_linear_model( - diff_expr_test_data, - show_figures, + diff_expr_test_data, + show_figures, ): test_intensity_df, test_metadata_df = diff_expr_test_data test_alpha = 0.05 @@ -117,8 +117,8 @@ def test_differential_expression_linear_model( assert p_values_rounded == corrected_p_values assert log2fc_rounded == log2_fc assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha @@ -170,13 +170,13 @@ def test_differential_expression_student_t_test(diff_expr_test_data, show_figure assert p_values_rounded == corrected_p_values assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein 
ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha assert ( - list(current_out["significant_proteins_df"]["Protein ID"].unique()) - == significant_proteins + list(current_out["significant_proteins_df"]["Protein ID"].unique()) + == significant_proteins ) @@ -227,13 +227,13 @@ def test_differential_expression_welch_t_test(diff_expr_test_data, show_figures) assert p_values_rounded == corrected_p_values assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha assert ( - list(current_out["significant_proteins_df"]["Protein ID"].unique()) - == significant_proteins + list(current_out["significant_proteins_df"]["Protein ID"].unique()) + == significant_proteins ) @@ -400,8 +400,8 @@ def test_differential_expression_anova(show_figures): def test_differential_expression_mann_whitney_on_intensity( - diff_expr_test_data, - show_figures, + diff_expr_test_data, + show_figures, ): test_intensity_df, test_metadata_df = diff_expr_test_data test_alpha = 0.05 @@ -434,7 +434,12 @@ def test_differential_expression_mann_whitney_on_intensity( expected_corrected_p_values = [0.2, 0.4916, 1.0, 0.2] expected_u_statistics = [9.0, 7.0, 4.5, 9.0] expected_log2_fc = [-10.1926, -1.0, 0.0, -5.0] - expected_differentially_expressed_proteins = ["Protein1", "Protein2", "Protein3", "Protein4"] + expected_differentially_expressed_proteins = [ + "Protein1", + "Protein2", + "Protein3", + "Protein4", + ] p_values_rounded = [ round(x, 4) for x in current_out["corrected_p_values_df"]["corrected_p_value"] @@ -448,15 +453,15 @@ def test_differential_expression_mann_whitney_on_intensity( assert all(u_statistics == expected_u_statistics) assert log2fc_rounded == expected_log2_fc assert ( - 
list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == expected_differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == expected_differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha def test_differential_expression_kruskal_wallis_on_intensity_three_groups( - diff_expr_test_data, - show_figures, + diff_expr_test_data, + show_figures, ): test_intensity_df, test_metadata_df = diff_expr_test_data test_alpha = 0.05 @@ -475,7 +480,12 @@ def test_differential_expression_kruskal_wallis_on_intensity_three_groups( expected_corrected_p_values = [0.175, 0.33, 0.5712, 0.175] expected_h_statistics = [4.963, 2.7925, 1.12, 4.8727] - expected_differentially_expressed_proteins = ["Protein1", "Protein2", "Protein3", "Protein4"] + expected_differentially_expressed_proteins = [ + "Protein1", + "Protein2", + "Protein3", + "Protein4", + ] p_values_rounded = [ round(x, 4) for x in current_out["corrected_p_values_df"]["corrected_p_value"] @@ -487,15 +497,15 @@ def test_differential_expression_kruskal_wallis_on_intensity_three_groups( assert p_values_rounded == expected_corrected_p_values assert h_statistics_rounded == expected_h_statistics assert ( - list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) - == expected_differentially_expressed_proteins + list(current_out["differentially_expressed_proteins_df"]["Protein ID"].unique()) + == expected_differentially_expressed_proteins ) assert current_out["corrected_alpha"] == test_alpha def test_differential_expression_kruskal_wallis_on_intensity_group_handling( - diff_expr_test_data, - show_figures, + diff_expr_test_data, + show_figures, ): test_intensity_df, test_metadata_df = diff_expr_test_data test_alpha = 0.05 @@ -513,15 +523,13 @@ def test_differential_expression_kruskal_wallis_on_intensity_group_handling( assert "messages" in current_out assert any( - message["level"] == 
logging.WARNING and - "Group \'wrong_group\' were not found in metadata" in message["msg"] - + message["level"] == logging.WARNING + and "Group 'wrong_group' were not found in metadata" in message["msg"] for message in current_out["messages"] ) assert any( - message["level"] == logging.WARNING and - "Auto-selected the groups \'Group1\', \'Group2\', \'Group3\'" in message["msg"] - + message["level"] == logging.WARNING + and "Auto-selected the groups 'Group1', 'Group2', 'Group3'" in message["msg"] for message in current_out["messages"] ) @@ -550,12 +558,18 @@ def ptm_test_data(): ["Sample13", 13, 7, 18, 3333, 100, 100], ["Sample14", 14, 8, 19, 4444, 100, 100], ["Sample15", 15, 9, 20, 5555, 100, 100], - - ) test_amount_df = pd.DataFrame( data=test_amount_list, - columns=["Sample", "Oxidation", "Acetyl", "GlyGly", "Phospho", "Unmodified", "Total Amount of Peptides"], + columns=[ + "Sample", + "Oxidation", + "Acetyl", + "GlyGly", + "Phospho", + "Unmodified", + "Total Amount of Peptides", + ], ) test_metadata_list = ( @@ -584,8 +598,8 @@ def ptm_test_data(): def test_differential_expression_mann_whitney_on_ptm( - ptm_test_data, - show_figures, + ptm_test_data, + show_figures, ): test_amount_df, test_metadata_df = ptm_test_data test_alpha = 0.05 @@ -620,15 +634,15 @@ def test_differential_expression_mann_whitney_on_ptm( assert all(u_statistics == expected_u_statistics) assert log2_fc_rounded == expected_log2_fc assert ( - list(current_out["significant_ptm_df"]["PTM"].unique()) - == expected_significant_ptms + list(current_out["significant_ptm_df"]["PTM"].unique()) + == expected_significant_ptms ) assert current_out["corrected_alpha"] == test_alpha def test_differential_expression_kruskal_wallis_on_ptm( - ptm_test_data, - show_figures, + ptm_test_data, + show_figures, ): test_amount_df, test_metadata_df = ptm_test_data test_alpha = 0.05 @@ -657,7 +671,7 @@ def test_differential_expression_kruskal_wallis_on_ptm( assert p_values_rounded == expected_corrected_p_values assert 
h_statistics_rounded == expected_h_statistics assert ( - list(current_out["significant_ptm_df"]["PTM"].unique()) - == expected_significant_ptms + list(current_out["significant_ptm_df"]["PTM"].unique()) + == expected_significant_ptms ) - assert current_out["corrected_alpha"] == test_alpha \ No newline at end of file + assert current_out["corrected_alpha"] == test_alpha diff --git a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py index 23d739a2..afcccac4 100644 --- a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py +++ b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py @@ -1,13 +1,18 @@ -import pytest - from protzilla.data_analysis.ptm_analysis import select_peptides_of_protein def test_filter_peptides_of_protein(peptides_df): - filtered_peptides_df = select_peptides_of_protein(peptides_df, ["Protein2"])["peptide_df"] + filtered_peptides_df = select_peptides_of_protein(peptides_df, ["Protein2"])[ + "peptide_df" + ] assert len(filtered_peptides_df) == 6 assert filtered_peptides_df["Sequence"].tolist() == [ - "SEQB", "SEQC", "SEQD", "SEQK", "SEQP", "SEQU" + "SEQB", + "SEQC", + "SEQD", + "SEQK", + "SEQP", + "SEQU", ] - assert (filtered_peptides_df["Protein ID"] == "Protein2").all() \ No newline at end of file + assert (filtered_peptides_df["Protein ID"] == "Protein2").all() diff --git a/tests/protzilla/data_analysis/test_peptide_analysis.py b/tests/protzilla/data_analysis/test_peptide_analysis.py index 8e87de88..7f0af9ae 100644 --- a/tests/protzilla/data_analysis/test_peptide_analysis.py +++ b/tests/protzilla/data_analysis/test_peptide_analysis.py @@ -1,18 +1,28 @@ import pytest -from protzilla.data_analysis.ptm_analysis import select_peptides_of_protein, ptms_per_sample, \ - ptms_per_protein_and_sample +from protzilla.data_analysis.ptm_analysis import ( + ptms_per_protein_and_sample, + ptms_per_sample, + select_peptides_of_protein, +) 
@pytest.mark.parametrize("df_num", [0, 1]) def test_select_peptides_of_protein(peptides_df, evidence_peptide_df, df_num): peptide_df = [peptides_df, evidence_peptide_df][df_num] - filtered_peptides_df = select_peptides_of_protein(peptide_df, ["Protein2"])["peptide_df"] + filtered_peptides_df = select_peptides_of_protein(peptide_df, ["Protein2"])[ + "peptide_df" + ] assert len(filtered_peptides_df) == 6 assert filtered_peptides_df["Sequence"].tolist() == [ - 'SEQB', 'SEQC', 'SEQD', 'SEQK', 'SEQP', 'SEQU' + "SEQB", + "SEQC", + "SEQD", + "SEQK", + "SEQP", + "SEQU", ] assert (filtered_peptides_df["Protein ID"] == "Protein2").all() @@ -20,7 +30,13 @@ def test_select_peptides_of_protein(peptides_df, evidence_peptide_df, df_num): def test_ptms_per_sampel(evidence_peptide_df): ptm_df = ptms_per_sample(evidence_peptide_df)["ptm_df"] - assert ptm_df.columns.tolist() == ["Sample", "Acetyl (Protein N-term)", "Oxidation (M)", "Unmodified", "Total Amount of Peptides"] + assert ptm_df.columns.tolist() == [ + "Sample", + "Acetyl (Protein N-term)", + "Oxidation (M)", + "Unmodified", + "Total Amount of Peptides", + ] assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"] assert ptm_df["Unmodified"].tolist() == [8, 4, 5, 4] assert ptm_df["Acetyl (Protein N-term)"].tolist() == [2, 1, 0, 0] @@ -31,15 +47,42 @@ def test_ptms_per_sampel(evidence_peptide_df): def test_ptms_per_protein_and_sample(evidence_peptide_df): ptm_df = ptms_per_protein_and_sample(evidence_peptide_df)["ptm_df"] - assert ptm_df.columns.tolist() == ["Sample", "Protein1", "Protein2", "Protein3", "Protein4", "Protein5"] + assert ptm_df.columns.tolist() == [ + "Sample", + "Protein1", + "Protein2", + "Protein3", + "Protein4", + "Protein5", + ] assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"] - assert (ptm_df["Protein1"].tolist() == - ["(1) Unmodified, ", "(1) Acetyl (Protein N-term), ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein2"].tolist() 
== - ["(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein3"].tolist() == - ["(4) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein4"].tolist() == - ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein5"].tolist() == - ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", ""]) \ No newline at end of file + assert ptm_df["Protein1"].tolist() == [ + "(1) Unmodified, ", + "(1) Acetyl (Protein N-term), ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein2"].tolist() == [ + "(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein3"].tolist() == [ + "(4) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein4"].tolist() == [ + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein5"].tolist() == [ + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "", + ] diff --git a/tests/protzilla/data_analysis/test_plots_data_analysis.py b/tests/protzilla/data_analysis/test_plots_data_analysis.py index 60403907..ae2d9957 100644 --- a/tests/protzilla/data_analysis/test_plots_data_analysis.py +++ b/tests/protzilla/data_analysis/test_plots_data_analysis.py @@ -101,14 +101,20 @@ def test_scatter_plot_4d_df(wide_4d_df, color_df): assert "messages" in outputs assert "plots" not in outputs - assert any("Consider reducing the dimensionality" in message["msg"] for message in outputs["messages"]) + assert any( + "Consider reducing the dimensionality" in message["msg"] + for message in outputs["messages"] + ) def test_scatter_plot_color_df_2d(show_figures, wide_2d_df): outputs = scatter_plot(wide_2d_df, wide_2d_df) assert 
"messages" in outputs assert "plots" not in outputs - assert any("The color dataframe should have 1 dimension only" in message["msg"] for message in outputs["messages"]) + assert any( + "The color dataframe should have 1 dimension only" in message["msg"] + for message in outputs["messages"] + ) def test_clustergram(show_figures, wide_4d_df, color_df): @@ -151,8 +157,10 @@ def test_clustergram_input_not_right_type(wide_4d_df): assert "messages" in outputs2 assert "plots" not in outputs2 assert any( - 'The selected input for "grouping dataframe" is not a dataframe, ' in message["msg"] - for message in outputs2["messages"]) + 'The selected input for "grouping dataframe" is not a dataframe, ' + in message["msg"] + for message in outputs2["messages"] + ) def test_clustergram_dimension_mismatch(wide_4d_df): @@ -176,7 +184,10 @@ def test_clustergram_dimension_mismatch(wide_4d_df): ) assert "messages" in outputs assert "plots" not in outputs - assert any("There is a dimension mismatch" in message["msg"] for message in outputs["messages"]) + assert any( + "There is a dimension mismatch" in message["msg"] + for message in outputs["messages"] + ) def test_clustergram_different_samples(wide_4d_df): @@ -200,6 +211,7 @@ def test_clustergram_different_samples(wide_4d_df): assert "messages" in outputs assert "plots" not in outputs assert any( - "The input dataframe and the grouping contain different samples" in message["msg"] + "The input dataframe and the grouping contain different samples" + in message["msg"] for message in outputs["messages"] - ) \ No newline at end of file + ) diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py new file mode 100644 index 00000000..8d551610 --- /dev/null +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -0,0 +1,344 @@ +"""import numpy as np +import pandas as pd +import pytest +import math +from scipy import stats + +from protzilla.data_analysis.power_analysis 
import ( + power_calculation, + sample_size_calculation, + variance_protein_group_calculation_max, +) +from tests.protzilla.data_analysis.power_analysis_validation import ( + check_sample_size_calculation_with_libfunc, + check_sample_size_calculation_protzilla, + check_sample_size_calculation_protzilla_without_log, +) +from test_differential_expression import diff_expr_test_data + + +@pytest.fixture +def power_test_data(): + test_differentially_expressed_proteins_list = ( + ["Sample1", "Protein1", "Gene1", 18, "Group1"], + ["Sample1", "Protein2", "Gene1", 16, "Group1"], + ["Sample1", "Protein3", "Gene1", 1, "Group1"], + ["Sample1", "Protein4", "Gene1", 14, "Group1"], + ["Sample2", "Protein1", "Gene1", 19, "Group1"], + ["Sample2", "Protein2", "Gene1", 15, "Group1"], + ["Sample2", "Protein3", "Gene1", 2, "Group1"], + ["Sample2", "Protein4", "Gene1", 15, "Group1"], + ["Sample3", "Protein1", "Gene1", 22, "Group1"], + ["Sample3", "Protein2", "Gene1", 14, "Group1"], + ["Sample3", "Protein3", "Gene1", 3, "Group1"], + ["Sample3", "Protein4", "Gene1", 16, "Group1"], + ["Sample4", "Protein1", "Gene1", 16, "Group1"], + ["Sample4", "Protein2", "Gene1", 14, "Group1"], + ["Sample4", "Protein3", "Gene1", 3, "Group1"], + ["Sample4", "Protein4", "Gene1", 16, "Group1"], + ["Sample5", "Protein1", "Gene1", 24, "Group1"], + ["Sample5", "Protein2", "Gene1", 14, "Group1"], + ["Sample5", "Protein3", "Gene1", 3, "Group1"], + ["Sample5", "Protein4", "Gene1", 16, "Group1"], + ["Sample6", "Protein1", "Gene1", 21, "Group1"], + ["Sample6", "Protein2", "Gene1", 14, "Group1"], + ["Sample6", "Protein3", "Gene1", 3, "Group1"], + ["Sample6", "Protein4", "Gene1", 16, "Group1"], + ["Sample7", "Protein1", "Gene1", 8, "Group2"], + ["Sample7", "Protein2", "Gene1", 15, "Group2"], + ["Sample7", "Protein3", "Gene1", 1, "Group2"], + ["Sample7", "Protein4", "Gene1", 9, "Group2"], + ["Sample8", "Protein1", "Gene1", 9, "Group2"], + ["Sample8", "Protein2", "Gene1", 14, "Group2"], + ["Sample8", "Protein3", 
"Gene1", 2, "Group2"], + ["Sample8", "Protein4", "Gene1", 10, "Group2"], + ["Sample9", "Protein1", "Gene1", 12, "Group2"], + ["Sample9", "Protein2", "Gene1", 13, "Group2"], + ["Sample9", "Protein3", "Gene1", 3, "Group2"], + ["Sample9", "Protein4", "Gene1", 11, "Group2"], + ["Sample10", "Protein1", "Gene1", 6, "Group2"], + ["Sample10", "Protein2", "Gene1", 13, "Group2"], + ["Sample10", "Protein3", "Gene1", 3, "Group2"], + ["Sample10", "Protein4", "Gene1", 11, "Group2"], + ["Sample11", "Protein1", "Gene1", 14, "Group2"], + ["Sample11", "Protein2", "Gene1", 13, "Group2"], + ["Sample11", "Protein3", "Gene1", 3, "Group2"], + ["Sample11", "Protein4", "Gene1", 11, "Group2"], + ["Sample12", "Protein1", "Gene1", 11, "Group2"], + ["Sample12", "Protein2", "Gene1", 13, "Group2"], + ["Sample12", "Protein3", "Gene1", 3, "Group2"], + ["Sample12", "Protein4", "Gene1", 11, "Group2"], + ) + + test_differentially_expressed_proteins_df = pd.DataFrame( + data=test_differentially_expressed_proteins_list, + columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"], + ) + return test_differentially_expressed_proteins_df + +@pytest.fixture +def power_test_data_intensity_values(): + test_differentially_expressed_proteins_list = ( + ["Sample1", "Protein1", "Gene1", -1.56714, "Group1"], + ["Sample1", "Protein2", "Gene1", 16, "Group1"], + ["Sample1", "Protein3", "Gene1", 1, "Group1"], + ["Sample1", "Protein4", "Gene1", 14, "Group1"], + ["Sample2", "Protein1", "Gene1", -0.37691, "Group1"], + ["Sample2", "Protein2", "Gene1", 15, "Group1"], + ["Sample2", "Protein3", "Gene1", 2, "Group1"], + ["Sample2", "Protein4", "Gene1", 15, "Group1"], + ["Sample3", "Protein1", "Gene1", 0.38817, "Group1"], + ["Sample3", "Protein2", "Gene1", 14, "Group1"], + ["Sample3", "Protein3", "Gene1", 3, "Group1"], + ["Sample3", "Protein4", "Gene1", 16, "Group1"], + ["Sample4", "Protein1", "Gene1", 1.6, "Group1"], + ["Sample4", "Protein2", "Gene1", 14, "Group1"], + ["Sample4", "Protein3", "Gene1", 3, "Group1"], 
+ ["Sample4", "Protein4", "Gene1", 16, "Group1"], + ["Sample5", "Protein1", "Gene1", 1.9, "Group1"], + ["Sample5", "Protein2", "Gene1", 14, "Group1"], + ["Sample5", "Protein3", "Gene1", 3, "Group1"], + ["Sample5", "Protein4", "Gene1", 16, "Group1"], + ["Sample6", "Protein1", "Gene1", -0.07, "Group1"], + ["Sample6", "Protein2", "Gene1", 14, "Group1"], + ["Sample6", "Protein3", "Gene1", 3, "Group1"], + ["Sample6", "Protein4", "Gene1", 16, "Group1"], + ["Sample7", "Protein1", "Gene1", 0.9819, "Group2"], + ["Sample7", "Protein2", "Gene1", 15, "Group2"], + ["Sample7", "Protein3", "Gene1", 1, "Group2"], + ["Sample7", "Protein4", "Gene1", 9, "Group2"], + ["Sample8", "Protein1", "Gene1", -0.26, "Group2"], + ["Sample8", "Protein2", "Gene1", 13, "Group2"], + ["Sample8", "Protein3", "Gene1", 3, "Group2"], + ["Sample8", "Protein4", "Gene1", 11, "Group2"], + ["Sample9", "Protein1", "Gene1", 1.116, "Group2"], + ["Sample9", "Protein2", "Gene1", 14, "Group2"], + ["Sample9", "Protein3", "Gene1", 3, "Group2"], + ["Sample9", "Protein4", "Gene1", 16, "Group2"], + ["Sample10", "Protein1", "Gene1", 0.81, "Group2"], + ["Sample10", "Protein2", "Gene1", 14, "Group2"], + ["Sample10", "Protein3", "Gene1", 3, "Group2"], + ["Sample10", "Protein4", "Gene1", 16, "Group2"], + ["Sample11", "Protein1", "Gene1", 1.336, "Group2"], + ["Sample11", "Protein2", "Gene1", 14, "Group2"], + ["Sample11", "Protein3", "Gene1", 3, "Group2"], + ["Sample11", "Protein4", "Gene1", 16, "Group2"], + ["Sample12", "Protein1", "Gene1", 1.81, "Group2"], + ["Sample12", "Protein2", "Gene1", 14, "Group2"], + ["Sample12", "Protein3", "Gene1", 2, "Group2"], + ["Sample12", "Protein4", "Gene1", 10, "Group2"], + ) + + test_differentially_expressed_proteins_df = pd.DataFrame( + data=test_differentially_expressed_proteins_list, + columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"], + ) + return test_differentially_expressed_proteins_df + + +def test_variance_protein_group_calculation(power_test_data): + 
intensity_df = power_test_data + + protein_id = "Protein1" + group1 = "Group1" + group2 = "Group2" + + variance = variance_protein_group_calculation_max( + intensity_df, protein_id, group1, group2 + ) + print(variance) + assert variance == 4.0 + + +def test_sample_size_calculation(power_test_data, diff_expr_test_data): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + test_individual_column = "None" + test_differentially_expressed_proteins_df, test_metadata_df = diff_expr_test_data + + required_sample_size = sample_size_calculation( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + metadata_df=test_metadata_df, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + individual_column=test_individual_column, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + + +def test_power_calculation(power_test_data, diff_expr_test_data): + test_alpha = 0.05 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + test_individual_column = "None" + test_differentially_expressed_proteins_df, test_metadata_df = diff_expr_test_data + + power = power_calculation( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + metadata_df=test_metadata_df, + fc_threshold=test_fc_threshold, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + individual_column=test_individual_column, + intensity_name=None, + ) + print(power) + power_int = next(iter(power.values()), None) + assert power_int == 0.09 + +#TODO: The following tests has been used for thesis calculations. Should not be merged to dev branch. 
+ +def test_check_sample_size_calculation_with_libfun_intensity_values(power_test_data_intensity_values): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df=power_test_data_intensity_values, + significant_proteins_df=power_test_data_intensity_values, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_check_sample_size_calculation_protzilla_intensity_values(power_test_data_intensity_values): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_protzilla( + differentially_expressed_proteins_df=power_test_data_intensity_values, + significant_proteins_df=power_test_data_intensity_values, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 1 + +def test_check_sample_size_calculation_with_libfun(power_test_data): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + 
selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_check_sample_size_calculation_protzilla(power_test_data): + test_alpha = 0.05 + test_power = 0.8 + power_test_data_log2 = power_test_data.copy() + power_test_data_log2["Normalised iBAQ"] = np.log2( + power_test_data_log2["Normalised iBAQ"] + ) + fc_threshold = 1 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_protzilla( + differentially_expressed_proteins_df=power_test_data_log2, + significant_proteins_df=power_test_data, + fc_threshold=fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 1 + + +def test_check_sample_size_calculation_protzilla_without_log(power_test_data): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_protzilla_without_log( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_replicate_paper_sample_size_calculation(power_test_data): + alpha = 0.001 + power = 0.95 + fc_threshold = math.log2(2) + biological_variance = 0.233 + technical_variance = 2.298 + number_of_replicates = 2 + + z_alpha = 
round(stats.norm.ppf(1 - alpha / 2), 3) + z_beta = round(stats.norm.ppf(power), 3) + + required_sample_size = ( + 2 + * ((z_alpha + z_beta) / fc_threshold) ** 2 + * ((technical_variance / number_of_replicates) + biological_variance) + ) # Equation (1) in Cairns, David A., et al., 2008, Sample size determination in clinical proteomic profiling experiments using mass spectrometry for class comparison + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + data = { + "Cairns": [44, 31, 62, 44, 14, 10, 19, 14, 5, 4, 7, 5], + "Calculated": [65, 52, 92, 74, 20, 16, 28, 23, 7, 6, 10, 8], + } + df = pd.DataFrame(data) + correlation = df["Cairns"].corr(df["Calculated"]) + print(correlation) + correlationmatrix = df.corr() + print(correlationmatrix) + + return dict(required_sample_size=required_sample_size) +""" \ No newline at end of file diff --git a/tests/protzilla/data_integration/test_plots_data_integration.py b/tests/protzilla/data_integration/test_plots_data_integration.py index 6388ed7c..ce55d3c0 100644 --- a/tests/protzilla/data_integration/test_plots_data_integration.py +++ b/tests/protzilla/data_integration/test_plots_data_integration.py @@ -25,7 +25,7 @@ def test_enrichment_bar_plot_restring(show_figures, helpers): top_terms=10, cutoff=0.05, value="fdr", - gene_sets={"KEGG" : "#E2A46D", "Process" : "#4A536A"}, + gene_sets={"KEGG": "#E2A46D", "Process": "#4A536A"}, ) if show_figures: helpers.open_graph_from_base64(bar_base64[0]) @@ -35,7 +35,7 @@ def test_enrichment_bar_plot_restring(show_figures, helpers): top_terms=10, cutoff=0.05, value="p_value", - gene_sets={"KEGG" : "#E2A46D", "Process" : "#4A536A"}, + gene_sets={"KEGG": "#E2A46D", "Process": "#4A536A"}, ) if show_figures: helpers.open_graph_from_base64(bar_base64[0]) @@ -50,7 +50,7 @@ def test_enrichment_bar_plot(show_figures, helpers, data_folder_tests): top_terms=10, cutoff=0.05, value="p_value", - gene_sets={"Reactome_2013" : "#E2A46D"}, + gene_sets={"Reactome_2013": 
"#E2A46D"}, ) if show_figures: helpers.open_graph_from_base64(bar_base64[0]) @@ -65,10 +65,13 @@ def test_enrichment_bar_plot_wrong_value(data_folder_tests): top_terms=10, cutoff=0.05, value="fdr", - gene_sets={"Reactome_2013" : "#E2A46D"}, + gene_sets={"Reactome_2013": "#E2A46D"}, ) assert "messages" in current_out - assert any(("FDR is not available" in message["msg"]) for message in current_out["messages"]) + assert any( + ("FDR is not available" in message["msg"]) + for message in current_out["messages"] + ) def test_enrichment_bar_plot_empty_df(): @@ -78,10 +81,12 @@ def test_enrichment_bar_plot_empty_df(): top_terms=10, cutoff=0.05, value="p_value", - gene_sets={"Reactome_2013" : "#E2A46D"}, + gene_sets={"Reactome_2013": "#E2A46D"}, ) assert "messages" in current_out - assert any(("No data to plot" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot" in message["msg"]) for message in current_out["messages"] + ) def test_enrichment_bar_plot_no_category(data_folder_tests): @@ -92,7 +97,10 @@ def test_enrichment_bar_plot_no_category(data_folder_tests): input_df=enrichment_df, top_terms=10, cutoff=0.05, value="p_value", gene_sets=[] ) assert "messages" in current_out - assert any(("Please select at least one category" in message["msg"]) for message in current_out["messages"]) + assert any( + ("Please select at least one category" in message["msg"]) + for message in current_out["messages"] + ) def test_enrichment_bar_plot_wrong_df(): @@ -102,10 +110,13 @@ def test_enrichment_bar_plot_wrong_df(): top_terms=10, cutoff=0.05, value="p_value", - gene_sets={"KEGG" : "#E2A46D"}, + gene_sets={"KEGG": "#E2A46D"}, ) assert "messages" in current_out - assert any(("Please choose an enrichment result dataframe" in message["msg"]) for message in current_out["messages"]) + assert any( + ("Please choose an enrichment result dataframe" in message["msg"]) + for message in current_out["messages"] + ) def 
test_enrichment_bar_plot_cutoff(data_folder_tests): @@ -115,11 +126,14 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests): top_terms=10, cutoff=0, value="fdr", - gene_sets={"KEGG" : "#E2A46D", "Process" : "#4A536A"}, + gene_sets={"KEGG": "#E2A46D", "Process": "#4A536A"}, ) assert "messages" in current_out - assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot when applying cutoff" in message["msg"]) + for message in current_out["messages"] + ) enrichment_df = pd.read_csv( data_folder_tests / "Reactome_enrichment_enrichr.csv", sep="\t" @@ -129,10 +143,13 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests): top_terms=10, cutoff=0, value="p-value", - gene_sets={"Reactome_2013" : "#E2A46D"}, + gene_sets={"Reactome_2013": "#E2A46D"}, ) assert "messages" in current_out - assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot when applying cutoff" in message["msg"]) + for message in current_out["messages"] + ) @pytest.mark.parametrize("x_axis_type", ["Gene Sets", "Combined Score"]) diff --git a/tests/protzilla/data_preprocessing/test_normalisation.py b/tests/protzilla/data_preprocessing/test_normalisation.py index 29aa5d6e..86efefab 100644 --- a/tests/protzilla/data_preprocessing/test_normalisation.py +++ b/tests/protzilla/data_preprocessing/test_normalisation.py @@ -304,7 +304,9 @@ def test_normalisation_by_z_score( ): method_outputs = by_z_score(normalisation_df) - fig = by_z_score_plot(normalisation_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10")[0] + fig = by_z_score_plot( + normalisation_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10" + )[0] if show_figures: fig.show() @@ -321,7 +323,9 @@ def test_normalisation_by_median( ): method_outputs = by_median(normalisation_df) - fig = by_median_plot(normalisation_df, method_outputs["protein_df"], 
"Boxplot", "Sample", "log10")[0] + fig = by_median_plot( + normalisation_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10" + )[0] if show_figures: fig.show() @@ -346,7 +350,9 @@ def test_totalsum_normalisation( ): method_outputs = by_totalsum(normalisation_df) - fig = by_totalsum_plot(normalisation_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10")[0] + fig = by_totalsum_plot( + normalisation_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10" + )[0] if show_figures: fig.show() @@ -373,9 +379,13 @@ def test_ref_protein_normalisation( } method_outputs = by_reference_protein(**method_input) - fig = by_reference_protein_plot(normalisation_by_ref_protein_df, method_outputs["protein_df"], "Boxplot", "Sample", "log10")[ - 0 - ] + fig = by_reference_protein_plot( + normalisation_by_ref_protein_df, + method_outputs["protein_df"], + "Boxplot", + "Sample", + "log10", + )[0] if show_figures: fig.show() diff --git a/tests/protzilla/data_preprocessing/test_outlier_detection.py b/tests/protzilla/data_preprocessing/test_outlier_detection.py index 4e4ececb..7aa613c2 100644 --- a/tests/protzilla/data_preprocessing/test_outlier_detection.py +++ b/tests/protzilla/data_preprocessing/test_outlier_detection.py @@ -65,8 +65,7 @@ def outlier_detection_df_with_nan(): def test_outlier_detection_with_isolation_forest( - show_figures, outlier_detection_df, - peptides_df + show_figures, outlier_detection_df, peptides_df ): method_inputs = { "protein_df": outlier_detection_df, @@ -144,7 +143,11 @@ def test_outlier_detection_with_pca(show_figures, outlier_detection_df, peptides "number_of_components": 3, } method_outputs = by_pca(**method_inputs) - fig = by_pca_plot(method_outputs["pca_df"], method_outputs["number_of_components"], method_outputs["explained_variance_ratio"])[0] + fig = by_pca_plot( + method_outputs["pca_df"], + method_outputs["number_of_components"], + method_outputs["explained_variance_ratio"], + )[0] if show_figures: fig.show() diff --git 
a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py index f3900fdd..1de769f5 100644 --- a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py +++ b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py @@ -1,5 +1,4 @@ import pandas as pd -import pytest from protzilla.constants.paths import TEST_DATA_PATH from protzilla.data_preprocessing.peptide_filter import by_pep_value, by_pep_value_plot @@ -57,4 +56,3 @@ def test_pep_filter(show_figures, leftover_peptide_df, filtered_peptides_list): pd.testing.assert_frame_equal(method_outputs["peptide_df"], leftover_peptide_df) assert method_outputs["filtered_peptides"] == filtered_peptides_list - diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py index 457fe145..e7909db8 100644 --- a/tests/protzilla/importing/test_ms_data_import.py +++ b/tests/protzilla/importing/test_ms_data_import.py @@ -218,7 +218,9 @@ def test_max_quant_import_no_protein_ids_column(): assert "protein_df" not in outputs assert "messages" in outputs assert any(message["level"] == logging.ERROR for message in outputs["messages"]) - assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"]) + assert any( + "Majority protein IDs" in message["msg"] for message in outputs["messages"] + ) def test_max_quant_import_invalid_data(): @@ -310,9 +312,7 @@ def test_transform_and_clean(): ["C", "Q11111", np.nan], ] df = pd.DataFrame(data, columns=columns) - outputs = ms_data_import.transform_and_clean( - df, "intensity", map_to_uniprot=False - ) + outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False) expected_df = pd.DataFrame(expected_output, columns=out_col) # we do not care about the genes column, it is deprecated (and replaced by nan) diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py index 50171434..caebccdf 100644 
--- a/tests/protzilla/test_runner.py +++ b/tests/protzilla/test_runner.py @@ -13,7 +13,6 @@ from protzilla.runner import Runner, _serialize_graphs from runner_cli import args_parser -from protzilla.steps import Output, Plots @pytest.fixture @@ -84,34 +83,61 @@ def test_runner_imports( runner.compute_workflow() expected_methods = [ - 'MaxQuantImport', - 'MetadataImport', - 'FilterProteinsBySamplesMissing', - 'FilterSamplesByProteinIntensitiesSum', - 'ImputationByKNN', - 'OutlierDetectionByLocalOutlierFactor', - 'TransformationLog', - 'NormalisationByMedian', - 'PlotProtQuant', - 'DifferentialExpressionTTest', - 'PlotVolcano', - 'EnrichmentAnalysisGOAnalysisWithString', - 'PlotGOEnrichmentBarPlot' + "MaxQuantImport", + "MetadataImport", + "FilterProteinsBySamplesMissing", + "FilterSamplesByProteinIntensitiesSum", + "ImputationByKNN", + "OutlierDetectionByLocalOutlierFactor", + "TransformationLog", + "NormalisationByMedian", + "PlotProtQuant", + "DifferentialExpressionTTest", + "PlotVolcano", + "EnrichmentAnalysisGOAnalysisWithString", + "PlotGOEnrichmentBarPlot", ] expected_method_parameters = [ - call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_mode': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}), - call({'feature_orientation': 'Columns (samples in rows, features in columns)', 'file_path': 'tests/metadata_cut_columns.csv'}), - call({'percentage': 0.5}), - call({'deviation_threshold': 2.0}), - call({'number_of_neighbours': 5}), - call({'number_of_neighbors': 20}), - call({'log_base': 'log2'}), - call({'percentile': 0.5}), - call({'similarity_measure': 'euclidean distance'}), - call({'alpha': 0.05}), - call({'fc_threshold': 1}), - call({'differential_expression_threshold': 1, 'direction': 'both', 'gene_sets_restring': [], 'organism': 9606}), - call({'colors': [], 'cutoff': 0.05, 'gene_sets': ['Process', 'Component', 'Function', 'KEGG'], 'top_terms': 10, 'value': 'p-value'}) + call( + { + "intensity_name": "iBAQ", + "map_to_uniprot": 
False, + "aggregation_mode": "Sum", + "file_path": "tests/proteinGroups_small_cut.txt", + } + ), + call( + { + "feature_orientation": "Columns (samples in rows, features in columns)", + "file_path": "tests/metadata_cut_columns.csv", + } + ), + call({"percentage": 0.5}), + call({"deviation_threshold": 2.0}), + call({"number_of_neighbours": 5}), + call({"number_of_neighbors": 20}), + call({"log_base": "log2"}), + call({"percentile": 0.5}), + call({"similarity_measure": "euclidean distance"}), + call({"alpha": 0.05}), + call({"fc_threshold": 1}), + call( + { + "differential_expression_threshold": 1, + "direction": "both", + "gene_sets_restring": [], + "organism": 9606, + } + ), + call( + { + "colors": [], + "cutoff": 0.05, + "gene_sets": ["Process", "Component", "Function", "KEGG"], + "top_terms": 10, + "value": "p-value", + } + ), ] assert mock_method.call_count == 13 @@ -162,10 +188,21 @@ def test_runner_calculates(monkeypatch, tests_folder_name, ms_data_path, metadat "FilterProteinsBySamplesMissing", ] assert mock_method.call_args_list == [ - call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_method': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}), - call({'feature_orientation': 'Columns (samples in rows, features in columns)', - 'file_path': 'tests/metadata_cut_columns.csv'}), - call({'percentage': 0.5}) + call( + { + "intensity_name": "iBAQ", + "map_to_uniprot": False, + "aggregation_method": "Sum", + "file_path": "tests/proteinGroups_small_cut.txt", + } + ), + call( + { + "feature_orientation": "Columns (samples in rows, features in columns)", + "file_path": "tests/metadata_cut_columns.csv", + } + ), + call({"percentage": 0.5}), ] mock_plot.assert_not_called() @@ -221,7 +258,9 @@ def test_serialize_workflow_graphs(): assert _serialize_graphs(step["graphs"]) == serial_filter_graphs -def test_integration_runner(metadata_path, ms_data_path, tests_folder_name, monkeypatch): +def test_integration_runner( + metadata_path, ms_data_path, 
tests_folder_name, monkeypatch +): name = tests_folder_name + "/test_runner_integration_" + random_string() runner = Runner( **{ diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 171b8df8..673e40e2 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -63,6 +63,10 @@ data_analysis.DimensionReductionUMAP: data_analysis_forms.DimensionReductionUMAPForm, data_analysis.ProteinGraphPeptidesToIsoform: data_analysis_forms.ProteinGraphPeptidesToIsoformForm, data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm, + data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm, + data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm, + data_analysis.PowerAnalysisSampleSizeCalculationForAllProteins: data_analysis_forms.PowerAnalysisSampleSizeCalculationForAllProteinsForm, + data_analysis.PowerAnalysisPowerCalculationForAllProteins: data_analysis_forms.PowerAnalysisPowerCalculationForAllProteinsForm, data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm, data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 71ae355e..6031d383 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1,16 +1,11 @@ -import logging from enum import Enum, StrEnum -from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.methods.data_analysis import ( DataAnalysisStep, - DifferentialExpressionLinearModel, DifferentialExpressionTTest, DimensionReductionUMAP, - DataAnalysisStep, PTMsPerSample, SelectPeptidesForProtein, - DifferentialExpressionMannWhitneyOnPTM, ) from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.run import Run @@ -25,7 +20,6 @@ 
CustomFloatField, CustomMultipleChoiceField, CustomNumberField, - CustomBooleanField, ) @@ -169,7 +163,11 @@ class DifferentialExpressionANOVAForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -256,7 +254,11 @@ class DifferentialExpressionLinearModelForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") group1 = CustomChoiceField(choices=[], label="Group 1") @@ -293,16 +295,18 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm): is_dynamic = True - protein_df = CustomChoiceField( - choices=[], label="Step to use protein data from" - ) + protein_df = CustomChoiceField(choices=[], label="Step to use protein data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) p_value_calculation_method = CustomChoiceField( choices=PValueCalculationMethod, @@ -314,9 +318,13 @@ class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm): group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - 
self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields[ + "protein_df" + ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -351,7 +359,11 @@ class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) p_value_calculation_method = CustomChoiceField( choices=PValueCalculationMethod, @@ -367,7 +379,9 @@ def fill_form(self, run: Run) -> None: run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -395,16 +409,18 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm): is_dynamic = True - protein_df = CustomChoiceField( - choices=[], label="Step to use protein data from" - ) + protein_df = CustomChoiceField(choices=[], label="Step to use protein data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + 
initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -413,9 +429,13 @@ class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields[ + "protein_df" + ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -425,16 +445,18 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionKruskalWallisOnPTMForm(MethodForm): is_dynamic = True - ptm_df = CustomChoiceField( - choices=[], label="Step to use ptm data from" - ) + ptm_df = CustomChoiceField(choices=[], label="Step to use ptm data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -446,7 +468,9 @@ def fill_form(self, run: Run) -> None: self.fields["ptm_df"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", 
self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -471,7 +495,8 @@ class PlotVolcanoForm(MethodForm): def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( - Step, ["corrected_p_values_df", "log2_fold_change_df"], + Step, + ["corrected_p_values_df", "log2_fold_change_df"], ) ) @@ -491,7 +516,9 @@ def fill_form(self, run: Run) -> None: if step_output is not None: items_of_interest = step_output["PTM"].unique() - self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest) + self.fields["items_of_interest"].choices = fill_helper.to_choices( + items_of_interest + ) class PlotScatterPlotForm(MethodForm): @@ -809,7 +836,7 @@ class ClassificationRandomForestForm(MethodForm): # TODO: Workflow_meta line 1763 train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1770 @@ -897,7 +924,7 @@ class ClassificationSVMForm(MethodForm): ) train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1973 @@ -1014,7 +1041,7 @@ class DimensionReductionUMAPForm(MethodForm): ) n_neighbors = CustomNumberField( label="The size of local neighborhood (in terms of number of neighboring sample points) used for manifold " - "approximation", + "approximation", 
min_value=2, max_value=100, step_size=1, @@ -1055,7 +1082,7 @@ class ProteinGraphPeptidesToIsoformForm(MethodForm): k = CustomNumberField(label="k-mer length", min_value=1, step_size=1, initial=5) allowed_mismatches = CustomNumberField( label="Number of allowed mismatched amino acids per peptide. For many allowed mismatches, this can take a " - "long time.", + "long time.", min_value=0, step_size=1, initial=2, @@ -1148,13 +1175,17 @@ def fill_form(self, run: Run) -> None: selected_auto_select = self.data.get("auto_select") - choices = fill_helper.to_choices([] if selected_auto_select else ["all proteins"]) - choices.extend(fill_helper.get_choices( - run, "significant_proteins_df", DataAnalysisStep - )) + choices = fill_helper.to_choices( + [] if selected_auto_select else ["all proteins"] + ) + choices.extend( + fill_helper.get_choices(run, "significant_proteins_df", DataAnalysisStep) + ) self.fields["protein_list"].choices = choices - chosen_list = self.data.get("protein_list", self.fields["protein_list"].choices[0][0]) + chosen_list = self.data.get( + "protein_list", self.fields["protein_list"].choices[0][0] + ) if not selected_auto_select: self.toggle_visibility("sort_proteins", True) self.toggle_visibility("protein_ids", True) @@ -1168,7 +1199,9 @@ def fill_form(self, run: Run) -> None: self.fields["protein_ids"].choices = fill_helper.to_choices( run.steps.get_step_output( DataAnalysisStep, "significant_proteins_df", chosen_list - ).sort_values(by="corrected_p_value")["Protein ID"].unique() + ) + .sort_values(by="corrected_p_value")["Protein ID"] + .unique() ) else: self.fields["protein_ids"].choices = fill_helper.to_choices( @@ -1188,9 +1221,7 @@ class PTMsPerSampleForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["peptide_df"].choices = fill_helper.get_choices( - run, "peptide_df" - ) + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") single_protein_peptides = run.steps.get_instance_identifiers( 
SelectPeptidesForProtein, "peptide_df" @@ -1212,4 +1243,340 @@ def fill_form(self, run: Run) -> None: SelectPeptidesForProtein, "peptide_df" ) if single_protein_peptides: - self.fields["peptide_df"].initial = single_protein_peptides[0] \ No newline at end of file + self.fields["peptide_df"].initial = single_protein_peptides[0] + + +class PowerAnalysisPowerCalculationForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + selected_protein_group = CustomChoiceField( + choices=[], + label="Protein group to calculate power for", + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + significant_proteins_only = self.data.get( + 
"significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + + +class PowerAnalysisSampleSizeCalculationForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + power = CustomFloatField( + label="Power", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.8, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + selected_protein_group = CustomChoiceField( + choices=[], + label="Protein group to calculate sample size for", + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["individual_column"].choices = [ + ("None", 
"None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + significant_proteins_only = self.data.get( + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + + +class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. 
by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + power = CustomFloatField( + label="Power", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.8, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + select_all_proteins = CustomBooleanField( + label="Select all proteins", + initial=True, + ) + selected_protein_groups = CustomMultipleChoiceField( + choices=[], + label="Protein groups to calculate sample size for", + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + + significant_proteins_only = self.data.get( + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + 
self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + if not self.data: + select_all_proteins = True + else: + if "select_all_proteins" in self.data: + select_all_proteins = True + else: + select_all_proteins = False + + if select_all_proteins == False: + self.toggle_visibility("selected_protein_groups", True) + else: + self.toggle_visibility("selected_protein_groups", False) + + +class PowerAnalysisPowerCalculationForAllProteinsForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + select_all_proteins = CustomBooleanField( + label="Select all proteins", + initial=True, + ) + selected_protein_groups = CustomMultipleChoiceField( + choices=[], + label="Protein groups to calculate power for", + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + self.fields["individual_column"].choices = [ + ("None", "None") + ] + 
fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + + significant_proteins_only = self.data.get( + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + if not self.data: + select_all_proteins = True + else: + if "select_all_proteins" in self.data: + select_all_proteins = True + else: + select_all_proteins = False + + if select_all_proteins == False: + self.toggle_visibility("selected_protein_groups", True) + else: + self.toggle_visibility("selected_protein_groups", False) diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index eef83be6..adffee53 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -34,3 +34,7 @@ def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str run.steps.metadata_df.columns != "Sample" ].unique() ) + + +def get_choices_for_metadata_all_columns(run: Run) -> list[tuple[str, str]]: + return to_choices(run.steps.metadata_df.columns) diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css index b75b144a..d19306bc 100644 --- a/ui/runs/static/runs/style.css +++ b/ui/runs/static/runs/style.css @@ -16,7 +16,6 @@ html, body { #content { order: 1; padding: 20px; - padding-bottom: 0px; flex-grow: 1; } @@ -29,7 +28,7 @@ html, body { order: 2; flex-basis: 300px; max-width: 400px; - min-height: calc(100vh - 60px); + min-height: 100vh; flex-grow: 1; 
transition: 100ms; } @@ -77,6 +76,12 @@ html, body { width: 800px; } +.display-output-textarea { + display: flex; + width: 100%; + height: auto; + resize: none; +} .header { position: sticky; top: 80px; diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index f6b1ddec..340acea0 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -149,6 +149,14 @@

{{ display_name }}

View Protein Graph {% endif %} + {% if display_output %} +
+ + +
+ {% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index e21b7b2d..1c4d0ae9 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -26,10 +26,10 @@ from protzilla.steps import Step from protzilla.utilities.utilities import ( check_is_path, + clean_uniprot_id, format_trace, get_memory_usage, name_to_title, - clean_uniprot_id, unique_justseen, ) from protzilla.workflow import get_available_workflow_names @@ -40,13 +40,10 @@ make_sidebar, ) from ui.runs.views_helper import display_message, display_messages - -from .form_mapping import ( - get_filled_form_by_method, - get_filled_form_by_request, -) from ui.settings.views import load_settings +from .form_mapping import get_filled_form_by_method, get_filled_form_by_request + active_runs: dict[str, Run] = {} @@ -70,8 +67,8 @@ def detail(request: HttpRequest, run_name: str): active_runs[run_name] = Run(run_name) run: Run = active_runs[run_name] - request.session['last_view'] = "runs:detail" - request.session['run_name'] = run_name + request.session["last_view"] = "runs:detail" + request.session["run_name"] = run_name if request.POST: method_form = get_filled_form_by_request( @@ -124,6 +121,14 @@ def detail(request: HttpRequest, run_name: str): and Path(run.current_outputs["graph_path"]).exists() ) + display_output_form = ( + run.steps.current_step.display_output is not None + and not run.current_step.display_output.is_empty() + ) + display_output_text = next( + iter(run.current_step.display_output.display_output.values()), None + ) + return render( request, "runs/details.html", @@ -142,13 +147,15 @@ def detail(request: HttpRequest, run_name: str): type(run.current_step).__name__, ), name_field=make_name_field( - run.current_step.calculation_status!="incomplete", run, False + run.current_step.calculation_status != "incomplete", run, False ), # TODO end_of_run current_plots=current_plots, - results_exist=run.current_step.calculation_status in ["complete","outdated"], - allow_calculate= 
run.steps.current_step_index <= run.steps.failed_step_index or run.steps.failed_step_index == -1, + results_exist=run.current_step.calculation_status + in ["complete", "outdated"], + allow_calculate=run.steps.current_step_index <= run.steps.failed_step_index + or run.steps.failed_step_index == -1, show_back=run.steps.current_step_index > 0, - show_plot_button=run.current_step.calculation_status!="incomplete", + show_plot_button=run.current_step.calculation_status != "incomplete", # TODO include plot exists and plot parameters match current plot or remove this and replace with results exist sidebar=make_sidebar(request, run), last_step=run.steps.current_step_index == len(run.steps.all_steps) - 1, @@ -159,6 +166,9 @@ def detail(request: HttpRequest, run_name: str): description=description, method_form=method_form, is_form_dynamic=method_form.is_dynamic, + plot_form=plot_form, + display_output=display_output_form, + display_output_result=display_output_text, current_step_index=run.steps.current_step_index, ), ) @@ -175,7 +185,7 @@ def index(request: HttpRequest, index_error: bool = False): :rtype: HttpResponse """ - request.session['last_view'] = "runs:index" + request.session["last_view"] = "runs:index" return render( request, @@ -672,15 +682,14 @@ def download_table(request, run_name, index, key): return FileResponse(csv_bytes, content_type="text/csv") -def update_form(request: HttpRequest, run_name:str): + +def update_form(request: HttpRequest, run_name: str): if run_name not in active_runs: active_runs[run_name] = Run(run_name) run: Run = active_runs[run_name] count = 0 - if (run.current_step.calculation_status == "complete"): + if run.current_step.calculation_status == "complete": count = run.step_set_outdated() - method_form = get_filled_form_by_request( - request, run - ) + method_form = get_filled_form_by_request(request, run) method_form.update_form(run) - return(JsonResponse({"status":run.current_step.calculation_status, "count":count})) \ No newline at end 
of file + return JsonResponse({"status": run.current_step.calculation_status, "count": count}) diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index 52d754b7..05ce3fb8 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -58,6 +58,18 @@ steps: alpha: 0.05 inputs: { } type: DifferentialExpressionTTest + - form_inputs: { } + inputs: { } + type: PowerAnalysisSampleSizeCalculation + - form_inputs: { } + inputs: { } + type: PowerAnalysisPowerCalculation + - form_inputs: {} + inputs: { } + type: PowerAnalysisSampleSizeCalculationForAllProteins + - form_inputs: { } + inputs: { } + type: PowerAnalysisPowerCalculationForAllProteins - form_inputs: fc_threshold: 1 inputs: { }