From 73e13a2c7bd76f2cfc85387375d273ae33d93476 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 5 Jun 2024 17:12:18 +0200 Subject: [PATCH 01/36] added sample size calculation in methods\data_analysis.py and forms\data_analysis.py --- protzilla/data_analysis/power_analysis.py | 31 ++++++++++++ protzilla/methods/data_analysis.py | 49 +++++++++++++++++++ ui/runs/form_mapping.py | 2 + ui/runs/forms/data_analysis.py | 59 ++++++++++++++++++++++- user_data/workflows/standard.yaml | 6 +++ 5 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 protzilla/data_analysis/power_analysis.py diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py new file mode 100644 index 00000000..5246ccad --- /dev/null +++ b/protzilla/data_analysis/power_analysis.py @@ -0,0 +1,31 @@ +import logging + +import numpy as np +import pandas as pd +from scipy import stats +from statsmodels.stats.power import TTestIndPower + + +def sample_size_calculation( + significant_proteins_df: pd.DataFrame, + alpha: float, + power: float, + group1: str, + group2: str, + intensity_name: str = None +) -> pd.DataFrame: + """ + Function to calculate the required sample size. + + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param alpha: The significance level. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + power_analysis_results = [] + + + diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 4dca2149..35d2c331 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -15,6 +15,7 @@ prot_quant_plot, scatter_plot, ) +from protzilla.data_analysis.power_analysis import sample_size_calculation from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager @@ -599,3 +600,51 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["peptide_df"] = steps.peptide_df inputs["isoform_df"] = steps.isoform_df return inputs + +class PowerAnalysisPowerCalculation(DataAnalysisStep): + display_name = "Power Calculation" + operation = "Power Analysis" + method_description = "post-hoc Power Calculation" + + input_keys = [ + "significant_proteins_df" + ] + +class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): + display_name = "Sample Size Calculation" + operation = "Power Analysis" + method_description = "(apriori) Sample Size Calculation" + + input_keys = [ + "significant_proteins_df", + "alpha", + "group1", + "group2", + "effect_size", + "power", + "intensity_name", + "log2_fc", + ] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return sample_size_calculation(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + inputs["significant_"] + inputs["effect_size"] = step.inputs["effect_size"] + inputs["power"] = step.inputs["power"] + inputs["intensity_name"] = step.inputs["intensity_name"] + inputs["log2_fc"] = steps.get_step_output( + Step, "log2_fold_change_df", inputs["input_dict"] + ) + return inputs diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 13431322..5f574427 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -58,6 +58,8 @@ data_analysis.DimensionReductionUMAP: data_analysis_forms.DimensionReductionUMAPForm, data_analysis.ProteinGraphPeptidesToIsoform: data_analysis_forms.ProteinGraphPeptidesToIsoformForm, data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm, + data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm, + data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 353d7cff..3b4508de 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -140,7 +140,6 @@ class DimensionReductionMetric(Enum): cosine = "cosine" havensine = "havensine" - class DifferentialExpressionANOVAForm(MethodForm): is_dynamic = True @@ -881,3 +880,61 @@ class ProteinGraphVariationGraphForm(MethodForm): label="Protein ID", initial="Enter the Uniprot-ID of the protein" ) # TODO: workflow_meta line 2291 - 2295 + +class PowerAnalysisPowerCalculationForm(MethodForm): + t_test_results = CustomChoiceField( + choices=[], + label="T-test results", + ) + #fill alpha dynamic from t-test + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value = 0, + max_value = 1, + step_size = 0.05, + initial = 0.05, + ) + def fill_form(self, run: Run) -> None: + self.fields["t_test_results"].choices = get_t_test_results(run) + +class PowerAnalysisSampleSizeCalculationForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + effect_size = CustomNumberField( + label="Effect size", min_value=0, initial=0.5 + ) + #fill alpha dynamic from t-test + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value = 0, + max_value = 1, + step_size = 0.05, + initial = 0.05, + ) + power = CustomFloatField( + label="Power", + min_value = 0, + max_value = 1, + step_size = 0.05, + initial = 0.8, + ) + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest | DifferentialExpressionLinearModel, + "differentially_expressed_proteins_df", + ) + ) + + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index d1608481..b7bebac1 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -57,6 +57,12 @@ steps: alpha: 0.05 inputs: { } type: DifferentialExpressionTTest + - form_inputs: { } + inputs: { } + type: PowerAnalysisPowerCalculation + - form_inputs: { } + inputs: { } + type: PowerAnalysisSampleSizeCalculation - form_inputs: fc_threshold: 1 inputs: { } From f133c87f38ebc83ae80e8a75a3f5968be1bee441 Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 17 Jun 2024 11:34:22 +0200 Subject: [PATCH 02/36] enabled possibility to choose one protein for calculation dependent on significance from t-test --- protzilla/methods/data_analysis.py | 16 +++++------- ui/runs/forms/data_analysis.py | 40 +++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 35d2c331..0ae399b5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -613,16 +613,16 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): display_name = "Sample Size Calculation" operation = "Power Analysis" - method_description = "(apriori) Sample Size Calculation" + method_description = "Calculates sample size for protein groups" input_keys = [ - "significant_proteins_df", + "corrected_p_values_df", + "selected_protein_group", + "significant_proteins_only" "alpha", "group1", "group2", - "effect_size", "power", - "intensity_name", "log2_fc", ] output_keys = [] @@ -631,8 +631,8 @@ def method(self, inputs: dict) -> dict: return sample_size_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["significant_proteins_df"] = steps.get_step_output( - Step, "significant_proteins_df", inputs["input_dict"] + inputs["corrected_p_values_df"] = steps.get_step_output( + Step, "corrected_p_values_df", inputs["input_dict"] ) step = next( s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] @@ -640,10 +640,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] - inputs["significant_"] - inputs["effect_size"] = step.inputs["effect_size"] - inputs["power"] = step.inputs["power"] - inputs["intensity_name"] = step.inputs["intensity_name"] inputs["log2_fc"] = steps.get_step_output( Step, "log2_fold_change_df", inputs["input_dict"] ) diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 3b4508de..7d96efc2 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -33,7 +33,7 @@ class MultipleTestingCorrectionMethod(Enum): bonferroni = "Bonferroni" -class YesNo(Enum): +class YesNo(StrEnum): yes = "Yes" no = "No" @@ -904,10 +904,6 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): choices=[], label="Input data dict (generated e.g. by t-Test)", ) - effect_size = CustomNumberField( - label="Effect size", min_value=0, initial=0.5 - ) - #fill alpha dynamic from t-test alpha = CustomFloatField( label="Error rate (alpha)", min_value = 0, @@ -922,11 +918,20 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): step_size = 0.05, initial = 0.8, ) + selected_protein_group = CustomChoiceField( + choices=[], + label="Protein group to calculate sample size for", + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial = YesNo.yes, + ) def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( - DifferentialExpressionTTest | DifferentialExpressionLinearModel, + DifferentialExpressionTTest, "differentially_expressed_proteins_df", ) ) @@ -935,6 +940,29 @@ def fill_form(self, run: Run) -> None: "input_dict", self.fields["input_dict"].choices[0][0] ) + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + significant_proteins_only = self.data.get( + "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0] + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + self.fields["alpha"].initial = run.steps.get_step_output( Step, "corrected_alpha", input_dict_instance_id ) From 49c7f0e87d2c7ef6a23a094cb9520ae270980566 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 18 Jun 2024 22:24:25 +0200 Subject: [PATCH 03/36] fixed errors with missing inputs --- protzilla/methods/data_analysis.py | 19 +++++++++++++------ ui/runs/forms/data_analysis.py | 12 ++++++++---- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 0ae399b5..4e66e4a4 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -616,14 +616,16 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): method_description = "Calculates sample size for protein groups" input_keys = [ - "corrected_p_values_df", + "differentially_expressed_proteins_df", + "metadata_df", "selected_protein_group", - "significant_proteins_only" + "significant_proteins_df", + "significant_proteins_only", + "fc_threshold", "alpha", "group1", "group2", "power", - "log2_fc", ] output_keys = [] @@ -631,16 +633,21 @@ def method(self, inputs: dict) -> dict: return sample_size_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["corrected_p_values_df"] = steps.get_step_output( - Step, "corrected_p_values_df", inputs["input_dict"] + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] ) step = next( s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + + inputs["metadata_df"] = steps.metadata_df inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] - inputs["log2_fc"] = steps.get_step_output( + inputs["fc_threshold"] = steps.get_step_output( Step, "log2_fold_change_df", inputs["input_dict"] ) return inputs diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 7d96efc2..736b817f 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -918,15 +918,19 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): step_size = 0.05, initial = 0.8, ) - selected_protein_group = CustomChoiceField( - choices=[], - label="Protein group to calculate sample size for", + fc_threshold = CustomNumberField( + label="Log2 fold change threshold", min_value=0, initial=1 ) significant_proteins_only = CustomChoiceField( choices=YesNo, label="Select only significant proteins", - initial = YesNo.yes, + initial=YesNo.yes, ) + selected_protein_group = CustomChoiceField( + choices=[], + label="Protein group to calculate sample size for", + ) + def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( From 6d8c9a8eda8d6a0470bc117baa24a18916708c43 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 18 Jun 2024 22:25:10 +0200 Subject: [PATCH 04/36] added variance calculation and testing function and edited sample size calculation function --- protzilla/data_analysis/power_analysis.py | 72 ++++++++++++++++++- .../data_analysis/test_power_analysis.py | 26 +++++++ 2 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 tests/protzilla/data_analysis/test_power_analysis.py diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 5246ccad..88a91227 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -6,16 +6,64 @@ from statsmodels.stats.power import TTestIndPower +def variance_protein_group_calculation( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + protein_id: str, + group1: str, + group2: str, + intensity_name: str = None, +) -> float: + """ + Function to calculate the variance of a protein group for the two classes and return the maximum variance. + + :param intensity_df: The dataframe containing the protein group intensities. + :param metadata_df: The dataframe containing the metadata. + :param protein_id: The protein ID. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The variance of the protein group. + """ + + if intensity_name is None: + intensity_name = "Intensity" + + protein_group = intensity_df[intensity_df["Protein ID"] == protein_id] + + protein_group = pd.merge( + left=protein_group, + right=metadata_df[["Sample", "Group"]], + on="Sample", + copy=False, + ) + + + group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values + group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values + + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + max_variance = max(variance_group1, variance_group2) + + return max_variance + def sample_size_calculation( + differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + metadata_df: pd.DataFrame, + fc_threshold: float, alpha: float, power: float, group1: str, group2: str, + selected_protein_group: str, intensity_name: str = None ) -> pd.DataFrame: """ - Function to calculate the required sample size. + Function to calculate the required sample size for each significant protein to achieve the required power . :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. :param alpha: The significance level. @@ -25,7 +73,27 @@ def sample_size_calculation( :param intensity_name: The name of the column containing the protein group intensities. :return: The required sample size. """ - power_analysis_results = [] + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + protein_group = selected_protein_group + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + + variance_protein_group = variance_protein_group_calculation( + intensity_df=differentially_expressed_proteins_df, + metadata_df=metadata_df, + protein_id=protein_group, + group1=group1, + group2=group2, + intensity_name=intensity_name, + ) + + required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group) + + print(required_sample_size) + + return required_sample_size diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py new file mode 100644 index 00000000..4d9bbbde --- /dev/null +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -0,0 +1,26 @@ +import numpy as np +import pandas as pd +import pytest + +from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation +from tests.protzilla.data_analysis.test_differential_expression import diff_expr_test_data + +def test_variance_protein_group_calculation( + diff_expr_test_data +): + intensity_df, metadata_df = diff_expr_test_data + + protein_id = "Protein1" + group1 = "Group1" + group2 = "Group2" + + variance = variance_protein_group_calculation( + intensity_df, metadata_df, protein_id, group1, group2 + ) + + assert variance == 4.0 + print(variance) + + + + From 0b95cf09db052f8b5c8104667d7baa595b605e51 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 19 Jun 2024 19:22:07 +0200 Subject: [PATCH 05/36] fixed some errors --- protzilla/data_analysis/power_analysis.py | 18 +++++------------- protzilla/methods/data_analysis.py | 7 +++---- ui/runs/forms/data_analysis.py | 2 +- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 88a91227..5e170758 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import math from scipy import stats from statsmodels.stats.power import TTestIndPower @@ -27,18 +28,9 @@ def variance_protein_group_calculation( """ if intensity_name is None: - intensity_name = "Intensity" - + intensity_name = "Normalised iBAQ" protein_group = intensity_df[intensity_df["Protein ID"] == protein_id] - protein_group = pd.merge( - left=protein_group, - right=metadata_df[["Sample", "Group"]], - on="Sample", - copy=False, - ) - - group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values @@ -61,7 +53,7 @@ def sample_size_calculation( group2: str, selected_protein_group: str, intensity_name: str = None -) -> pd.DataFrame: +) -> float: """ Function to calculate the required sample size for each significant protein to achieve the required power . @@ -90,10 +82,10 @@ def sample_size_calculation( ) required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group) - + required_sample_size = math.ceil(required_sample_size) print(required_sample_size) - return required_sample_size + return dict(required_sample_size=required_sample_size) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 4e66e4a4..af3db103 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -627,7 +627,9 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): "group2", "power", ] - output_keys = [] + output_keys = [ + "required_sample_size", + ] def method(self, inputs: dict) -> dict: return sample_size_calculation(**inputs) @@ -647,7 +649,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] - inputs["fc_threshold"] = steps.get_step_output( - Step, "log2_fold_change_df", inputs["input_dict"] - ) return inputs diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 736b817f..650084e0 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -918,7 +918,7 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): step_size = 0.05, initial = 0.8, ) - fc_threshold = CustomNumberField( + fc_threshold = CustomFloatField( label="Log2 fold change threshold", min_value=0, initial=1 ) significant_proteins_only = CustomChoiceField( From 22c293d007a313739a7881e595ba032097421608 Mon Sep 17 00:00:00 2001 From: selenabr Date: Thu, 20 Jun 2024 17:33:18 +0200 Subject: [PATCH 06/36] output field for result --- protzilla/methods/data_analysis.py | 4 ++++ protzilla/steps.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index af3db103..1b06e98a 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -650,3 +650,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] return inputs + + def handle_outputs(self, outputs: dict): + super().handle_outputs(outputs) + self.display_output["required_sample_size"] = outputs["required_sample_size"] \ No newline at end of file diff --git a/protzilla/steps.py b/protzilla/steps.py index 4122c451..4673e961 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None): self.messages: Messages = Messages([]) self.output: Output = Output() self.plots: Plots = Plots() + self.display_output: DisplayOutput = DisplayOutput() self.instance_identifier = instance_identifier if self.instance_identifier is None: @@ -306,6 +307,19 @@ def export(self, format_): exports.append(BytesIO(base64.b64decode(plot))) return exports +class DisplayOutput: + + def __init__(self, display_output: dict = None): + if display_output is None: + display_output = [] + self.display_output = display_output + def __iter__(self): + return iter(self.display_output) + def __repr__(self): + return f"DisplayOutput: {self.display_output}" + def __contains__(self, key): + return key in self.display_output + class StepManager: def __repr__(self): From b22b6e742f65a19b225222f7f29381e1f077b3c4 Mon Sep 17 00:00:00 2001 From: selenabr Date: Fri, 21 Jun 2024 20:24:11 +0200 Subject: [PATCH 07/36] further implementation of output field for result --- protzilla/methods/data_analysis.py | 3 +-- protzilla/steps.py | 10 +++++++++- ui/runs/templates/runs/details.html | 7 +++++++ ui/runs/views.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index cc49995c..995622f9 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -21,7 +21,7 @@ from protzilla.data_analysis.power_analysis import sample_size_calculation from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog -from protzilla.steps import Plots, Step, StepManager +from protzilla.steps import Plots, Step, StepManager, DisplayOutput class DataAnalysisStep(Step): @@ -672,7 +672,6 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): output_keys = [ "required_sample_size", ] - def method(self, inputs: dict) -> dict: return sample_size_calculation(**inputs) diff --git a/protzilla/steps.py b/protzilla/steps.py index 447f17b7..95c596d2 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -315,7 +315,7 @@ class DisplayOutput: def __init__(self, display_output: dict = None): if display_output is None: - display_output = [] + display_output = {} self.display_output = display_output def __iter__(self): return iter(self.display_output) @@ -323,6 +323,14 @@ def __repr__(self): return f"DisplayOutput: {self.display_output}" def __contains__(self, key): return key in self.display_output + def __getitem__(self, key): + return self.display_output[key] + def __setitem__(self, key, value): + self.display_output[key] = value + def is_empty(self) -> bool: + return len(self.display_output) == 0 + + class StepManager: diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 5809d356..a930f486 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -211,6 +211,13 @@

{{ display_name }}

{% endif %} {% endif %} + {% if display_output %} +
+ + +
+ {% endif %} {% else %}

You are at the end of the run. Go back to add more steps of the same section, or add steps of diff --git a/ui/runs/views.py b/ui/runs/views.py index 87aa685d..2c3dc241 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -121,6 +121,12 @@ def detail(request: HttpRequest, run_name: str): and Path(run.current_outputs["graph_path"]).exists() ) + display_output_form = ( + run.steps.current_step.display_output is not None + and not run.current_step.display_output.is_empty() + ) + display_output_text = f"{run.current_step.display_output}" + return render( request, "runs/details.html", @@ -156,6 +162,8 @@ def detail(request: HttpRequest, run_name: str): method_form=method_form, is_form_dynamic=method_form.is_dynamic, plot_form=plot_form, + display_output=display_output_form, + display_output_result=display_output_text, ), ) From c6a2f3bc9c4e4abc6dc7fb2c8caab30f91da3f4f Mon Sep 17 00:00:00 2001 From: selenabr Date: Sun, 23 Jun 2024 02:43:34 +0200 Subject: [PATCH 08/36] display display_output in output field --- protzilla/methods/data_analysis.py | 2 +- ui/runs/templates/runs/details.html | 4 ++-- ui/runs/views.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 995622f9..238ba01a 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -694,4 +694,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: def handle_outputs(self, outputs: dict): super().handle_outputs(outputs) - self.display_output["required_sample_size"] = outputs["required_sample_size"] \ No newline at end of file + self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}" diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index a930f486..361875f7 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -213,8 +213,8 @@

{{ display_name }}

{% endif %} {% if display_output %}
- -
{% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index 2c3dc241..c7fa965b 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str): run.steps.current_step.display_output is not None and not run.current_step.display_output.is_empty() ) - display_output_text = f"{run.current_step.display_output}" + display_output_text = next(iter(run.current_step.display_output.display_output.values())) return render( request, From 032286c946f25d44f70ff66bd85bfb88a44f1e24 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 25 Jun 2024 13:25:53 +0200 Subject: [PATCH 09/36] display_output field displayed in the same size and position as the other fields --- ui/runs/static/runs/style.css | 7 +++++++ ui/runs/templates/runs/details.html | 15 ++++++++------- ui/runs/views.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css index 63d66a0b..477e0f11 100644 --- a/ui/runs/static/runs/style.css +++ b/ui/runs/static/runs/style.css @@ -75,3 +75,10 @@ html, body { #gsea_enrichment_plot_img { width: 800px; } + +.display-output-textarea { + display: flex; + width: 100%; + height: auto; + resize: none; +} \ No newline at end of file diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 361875f7..84ec3cfd 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -209,13 +209,14 @@

{{ display_name }}

{% endif %} - - {% endif %} - {% if display_output %} -
- - + {% if display_output %} +
+ + +
+ {% endif %}
{% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index c7fa965b..4de29692 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str): run.steps.current_step.display_output is not None and not run.current_step.display_output.is_empty() ) - display_output_text = next(iter(run.current_step.display_output.display_output.values())) + display_output_text = next(iter(run.current_step.display_output.display_output.values()), None) return render( request, From e90fab3447ea88a42c5b64e4f34718861800acef Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 25 Jun 2024 21:53:03 +0200 Subject: [PATCH 10/36] test function for sample_size_calculation --- protzilla/data_analysis/power_analysis.py | 4 - protzilla/methods/data_analysis.py | 2 - .../data_analysis/test_power_analysis.py | 76 +++++++++++++++++-- ui/runs/templates/runs/details.html | 2 +- 4 files changed, 71 insertions(+), 13 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 5e170758..c05b7b67 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -9,7 +9,6 @@ def variance_protein_group_calculation( intensity_df: pd.DataFrame, - metadata_df: pd.DataFrame, protein_id: str, group1: str, group2: str, @@ -19,7 +18,6 @@ def variance_protein_group_calculation( Function to calculate the variance of a protein group for the two classes and return the maximum variance. :param intensity_df: The dataframe containing the protein group intensities. - :param metadata_df: The dataframe containing the metadata. :param protein_id: The protein ID. :param group1: The name of the first group. :param group2: The name of the second group. @@ -45,7 +43,6 @@ def sample_size_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, significant_proteins_only: bool, - metadata_df: pd.DataFrame, fc_threshold: float, alpha: float, power: float, @@ -74,7 +71,6 @@ def sample_size_calculation( variance_protein_group = variance_protein_group_calculation( intensity_df=differentially_expressed_proteins_df, - metadata_df=metadata_df, protein_id=protein_group, group1=group1, group2=group2, diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 238ba01a..728faa46 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -659,7 +659,6 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): input_keys = [ "differentially_expressed_proteins_df", - "metadata_df", "selected_protein_group", "significant_proteins_df", "significant_proteins_only", @@ -686,7 +685,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: Step, "significant_proteins_df", inputs["input_dict"] ) - inputs["metadata_df"] = steps.metadata_df inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index 4d9bbbde..ec0bc1b8 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -2,24 +2,88 @@ import pandas as pd import pytest + from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation -from tests.protzilla.data_analysis.test_differential_expression import diff_expr_test_data + + +@pytest.fixture +def power_test_data(): + test_differentially_expressed_proteins_list = ( + ["Sample1", "Protein1", "Gene1", 20, "Group1"], + ["Sample1", "Protein2", "Gene1", 16, "Group1"], + ["Sample1", "Protein3", "Gene1", 1, "Group1"], + ["Sample1", "Protein4", "Gene1", 14, "Group1"], + ["Sample2", "Protein1", "Gene1", 20, "Group1"], + ["Sample2", "Protein2", "Gene1", 15, "Group1"], + ["Sample2", "Protein3", "Gene1", 2, "Group1"], + ["Sample2", "Protein4", "Gene1", 15, "Group1"], + ["Sample3", "Protein1", "Gene1", 22, "Group1"], + ["Sample3", "Protein2", "Gene1", 14, "Group1"], + ["Sample3", "Protein3", "Gene1", 3, "Group1"], + ["Sample3", "Protein4", "Gene1", 16, "Group1"], + ["Sample4", "Protein1", "Gene1", 8, "Group2"], + ["Sample4", "Protein2", "Gene1", 15, "Group2"], + ["Sample4", "Protein3", "Gene1", 1, "Group2"], + ["Sample4", "Protein4", "Gene1", 9, "Group2"], + ["Sample5", "Protein1", "Gene1", 10, "Group2"], + ["Sample5", "Protein2", "Gene1", 14, "Group2"], + ["Sample5", "Protein3", "Gene1", 2, "Group2"], + ["Sample5", "Protein4", "Gene1", 10, "Group2"], + ["Sample6", "Protein1", "Gene1", 12, "Group2"], + ["Sample6", "Protein2", "Gene1", 13, "Group2"], + ["Sample6", "Protein3", "Gene1", 3, "Group2"], + ["Sample6", "Protein4", "Gene1", 11, "Group2"], + ) + + test_differentially_expressed_proteins_df = pd.DataFrame( + data=test_differentially_expressed_proteins_list, + columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"], + ) + return test_differentially_expressed_proteins_df + def test_variance_protein_group_calculation( - diff_expr_test_data + power_test_data ): - intensity_df, metadata_df = diff_expr_test_data + intensity_df = power_test_data protein_id = "Protein1" group1 = "Group1" group2 = "Group2" variance = variance_protein_group_calculation( - intensity_df, metadata_df, protein_id, group1, group2 + intensity_df, protein_id, group1, group2 ) - - assert variance == 4.0 print(variance) + assert variance == 4.0 + +def test_sample_size_calculation( + power_test_data + +): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + + + required_sample_size = sample_size_calculation( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1= "Group1", + group2= "Group2", + selected_protein_group=test_selected_protein_group, + significant_proteins_only=False, + intensity_name=None + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()),None) + assert required_sample_size_int == 63 + + diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 84ec3cfd..e7884f99 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -212,7 +212,7 @@

{{ display_name }}

{% if display_output %}
-
From d3cf9d89ec64e7f250ac25d22d14124919373f98 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 26 Jun 2024 10:54:01 +0200 Subject: [PATCH 11/36] edited description of function --- protzilla/data_analysis/power_analysis.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index c05b7b67..b3f63665 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -52,13 +52,17 @@ def sample_size_calculation( intensity_name: str = None ) -> float: """ - Function to calculate the required sample size for each significant protein to achieve the required power . + Function to calculate the required sample size for a selected protein to achieve the required power . + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param alpha: The significance level. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. :param power: The power of the test. :param group1: The name of the first group. :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. :param intensity_name: The name of the column containing the protein group intensities. :return: The required sample size. """ From 3ce4ae1d7b7d34a0dae705b273a284ce71b83140 Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 8 Jul 2024 06:38:21 +0200 Subject: [PATCH 12/36] check if implemented function of Paper (Cairns et al., 2009) and library-function of Sample Size Calculation have the same result --- .../data_analysis/test_power_analysis.py | 80 ++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index ec0bc1b8..27550248 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -3,13 +3,13 @@ import pytest -from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation +from protzilla.data_analysis.power_analysis import sample_size_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log @pytest.fixture def power_test_data(): test_differentially_expressed_proteins_list = ( - ["Sample1", "Protein1", "Gene1", 20, "Group1"], + ["Sample1", "Protein1", "Gene1", 18, "Group1"], ["Sample1", "Protein2", "Gene1", 16, "Group1"], ["Sample1", "Protein3", "Gene1", 1, "Group1"], ["Sample1", "Protein4", "Gene1", 14, "Group1"], @@ -84,6 +84,82 @@ def test_sample_size_calculation( assert required_sample_size_int == 63 +def test_check_sample_size_calculation_with_libfun( + power_test_data + +): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + significant_proteins_only=False, + intensity_name=None + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_check_sample_size_calculation_impl( + power_test_data + +): + test_alpha = 0.05 + test_power = 0.8 + power_test_data_log2 = power_test_data.copy() + power_test_data_log2["Normalised iBAQ"] = np.log2(power_test_data_log2["Normalised iBAQ"]) + fc_threshold = 1 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_implemented( + differentially_expressed_proteins_df=power_test_data_log2, + significant_proteins_df=power_test_data, + fc_threshold=fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + significant_proteins_only=False, + intensity_name=None + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_check_sample_size_calculation_implemented_without_log( + power_test_data + +): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_implemented_without_log( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + significant_proteins_only=False, + intensity_name=None + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 From f78b0b928638a984376b0069547b1dffcc54e474 Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 8 Jul 2024 06:39:21 +0200 Subject: [PATCH 13/36] power calculation and test of library-function and implemented paper-function --- protzilla/data_analysis/power_analysis.py | 201 +++++++++++++++++++++- protzilla/methods/data_analysis.py | 41 ++++- ui/runs/forms/data_analysis.py | 59 ++++++- 3 files changed, 291 insertions(+), 10 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index b3f63665..a87ec0aa 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -7,7 +7,7 @@ from statsmodels.stats.power import TTestIndPower -def variance_protein_group_calculation( +def variance_protein_group_calculation_max( intensity_df: pd.DataFrame, protein_id: str, group1: str, @@ -73,7 +73,7 @@ def sample_size_calculation( z_alpha = stats.norm.ppf(1 - alpha / 2) z_beta = stats.norm.ppf(power) - variance_protein_group = variance_protein_group_calculation( + variance_protein_group = variance_protein_group_calculation_max( intensity_df=differentially_expressed_proteins_df, protein_id=protein_group, group1=group1, @@ -87,5 +87,202 @@ def sample_size_calculation( return dict(required_sample_size=required_sample_size) +def check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values) + group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) + mean_diff = abs(group1_intensities.mean() - group2_intensities.mean()) + effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled + + obj = TTestIndPower() + required_sample_size = obj.solve_power( + effect_size=effect_size, + alpha=alpha, + power=power, nobs1=None, ratio=1.0, alternative='two-sided') + print(required_sample_size) + + required_sample_size = math.ceil(required_sample_size) + + return dict(required_sample_size=required_sample_size) + #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 + + #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 +def check_sample_size_calculation_implemented( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) + +def check_sample_size_calculation_implemented_without_log( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) + +def power_calculation( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + alpha: float, + fc_threshold: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + + """ + Function to calculate the power of the t-test for a selected protein group. + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param fc_threshold: The fold change threshold. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the power is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The power of the test. + """ + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + protein_group = selected_protein_group + z_alpha = stats.norm.ppf(1 - alpha / 2) + + variance_protein_group = variance_protein_group_calculation_max( + intensity_df=differentially_expressed_proteins_df, + protein_id=protein_group, + group1=group1, + group2=group2, + intensity_name=intensity_name, + ) + sample_size = differentially_expressed_proteins_df.groupby('Group')['Sample'].count() + z_beta = fc_threshold * np.sqrt(sample_size/(2*variance_protein_group**2))-z_alpha + power = stats.norm.cdf(z_beta) + + return dict(power=power) + diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 24319c9a..c37b524d 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -21,7 +21,7 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.power_analysis import sample_size_calculation +from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.methods.data_preprocessing import TransformationLog from protzilla.steps import Plots, Step, StepManager, DisplayOutput @@ -764,16 +764,49 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PowerAnalysisPowerCalculation(DataAnalysisStep): display_name = "Power Calculation" operation = "Power Analysis" - method_description = "post-hoc Power Calculation" + method_description = "Calculates power of the test for given protein groups" input_keys = [ - "significant_proteins_df" + "significant_proteins_df", + "differentially_expressed_proteins_df", + "selected_protein_group", + "significant_proteins_df", + "significant_proteins_only", + "fc_threshold", + "alpha", + "group1", + "group2", ] + output_keys = ["power",] + + def method(self, inputs: dict) -> dict: + return power_calculation(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_outputs(self, outputs: dict): + super().handle_outputs(outputs) + self.display_output["power"] = f"Power of the test: {outputs['power']}" + class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): display_name = "Sample Size Calculation" operation = "Power Analysis" - method_description = "Calculates sample size for protein groups" + method_description = "Calculates sample size for given protein groups" input_keys = [ "differentially_expressed_proteins_df", diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 1a6b6d34..74c5a149 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1091,11 +1091,12 @@ def fill_form(self, run: Run) -> None: if single_protein_peptides: self.fields["peptide_df"].initial = single_protein_peptides[0] class PowerAnalysisPowerCalculationForm(MethodForm): - t_test_results = CustomChoiceField( + is_dynamic = True + + input_dict = CustomChoiceField( choices=[], - label="T-test results", + label="Input data dict (generated e.g. by t-Test)", ) - #fill alpha dynamic from t-test alpha = CustomFloatField( label="Error rate (alpha)", min_value = 0, @@ -1103,8 +1104,58 @@ class PowerAnalysisPowerCalculationForm(MethodForm): step_size = 0.05, initial = 0.05, ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + selected_protein_group = CustomChoiceField( + choices=[], + label="Protein group to calculate power for", + ) + def fill_form(self, run: Run) -> None: - self.fields["t_test_results"].choices = get_t_test_results(run) + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + significant_proteins_only = self.data.get( + "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0] + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + class PowerAnalysisSampleSizeCalculationForm(MethodForm): is_dynamic = True From e3dd1c35e5bb91d9d4648c9946bcb605cb9f1227 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 21 Aug 2024 00:56:51 +0200 Subject: [PATCH 14/36] added test for power_calculation method --- protzilla/data_analysis/power_analysis.py | 52 +++++++++++++++++-- .../data_analysis/test_power_analysis.py | 25 ++++++++- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index a87ec0aa..92e76ba9 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -240,7 +240,7 @@ def check_sample_size_calculation_implemented_without_log( return dict(required_sample_size=required_sample_size) -def power_calculation( +def power_calculation_test( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, significant_proteins_only: bool, @@ -278,11 +278,55 @@ def power_calculation( group2=group2, intensity_name=intensity_name, ) - sample_size = differentially_expressed_proteins_df.groupby('Group')['Sample'].count() - z_beta = fc_threshold * np.sqrt(sample_size/(2*variance_protein_group**2))-z_alpha - power = stats.norm.cdf(z_beta) + sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count()) + z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha + power = round(stats.norm.cdf(z_beta), 2) return dict(power=power) +def power_calculation( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: bool, + alpha: float, + fc_threshold: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + + """ + Function to calculate the power of the t-test for a selected protein group. + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param fc_threshold: The fold change threshold. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the power is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The power of the test. + """ + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + protein_group = selected_protein_group + z_alpha = stats.norm.ppf(1 - alpha / 2) + + variance_protein_group = variance_protein_group_calculation_max( + intensity_df=differentially_expressed_proteins_df, + protein_id=protein_group, + group1=group1, + group2=group2, + intensity_name=intensity_name, + ) + sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count()) + z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha + power = round(stats.norm.cdf(z_beta), 2) + + return dict(power=power) + diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index 27550248..f83f7bf6 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -3,7 +3,7 @@ import pytest -from protzilla.data_analysis.power_analysis import sample_size_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log +from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation_test, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log @pytest.fixture @@ -134,7 +134,7 @@ def test_check_sample_size_calculation_impl( ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) - assert required_sample_size_int == 63 + assert required_sample_size_int == 1 def test_check_sample_size_calculation_implemented_without_log( power_test_data @@ -163,4 +163,25 @@ def test_check_sample_size_calculation_implemented_without_log( +def test_power_calculation( + power_test_data +): + test_alpha = 0.05 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + + power = power_calculation_test( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + fc_threshold=test_fc_threshold, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + significant_proteins_only=False, + intensity_name=None + ) + print(power) + power_int = next(iter(power.values()), None) + assert power_int== 0.09 From 2e3de5ad1525e0f042b15df3868cbac62dbaac44 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 21 Aug 2024 20:29:25 +0200 Subject: [PATCH 15/36] fixed constructor error --- protzilla/data_analysis/power_analysis.py | 48 +------------------ protzilla/methods/data_analysis.py | 2 +- .../data_analysis/test_power_analysis.py | 4 +- user_data/workflows/standard.yaml | 4 +- 4 files changed, 7 insertions(+), 51 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 92e76ba9..80d542a2 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -240,51 +240,6 @@ def check_sample_size_calculation_implemented_without_log( return dict(required_sample_size=required_sample_size) -def power_calculation_test( - differentially_expressed_proteins_df: pd.DataFrame, - significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, - alpha: float, - fc_threshold: float, - group1: str, - group2: str, - selected_protein_group: str, - intensity_name: str = None -) -> float: - - """ - Function to calculate the power of the t-test for a selected protein group. - - :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. - :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. - :param alpha: The significance level. The value for alpha is taken from the t-test by default. - :param fc_threshold: The fold change threshold. - :param group1: The name of the first group. - :param group2: The name of the second group. - :param selected_protein_group: The selected protein group for which the power is to be calculated. - :param intensity_name: The name of the column containing the protein group intensities. - :return: The power of the test. - """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: - raise ValueError("Please select a valid protein group.") - protein_group = selected_protein_group - z_alpha = stats.norm.ppf(1 - alpha / 2) - - variance_protein_group = variance_protein_group_calculation_max( - intensity_df=differentially_expressed_proteins_df, - protein_id=protein_group, - group1=group1, - group2=group2, - intensity_name=intensity_name, - ) - sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count()) - z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha - power = round(stats.norm.cdf(z_beta), 2) - - return dict(power=power) - - def power_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -323,9 +278,10 @@ def power_calculation( group2=group2, intensity_name=intensity_name, ) + sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count()) z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha - power = round(stats.norm.cdf(z_beta), 2) + power = float(round(stats.norm.cdf(z_beta), 2)) return dict(power=power) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index c37b524d..d5b2ccb4 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -777,7 +777,7 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): "group1", "group2", ] - output_keys = ["power",] + output_keys = ["power"] def method(self, inputs: dict) -> dict: return power_calculation(**inputs) diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index f83f7bf6..ceded6e3 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -3,7 +3,7 @@ import pytest -from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation_test, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log +from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log @pytest.fixture @@ -171,7 +171,7 @@ def test_power_calculation( test_selected_protein_group = "Protein1" - power = power_calculation_test( + power = power_calculation( differentially_expressed_proteins_df=power_test_data, significant_proteins_df=power_test_data, fc_threshold=test_fc_threshold, diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index 3c27017d..970a2a2f 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -60,10 +60,10 @@ steps: type: DifferentialExpressionTTest - form_inputs: { } inputs: { } - type: PowerAnalysisPowerCalculation + type: PowerAnalysisSampleSizeCalculation - form_inputs: { } inputs: { } - type: PowerAnalysisSampleSizeCalculation + type: PowerAnalysisPowerCalculation - form_inputs: fc_threshold: 1 inputs: { } From a46a074eeb12539ee39fabb21191f55679445bdd Mon Sep 17 00:00:00 2001 From: selenabr Date: Fri, 23 Aug 2024 21:23:28 +0200 Subject: [PATCH 16/36] sample size calculation for different group sizes (Cohen 1988) and moved validation methods to separate file --- protzilla/data_analysis/power_analysis.py | 172 ++---------------- .../power_analysis_validation.py | 154 ++++++++++++++++ 2 files changed, 170 insertions(+), 156 deletions(-) create mode 100644 protzilla/data_analysis/power_analysis_validation.py diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 80d542a2..ec163dd5 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -87,159 +87,6 @@ def sample_size_calculation( return dict(required_sample_size=required_sample_size) -def check_sample_size_calculation_with_libfunc( - differentially_expressed_proteins_df: pd.DataFrame, - significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, - fc_threshold: float, - alpha: float, - power: float, - group1: str, - group2: str, - selected_protein_group: str, - intensity_name: str = None -) -> float: - """ - Function to calculate the required sample size for a selected protein to achieve the required power . - - :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. - :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. - :param fc_threshold: The fold change threshold. - :param alpha: The significance level. The value for alpha is taken from the t-test by default. - :param power: The power of the test. - :param group1: The name of the first group. - :param group2: The name of the second group. - :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. - :param intensity_name: The name of the column containing the protein group intensities. - :return: The required sample size. - """ - - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: - raise ValueError("Please select a valid protein group.") - - protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values) - group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values) - variance_group1 = np.var(group1_intensities, ddof=1) - variance_group2 = np.var(group2_intensities, ddof=1) - - sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) - mean_diff = abs(group1_intensities.mean() - group2_intensities.mean()) - effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled - - obj = TTestIndPower() - required_sample_size = obj.solve_power( - effect_size=effect_size, - alpha=alpha, - power=power, nobs1=None, ratio=1.0, alternative='two-sided') - print(required_sample_size) - - required_sample_size = math.ceil(required_sample_size) - - return dict(required_sample_size=required_sample_size) - #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 - - #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 -def check_sample_size_calculation_implemented( - differentially_expressed_proteins_df: pd.DataFrame, - significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, - fc_threshold: float, - alpha: float, - power: float, - group1: str, - group2: str, - selected_protein_group: str, - intensity_name: str = None -) -> float: - """ - Function to calculate the required sample size for a selected protein to achieve the required power . - - :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. - :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. - :param fc_threshold: The fold change threshold. - :param alpha: The significance level. The value for alpha is taken from the t-test by default. - :param power: The power of the test. - :param group1: The name of the first group. - :param group2: The name of the second group. - :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. - :param intensity_name: The name of the column containing the protein group intensities. - :return: The required sample size. - """ - - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: - raise ValueError("Please select a valid protein group.") - - z_alpha = stats.norm.ppf(1 - alpha / 2) - z_beta = stats.norm.ppf(power) - protein_group = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values - group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values - fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) - variance_group1 = np.var(group1_intensities, ddof=1) - variance_group2 = np.var(group2_intensities, ddof=1) - - pooled_variance = (variance_group1 + variance_group2) / 2 - required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) - required_sample_size = math.ceil(required_sample_size) - print(required_sample_size) - - return dict(required_sample_size=required_sample_size) - -def check_sample_size_calculation_implemented_without_log( - differentially_expressed_proteins_df: pd.DataFrame, - significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, - fc_threshold: float, - alpha: float, - power: float, - group1: str, - group2: str, - selected_protein_group: str, - intensity_name: str = None -) -> float: - """ - Function to calculate the required sample size for a selected protein to achieve the required power . - - :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. - :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. - :param fc_threshold: The fold change threshold. - :param alpha: The significance level. The value for alpha is taken from the t-test by default. - :param power: The power of the test. - :param group1: The name of the first group. - :param group2: The name of the second group. - :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. - :param intensity_name: The name of the column containing the protein group intensities. - :return: The required sample size. - """ - - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: - raise ValueError("Please select a valid protein group.") - - z_alpha = stats.norm.ppf(1 - alpha / 2) - z_beta = stats.norm.ppf(power) - protein_group = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values - group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values - fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) - variance_group1 = np.var(group1_intensities, ddof=1) - variance_group2 = np.var(group2_intensities, ddof=1) - - pooled_variance = (variance_group1 + variance_group2) / 2 - required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) - required_sample_size = math.ceil(required_sample_size) - print(required_sample_size) - - return dict(required_sample_size=required_sample_size) - def power_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -257,7 +104,6 @@ def power_calculation( :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. :param alpha: The significance level. The value for alpha is taken from the t-test by default. :param fc_threshold: The fold change threshold. :param group1: The name of the first group. @@ -279,8 +125,22 @@ def power_calculation( intensity_name=intensity_name, ) - sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count()) - z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha + """ + filtered_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group] + filtered_df["Person"] = filtered_df["Sample"].apply( + lambda x: x[:7]) + + variance = filtered_df.groupby(['Person', 'Group'])['Normalised iBAQ'].var().reset_index() + + filtered_df["Measurement"] = filtered_df["Sample"].apply( + lambda x: int(x[-2:])) + """ + filtered_protein_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group] + grouped_df= filtered_protein_df.groupby(['Group', 'Protein ID'])['Sample'].count() + sample_size_group1 = grouped_df[group1][0] + sample_size_group2 = grouped_df[group2][0] + sample_size = (2 * sample_size_group1 * sample_size_group2) / (sample_size_group1 + sample_size_group2) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences + z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha power = float(round(stats.norm.cdf(z_beta), 2)) return dict(power=power) diff --git a/protzilla/data_analysis/power_analysis_validation.py b/protzilla/data_analysis/power_analysis_validation.py new file mode 100644 index 00000000..40d517d3 --- /dev/null +++ b/protzilla/data_analysis/power_analysis_validation.py @@ -0,0 +1,154 @@ +import numpy as np +import pandas as pd +import math +from scipy import stats +from statsmodels.stats.power import TTestIndPower + + + +def check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values) + group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) + mean_diff = abs(group1_intensities.mean() - group2_intensities.mean()) + effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled + + obj = TTestIndPower() + required_sample_size = obj.solve_power( + effect_size=effect_size, + alpha=alpha, + power=power, nobs1=None, ratio=1.0, alternative='two-sided') + print(required_sample_size) + + required_sample_size = math.ceil(required_sample_size) + + return dict(required_sample_size=required_sample_size) + #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 + + #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 +def check_sample_size_calculation_implemented( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) + +def check_sample_size_calculation_implemented_without_log( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + selected_protein_group: str, + intensity_name: str = None +) -> float: + """ + Function to calculate the required sample size for a selected protein to achieve the required power . + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param intensity_name: The name of the column containing the protein group intensities. + :return: The required sample size. + """ + + if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + raise ValueError("Please select a valid protein group.") + + z_alpha = stats.norm.ppf(1 - alpha / 2) + z_beta = stats.norm.ppf(power) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] + + group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) + variance_group1 = np.var(group1_intensities, ddof=1) + variance_group2 = np.var(group2_intensities, ddof=1) + + pooled_variance = (variance_group1 + variance_group2) / 2 + required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) + + return dict(required_sample_size=required_sample_size) \ No newline at end of file From 3446be375e4c4cee0c0481c037db449faf72e02d Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 26 Aug 2024 11:05:29 +0200 Subject: [PATCH 17/36] code formatting, resolved comments (output not a float, significant_proteins_only, intensity_name) --- .../differential_expression_mann_whitney.py | 71 +++-- protzilla/data_analysis/power_analysis.py | 67 +++-- .../power_analysis_validation.py | 101 +++++-- protzilla/data_analysis/ptm_analysis.py | 49 +-- protzilla/data_integration/di_plots.py | 1 - .../data_preprocessing/filter_proteins.py | 5 +- protzilla/data_preprocessing/normalisation.py | 38 +-- .../data_preprocessing/outlier_detection.py | 30 +- .../data_preprocessing/peptide_filter.py | 4 +- .../data_preprocessing/transformation.py | 4 +- protzilla/importing/ms_data_import.py | 67 ++++- protzilla/importing/peptide_import.py | 4 +- protzilla/methods/data_analysis.py | 94 ++++-- protzilla/methods/data_preprocessing.py | 2 +- protzilla/methods/importing.py | 4 +- protzilla/steps.py | 10 +- protzilla/utilities/transform_dfs.py | 4 +- tests/conftest.py | 278 ++++++++++++++++-- .../data_analysis/test_analysis_plots.py | 4 +- .../test_differential_expression.py | 2 +- .../test_filter_peptites_of_protein.py | 15 +- .../data_analysis/test_peptide_analysis.py | 74 ++++- .../data_analysis/test_plots_data_analysis.py | 26 +- .../data_analysis/test_power_analysis.py | 59 ++-- .../test_plots_data_integration.py | 29 +- .../data_preprocessing/test_normalisation.py | 10 +- .../test_outlier_detection.py | 3 +- .../test_peptide_preprocessing.py | 2 - .../importing/test_ms_data_import.py | 8 +- tests/protzilla/test_runner.py | 107 ++++--- ui/runs/forms/data_analysis.py | 102 ++++--- ui/runs/views.py | 6 +- .../workflows/overhaul.yaml:Zone.Identifier | 3 + 33 files changed, 903 insertions(+), 380 deletions(-) create mode 100644 user_data/workflows/overhaul.yaml:Zone.Identifier diff --git a/protzilla/data_analysis/differential_expression_mann_whitney.py b/protzilla/data_analysis/differential_expression_mann_whitney.py index 89041261..9c699e58 100644 --- a/protzilla/data_analysis/differential_expression_mann_whitney.py +++ b/protzilla/data_analysis/differential_expression_mann_whitney.py @@ -4,19 +4,22 @@ import pandas as pd from scipy import stats -from protzilla.data_analysis.differential_expression_helper import _map_log_base, apply_multiple_testing_correction +from protzilla.data_analysis.differential_expression_helper import ( + _map_log_base, + apply_multiple_testing_correction, +) from protzilla.utilities.transform_dfs import long_to_wide def mann_whitney_test_on_intensity_data( - intensity_df: pd.DataFrame, - metadata_df: pd.DataFrame, - grouping: str, - group1: str, - group2: str, - log_base: str = None, - alpha=0.05, - multiple_testing_correction_method: str = "", + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + log_base: str = None, + alpha=0.05, + multiple_testing_correction_method: str = "", ) -> dict: wide_df = long_to_wide(intensity_df) @@ -31,13 +34,24 @@ def mann_whitney_test_on_intensity_data( multiple_testing_correction_method=multiple_testing_correction_method, columns_name="Protein ID", ) - differentially_expressed_proteins_df = pd.merge(intensity_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left") + differentially_expressed_proteins_df = pd.merge( + intensity_df, + outputs["differential_expressed_columns_df"], + on="Protein ID", + how="left", + ) differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[ - differentially_expressed_proteins_df["Protein ID"].isin(outputs["differential_expressed_columns_df"]["Protein ID"]) + differentially_expressed_proteins_df["Protein ID"].isin( + outputs["differential_expressed_columns_df"]["Protein ID"] + ) ] - significant_proteins_df = pd.merge(intensity_df, outputs["significant_columns_df"], on="Protein ID", how="left") + significant_proteins_df = pd.merge( + intensity_df, outputs["significant_columns_df"], on="Protein ID", how="left" + ) significant_proteins_df = significant_proteins_df.loc[ - significant_proteins_df["Protein ID"].isin(outputs["significant_columns_df"]["Protein ID"]) + significant_proteins_df["Protein ID"].isin( + outputs["significant_columns_df"]["Protein ID"] + ) ] return dict( @@ -50,16 +64,17 @@ def mann_whitney_test_on_intensity_data( messages=outputs["messages"], ) + def mann_whitney_test_on_columns( - df: pd.DataFrame, - metadata_df: pd.DataFrame, - grouping: str, - group1: str, - group2: str, - log_base: str = None, - alpha=0.05, - multiple_testing_correction_method: str = "", - columns_name: str = "Protein ID", + df: pd.DataFrame, + metadata_df: pd.DataFrame, + grouping: str, + group1: str, + group2: str, + log_base: str = None, + alpha=0.05, + multiple_testing_correction_method: str = "", + columns_name: str = "Protein ID", ) -> dict: """ Perform Mann-Whitney U test on all columns of the data frame. @@ -104,7 +119,9 @@ def mann_whitney_test_on_columns( for column in data_columns: group1_data = df_with_groups[df_with_groups[grouping] == group1][column] group2_data = df_with_groups[df_with_groups[grouping] == group2][column] - u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided") + u_statistic, p_value = stats.mannwhitneyu( + group1_data, group2_data, alternative="two-sided" + ) if not np.isnan(p_value): log2_fold_change = ( @@ -149,9 +166,13 @@ def mann_whitney_test_on_columns( significant_columns_df = combined_df[ combined_df["corrected_p_value"] <= corrected_alpha - ] + ] - messages = [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")] if invalid_columns else [] + messages = ( + [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")] + if invalid_columns + else [] + ) return dict( differential_expressed_columns_df=combined_df, diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index ec163dd5..4a303a18 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -1,10 +1,10 @@ -import logging +import math import numpy as np import pandas as pd -import math from scipy import stats -from statsmodels.stats.power import TTestIndPower + +from protzilla.utilities import default_intensity_column def variance_protein_group_calculation_max( @@ -24,13 +24,15 @@ def variance_protein_group_calculation_max( :param intensity_name: The name of the column containing the protein group intensities. :return: The variance of the protein group. """ - - if intensity_name is None: - intensity_name = "Normalised iBAQ" + intensity_name = default_intensity_column(intensity_df, intensity_name) protein_group = intensity_df[intensity_df["Protein ID"] == protein_id] - group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values - group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values + group1_intensities = protein_group[protein_group["Group"] == group1][ + intensity_name + ].values + group2_intensities = protein_group[protein_group["Group"] == group2][ + intensity_name + ].values variance_group1 = np.var(group1_intensities, ddof=1) variance_group2 = np.var(group2_intensities, ddof=1) @@ -39,18 +41,18 @@ def variance_protein_group_calculation_max( return max_variance + def sample_size_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, fc_threshold: float, alpha: float, power: float, group1: str, group2: str, selected_protein_group: str, - intensity_name: str = None -) -> float: + intensity_name: str = None, +) -> dict: """ Function to calculate the required sample size for a selected protein to achieve the required power . @@ -67,7 +69,11 @@ def sample_size_calculation( :return: The required sample size. """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): raise ValueError("Please select a valid protein group.") protein_group = selected_protein_group z_alpha = stats.norm.ppf(1 - alpha / 2) @@ -81,24 +87,25 @@ def sample_size_calculation( intensity_name=intensity_name, ) - required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group) + required_sample_size = ( + 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group + ) required_sample_size = math.ceil(required_sample_size) print(required_sample_size) return dict(required_sample_size=required_sample_size) + def power_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, - significant_proteins_only: bool, alpha: float, fc_threshold: float, group1: str, group2: str, selected_protein_group: str, - intensity_name: str = None -) -> float: - + intensity_name: str = None, +) -> dict: """ Function to calculate the power of the t-test for a selected protein group. @@ -112,7 +119,11 @@ def power_calculation( :param intensity_name: The name of the column containing the protein group intensities. :return: The power of the test. """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): raise ValueError("Please select a valid protein group.") protein_group = selected_protein_group z_alpha = stats.norm.ppf(1 - alpha / 2) @@ -135,14 +146,18 @@ def power_calculation( filtered_df["Measurement"] = filtered_df["Sample"].apply( lambda x: int(x[-2:])) """ - filtered_protein_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group] - grouped_df= filtered_protein_df.groupby(['Group', 'Protein ID'])['Sample'].count() - sample_size_group1 = grouped_df[group1][0] - sample_size_group2 = grouped_df[group2][0] - sample_size = (2 * sample_size_group1 * sample_size_group2) / (sample_size_group1 + sample_size_group2) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences - z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha + filtered_protein_df = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == protein_group + ] + grouped_df = filtered_protein_df.groupby(["Group", "Protein ID"])["Sample"].count() + sample_size_group1 = grouped_df[group1][0] + sample_size_group2 = grouped_df[group2][0] + sample_size = (2 * sample_size_group1 * sample_size_group2) / ( + sample_size_group1 + sample_size_group2 + ) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences + z_beta = ( + fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha + ) power = float(round(stats.norm.cdf(z_beta), 2)) return dict(power=power) - - diff --git a/protzilla/data_analysis/power_analysis_validation.py b/protzilla/data_analysis/power_analysis_validation.py index 40d517d3..8351202d 100644 --- a/protzilla/data_analysis/power_analysis_validation.py +++ b/protzilla/data_analysis/power_analysis_validation.py @@ -1,11 +1,11 @@ +import math + import numpy as np import pandas as pd -import math from scipy import stats from statsmodels.stats.power import TTestIndPower - def check_sample_size_calculation_with_libfunc( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -15,8 +15,8 @@ def check_sample_size_calculation_with_libfunc( group1: str, group2: str, selected_protein_group: str, - intensity_name: str = None -) -> float: + intensity_name: str = None, +) -> dict: """ Function to calculate the required sample size for a selected protein to achieve the required power . @@ -32,33 +32,49 @@ def check_sample_size_calculation_with_libfunc( :return: The required sample size. """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): raise ValueError("Please select a valid protein group.") - protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values) - group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values) + protein_group = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = np.log2( + protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values + ) + group2_intensities = np.log2( + protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + ) variance_group1 = np.var(group1_intensities, ddof=1) variance_group2 = np.var(group2_intensities, ddof=1) sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) - mean_diff = abs(group1_intensities.mean() - group2_intensities.mean()) - effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled + abs(group1_intensities.mean() - group2_intensities.mean()) + effect_size = (group1_intensities.mean() - group2_intensities.mean()) / sd_pooled obj = TTestIndPower() required_sample_size = obj.solve_power( effect_size=effect_size, alpha=alpha, - power=power, nobs1=None, ratio=1.0, alternative='two-sided') + power=power, + nobs1=None, + ratio=1.0, + alternative="two-sided", + ) print(required_sample_size) required_sample_size = math.ceil(required_sample_size) return dict(required_sample_size=required_sample_size) - #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 + # required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 + + # impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 + - #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 def check_sample_size_calculation_implemented( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -68,8 +84,8 @@ def check_sample_size_calculation_implemented( group1: str, group2: str, selected_protein_group: str, - intensity_name: str = None -) -> float: + intensity_name: str = None, +) -> dict: """ Function to calculate the required sample size for a selected protein to achieve the required power . @@ -85,27 +101,39 @@ def check_sample_size_calculation_implemented( :return: The required sample size. """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): raise ValueError("Please select a valid protein group.") z_alpha = stats.norm.ppf(1 - alpha / 2) z_beta = stats.norm.ppf(power) protein_group = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values - group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = protein_group[protein_group["Group"] == group1][ + "Normalised iBAQ" + ].values + group2_intensities = protein_group[protein_group["Group"] == group2][ + "Normalised iBAQ" + ].values fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) variance_group1 = np.var(group1_intensities, ddof=1) variance_group2 = np.var(group2_intensities, ddof=1) pooled_variance = (variance_group1 + variance_group2) / 2 - required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = ( + 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance + ) required_sample_size = math.ceil(required_sample_size) print(required_sample_size) return dict(required_sample_size=required_sample_size) + def check_sample_size_calculation_implemented_without_log( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -115,8 +143,8 @@ def check_sample_size_calculation_implemented_without_log( group1: str, group2: str, selected_protein_group: str, - intensity_name: str = None -) -> float: + intensity_name: str = None, +) -> dict: """ Function to calculate the required sample size for a selected protein to achieve the required power . @@ -132,23 +160,34 @@ def check_sample_size_calculation_implemented_without_log( :return: The required sample size. """ - if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: + if ( + selected_protein_group not in significant_proteins_df["Protein ID"].values + and selected_protein_group + not in differentially_expressed_proteins_df["Protein ID"].values + ): raise ValueError("Please select a valid protein group.") z_alpha = stats.norm.ppf(1 - alpha / 2) z_beta = stats.norm.ppf(power) protein_group = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] - - group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values - group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values + differentially_expressed_proteins_df["Protein ID"] == selected_protein_group + ] + + group1_intensities = protein_group[protein_group["Group"] == group1][ + "Normalised iBAQ" + ].values + group2_intensities = protein_group[protein_group["Group"] == group2][ + "Normalised iBAQ" + ].values fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) variance_group1 = np.var(group1_intensities, ddof=1) variance_group2 = np.var(group2_intensities, ddof=1) pooled_variance = (variance_group1 + variance_group2) / 2 - required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) + required_sample_size = ( + 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance + ) required_sample_size = math.ceil(required_sample_size) print(required_sample_size) - return dict(required_sample_size=required_sample_size) \ No newline at end of file + return dict(required_sample_size=required_sample_size) diff --git a/protzilla/data_analysis/ptm_analysis.py b/protzilla/data_analysis/ptm_analysis.py index 7917699d..8368b73d 100644 --- a/protzilla/data_analysis/ptm_analysis.py +++ b/protzilla/data_analysis/ptm_analysis.py @@ -1,15 +1,15 @@ import logging -from math import log +import re import numpy as np import pandas as pd -import re from protzilla.utilities.transform_dfs import long_to_wide def filter_peptides_of_protein( - peptide_df: pd.DataFrame, protein_ids: list[str], + peptide_df: pd.DataFrame, + protein_ids: list[str], ) -> dict: """ This function filters out all peptides with a PEP value (assigned to all samples @@ -23,15 +23,21 @@ def filter_peptides_of_protein( filtered_peptide_dfs = [pd.DataFrame] * len(protein_ids) for i, protein_id in enumerate(protein_ids): - filtered_peptide_dfs[i] = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)] + filtered_peptide_dfs[i] = peptide_df[ + peptide_df["Protein ID"].str.contains(protein_id) + ] filtered_peptides = pd.concat(filtered_peptide_dfs) return dict( peptide_df=filtered_peptides, - messages=[{ - "level": logging.INFO if len(filtered_peptides) > 0 else logging.WARNING, - "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe." - }], + messages=[ + { + "level": logging.INFO + if len(filtered_peptides) > 0 + else logging.WARNING, + "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe.", + } + ], ) @@ -48,8 +54,12 @@ def ptms_per_sample(peptide_df: pd.DataFrame) -> dict: modification_df = peptide_df[["Sample", "Modifications"]] modification_df = pd.concat( - [modification_df["Sample"], - (modification_df['Modifications'].str.get_dummies(sep=","))], axis=1) + [ + modification_df["Sample"], + (modification_df["Modifications"].str.get_dummies(sep=",")), + ], + axis=1, + ) for column, data in modification_df.iteritems(): amount, name = from_string(column) @@ -80,7 +90,7 @@ def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict: modification_df = peptide_df[["Sample", "Protein ID", "Modifications"]] modification_df = modification_df[["Sample", "Protein ID"]].join( - modification_df['Modifications'].str.get_dummies(sep=",") + modification_df["Modifications"].str.get_dummies(sep=",") ) for column, data in modification_df.iteritems(): @@ -95,16 +105,19 @@ def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict: modification_df = modification_df.reset_index() - modi = ( - modification_df.drop(["Sample", "Protein ID"], axis=1).apply(lambda x: ('(' + x.astype(str) + ') ' + x.name + ", "))) + modi = modification_df.drop(["Sample", "Protein ID"], axis=1).apply( + lambda x: ("(" + x.astype(str) + ") " + x.name + ", ") + ) for column, data in modi.iteritems(): modi[column] = np.where(modification_df[column] > 0, modi[column], "") - modification_df["Modifications"] = modi.apply(''.join, axis=1) - modification_df = modification_df[['Sample', 'Protein ID', 'Modifications']] + modification_df["Modifications"] = modi.apply("".join, axis=1) + modification_df = modification_df[["Sample", "Protein ID", "Modifications"]] - modification_df = long_to_wide(modification_df, "Modifications").fillna("").reset_index() + modification_df = ( + long_to_wide(modification_df, "Modifications").fillna("").reset_index() + ) return dict(ptm_df=modification_df) @@ -118,9 +131,9 @@ def from_string(mod_string: str) -> tuple[int, str]: :return: tuple containing the amount and name of the modification """ - re_search = re.search(r'\d+', mod_string) + re_search = re.search(r"\d+", mod_string) amount = int(re_search.group()) if re_search else 1 - name = re.search(r'\D+', mod_string).group() + name = re.search(r"\D+", mod_string).group() name = name[1:] if name[0] == " " else name return amount, name diff --git a/protzilla/data_integration/di_plots.py b/protzilla/data_integration/di_plots.py index 74d50ff1..ef19515f 100644 --- a/protzilla/data_integration/di_plots.py +++ b/protzilla/data_integration/di_plots.py @@ -108,7 +108,6 @@ def GO_enrichment_bar_plot( elif value == "p-value": column = "P-value" if restring_input else "Adjusted P-value" - if colors == "" or colors is None or len(colors) == 0: colors = PROTZILLA_DISCRETE_COLOR_SEQUENCE size_y = top_terms * 0.5 * len(gene_sets) diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py index 5e0bb7b8..479503f8 100644 --- a/protzilla/data_preprocessing/filter_proteins.py +++ b/protzilla/data_preprocessing/filter_proteins.py @@ -1,6 +1,7 @@ import pandas as pd from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot + from ..utilities.transform_dfs import long_to_wide @@ -30,9 +31,7 @@ def by_samples_missing( filtered_proteins_list = ( transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist() ) - filtered_df = protein_df[ - (protein_df["Protein ID"].isin(remaining_proteins_list)) - ] + filtered_df = protein_df[(protein_df["Protein ID"].isin(remaining_proteins_list))] filtered_peptide_df = None if peptide_df is not None: filtered_peptide_df = peptide_df[ diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py index e2be755b..ec4398bf 100644 --- a/protzilla/data_preprocessing/normalisation.py +++ b/protzilla/data_preprocessing/normalisation.py @@ -229,64 +229,50 @@ def by_reference_protein( def by_z_score_plot( - method_inputs, - method_outputs, - graph_type, - group_by, - visual_transformation + method_inputs, method_outputs, graph_type, group_by, visual_transformation ): return _build_box_hist_plot( method_inputs["protein_df"], method_outputs["protein_df"], graph_type, group_by, - visual_transformation + visual_transformation, ) def by_median_plot( - method_inputs, - method_outputs, - graph_type, - group_by, - visual_transformation + method_inputs, method_outputs, graph_type, group_by, visual_transformation ): return _build_box_hist_plot( method_inputs["protein_df"], method_outputs["protein_df"], - graph_type, group_by, - visual_transformation + graph_type, + group_by, + visual_transformation, ) def by_totalsum_plot( - method_inputs, - method_outputs, - graph_type, - group_by, - visual_transformation + method_inputs, method_outputs, graph_type, group_by, visual_transformation ): return _build_box_hist_plot( method_inputs["protein_df"], method_outputs["protein_df"], - graph_type, group_by, - visual_transformation + graph_type, + group_by, + visual_transformation, ) def by_reference_protein_plot( - method_inputs, - method_outputs, - graph_type, - group_by, - visual_transformation + method_inputs, method_outputs, graph_type, group_by, visual_transformation ): return _build_box_hist_plot( method_inputs["protein_df"], method_outputs["protein_df"], graph_type, group_by, - visual_transformation + visual_transformation, ) diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py index be7b8eb8..b5008830 100644 --- a/protzilla/data_preprocessing/outlier_detection.py +++ b/protzilla/data_preprocessing/outlier_detection.py @@ -10,14 +10,15 @@ create_pca_2d_scatter_plot, create_pca_3d_scatter_plot, ) + from ..utilities.transform_dfs import long_to_wide def by_isolation_forest( - protein_df: pd.DataFrame, - peptide_df: pd.DataFrame | None, - n_estimators: int = 100, - n_jobs: int = -1, + protein_df: pd.DataFrame, + peptide_df: pd.DataFrame | None, + n_estimators: int = 100, + n_jobs: int = -1, ) -> dict: """ This function filters out outliers using a clustering @@ -62,8 +63,11 @@ def by_isolation_forest( ].index.tolist() protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))] - peptide_df = (None if peptide_df is None - else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]) + peptide_df = ( + None + if peptide_df is None + else peptide_df[~(peptide_df["Sample"].isin(outlier_list))] + ) return dict( protein_df=protein_df, @@ -125,8 +129,11 @@ def by_local_outlier_factor( outlier_list = df_lof_data[df_lof_data["Outlier"]].index.tolist() protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))] - peptide_df = (None if peptide_df is None - else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]) + peptide_df = ( + None + if peptide_df is None + else peptide_df[~(peptide_df["Sample"].isin(outlier_list))] + ) return dict( protein_df=protein_df, @@ -232,8 +239,11 @@ def by_pca( df_transformed_pca_data["Outlier"] ].index.tolist() protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))] - peptide_df = (None if peptide_df is None - else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]) + peptide_df = ( + None + if peptide_df is None + else peptide_df[~(peptide_df["Sample"].isin(outlier_list))] + ) return dict( protein_df=protein_df, diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 3b1caee9..80dafe9e 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py @@ -3,9 +3,7 @@ from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot -def by_pep_value( - peptide_df: pd.DataFrame, threshold: float -) -> dict: +def by_pep_value(peptide_df: pd.DataFrame, threshold: float) -> dict: """ This function filters out all peptides with a PEP value (assigned to all samples together for each peptide) below a certain threshold. diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py index 221b01ab..401ea396 100644 --- a/protzilla/data_preprocessing/transformation.py +++ b/protzilla/data_preprocessing/transformation.py @@ -5,7 +5,9 @@ from protzilla.utilities import default_intensity_column -def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict: +def by_log( + protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10" +) -> dict: """ This function log-transforms intensity DataFrames. Supports log-transformation to the base diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index c3d9136f..8dce747b 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -11,7 +11,10 @@ def max_quant_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"] try: @@ -34,15 +37,28 @@ def max_quant_import( c[len(intensity_name) + 1 :] for c in intensity_df.columns ] intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def ms_fragger_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in [ "Intensity", @@ -87,13 +103,25 @@ def ms_fragger_import( ) intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid MS Fragger file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) -def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict: +def diann_import( + file_path, map_to_uniprot=False, aggregation_method: str = "Sum" +) -> dict: try: df = pd.read_csv( file_path, @@ -117,14 +145,27 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum" intensity_name = "Intensity" - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def transform_and_clean( - df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum" + df: pd.DataFrame, + intensity_name: str, + map_to_uniprot: bool, + aggregation_method: str = "Sum", ) -> dict: """ Transforms a dataframe that is read from a file in wide format into long format, @@ -158,7 +199,9 @@ def transform_and_clean( # applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise aggregation_method = aggregation_method.lower() agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}} - df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method]) + df = df.groupby("Protein ID", as_index=False).agg( + aggregation_method, **agg_kwargs[aggregation_method] + ) df = df.assign(Gene=lambda _: np.nan) # add deprecated genes column @@ -230,7 +273,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): all_ids_of_group.extend(new_ids) else: all_ids_of_group.append(old_id) - new_groups.append(all_ids_of_group[0] if all_ids_of_group else '') + new_groups.append(all_ids_of_group[0] if all_ids_of_group else "") return new_groups, removed_protein_ids diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py index d38495dd..3056f1d3 100644 --- a/protzilla/importing/peptide_import.py +++ b/protzilla/importing/peptide_import.py @@ -47,9 +47,7 @@ def peptide_import(file_path, intensity_name, map_to_uniprot) -> dict: ) molten = molten.rename(columns={"Leading razor protein": "Protein ID"}) - ordered = molten[ - ["Sample", "Protein ID", "Sequence", "Intensity", "PEP"] - ] + ordered = molten[["Sample", "Protein ID", "Sequence", "Intensity", "PEP"]] ordered.dropna(subset=["Protein ID"], inplace=True) ordered.sort_values(by=["Sample", "Protein ID"], ignore_index=True, inplace=True) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index d5b2ccb4..c1150335 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -8,12 +8,12 @@ ) from protzilla.data_analysis.differential_expression_anova import anova from protzilla.data_analysis.differential_expression_linear_model import linear_model -from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_columns, \ - mann_whitney_test_on_intensity_data +from protzilla.data_analysis.differential_expression_mann_whitney import ( + mann_whitney_test_on_columns, + mann_whitney_test_on_intensity_data, +) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ - ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -21,10 +21,18 @@ prot_quant_plot, scatter_plot, ) -from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation +from protzilla.data_analysis.power_analysis import ( + power_calculation, + sample_size_calculation, +) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph +from protzilla.data_analysis.ptm_analysis import ( + filter_peptides_of_protein, + ptms_per_protein_and_sample, + ptms_per_sample, +) from protzilla.methods.data_preprocessing import TransformationLog -from protzilla.steps import Plots, Step, StepManager, DisplayOutput +from protzilla.steps import Plots, Step, StepManager class DataAnalysisStep(Step): @@ -157,8 +165,10 @@ def plot(self, inputs): class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "differential_expression" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "intensity_df", @@ -181,8 +191,13 @@ def method(self, inputs: dict) -> dict: return mann_whitney_test_on_intensity_data(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - if steps.get_step_output(Step, "protein_df", inputs["intensity_df"]) is not None: - inputs["intensity_df"] = steps.get_step_output(Step, "protein_df", inputs["intensity_df"]) + if ( + steps.get_step_output(Step, "protein_df", inputs["intensity_df"]) + is not None + ): + inputs["intensity_df"] = steps.get_step_output( + Step, "protein_df", inputs["intensity_df"] + ) inputs["metadata_df"] = steps.metadata_df inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs @@ -191,8 +206,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "Peptide analysis" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "df", @@ -223,7 +240,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs def handle_outputs(self, outputs: dict) -> None: - outputs["differentially_expressed_ptm_df"] = outputs.pop("differential_expressed_columns_df", None) + outputs["differentially_expressed_ptm_df"] = outputs.pop( + "differential_expressed_columns_df", None + ) outputs["significant_ptm_df"] = outputs.pop("significant_columns_df", None) super().handle_outputs(outputs) @@ -702,17 +721,23 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: ) if inputs["auto_select"]: - significant_proteins = ( - steps.get_step_output(DataAnalysisStep, "significant_proteins_df", inputs["protein_list"])) - index_of_most_significant_protein = significant_proteins['corrected_p_value'].idxmin() - most_significant_protein = significant_proteins.loc[index_of_most_significant_protein] + significant_proteins = steps.get_step_output( + DataAnalysisStep, "significant_proteins_df", inputs["protein_list"] + ) + index_of_most_significant_protein = significant_proteins[ + "corrected_p_value" + ].idxmin() + most_significant_protein = significant_proteins.loc[ + index_of_most_significant_protein + ] inputs["protein_id"] = [most_significant_protein["Protein ID"]] - self.messages.append({ - "level": logging.INFO, - "msg": - f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " - f"from {inputs['protein_list']}" - }) + self.messages.append( + { + "level": logging.INFO, + "msg": f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " + f"from {inputs['protein_list']}", + } + ) return inputs @@ -720,8 +745,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of a single protein of interest. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of a single protein of interest. " + "This function requires a peptide dataframe with PTM information." + ) input_keys = [ "peptide_df", @@ -743,8 +770,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsProteinAndPerSample(DataAnalysisStep): display_name = "PTMs per Sample and Protein" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of all Proteins. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of all Proteins. " + "This function requires a peptide dataframe with PTM information." + ) input_keys = [ "peptide_df", @@ -761,6 +790,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: Step, "peptide_df", inputs["peptide_df"] ) return inputs + + class PowerAnalysisPowerCalculation(DataAnalysisStep): display_name = "Power Calculation" operation = "Power Analysis" @@ -780,6 +811,8 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): output_keys = ["power"] def method(self, inputs: dict) -> dict: + if "significant_proteins_only" in inputs: + del inputs["significant_proteins_only"] return power_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -822,7 +855,10 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): output_keys = [ "required_sample_size", ] + def method(self, inputs: dict) -> dict: + if "significant_proteins_only" in inputs: + del inputs["significant_proteins_only"] return sample_size_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -843,4 +879,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: def handle_outputs(self, outputs: dict): super().handle_outputs(outputs) - self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}" + self.display_output[ + "required_sample_size" + ] = f"Required Sample Size: {outputs['required_sample_size']}" diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py index 0565eaf0..9099627e 100644 --- a/protzilla/methods/data_preprocessing.py +++ b/protzilla/methods/data_preprocessing.py @@ -168,7 +168,7 @@ class TransformationLog(DataPreprocessingStep): operation = "transformation" method_description = "Transform data by log" - input_keys = [ "protein_df", "peptide_df", "log_base"] + input_keys = ["protein_df", "peptide_df", "log_base"] def method(self, inputs): return transformation.by_log(**inputs) diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 7cde1ba0..1cec5aaa 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -10,7 +10,7 @@ max_quant_import, ms_fragger_import, ) -from protzilla.importing.peptide_import import peptide_import, evidence_import +from protzilla.importing.peptide_import import evidence_import, peptide_import from protzilla.steps import Step, StepManager @@ -139,4 +139,4 @@ class EvidenceImport(ImportingStep): output_keys = ["peptide_df"] def method(self, inputs): - return evidence_import(**inputs) \ No newline at end of file + return evidence_import(**inputs) diff --git a/protzilla/steps.py b/protzilla/steps.py index 32ce93b3..185e4f3e 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -311,28 +311,32 @@ def export(self, format_): exports.append(BytesIO(base64.b64decode(plot))) return exports -class DisplayOutput: +class DisplayOutput: def __init__(self, display_output: dict = None): if display_output is None: display_output = {} self.display_output = display_output + def __iter__(self): return iter(self.display_output) + def __repr__(self): return f"DisplayOutput: {self.display_output}" + def __contains__(self, key): return key in self.display_output + def __getitem__(self, key): return self.display_output[key] + def __setitem__(self, key, value): self.display_output[key] = value + def is_empty(self) -> bool: return len(self.display_output) == 0 - - class StepManager: def __repr__(self): return f"IMP: {self.importing} PRE: {self.data_preprocessing} ANA: {self.data_analysis} INT: {self.data_integration}" diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index f3605b08..4ca02446 100644 --- a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -17,7 +17,9 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None): packages such as sklearn :rtype: pd.DataFrame """ - values_name = default_intensity_column(intensity_df) if value_name is None else value_name + values_name = ( + default_intensity_column(intensity_df) if value_name is None else value_name + ) return pd.pivot( intensity_df, index="Sample", columns="Protein ID", values=values_name ) diff --git a/tests/conftest.py b/tests/conftest.py index ea8728b1..1bc23e04 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -207,29 +207,259 @@ def peptides_df(): def evidence_peptide_df(): df = pd.DataFrame( ( - ["Sample1", "Protein1", "SEQA", 1000000, "Unmodified", "_SEQA_", 1, 0.00001, "Raw_File_1"], - ["Sample1", "Protein2", "SEQB", 2000000, "Unmodified", "_SEQB_", None, 0.00002, "Raw_File_1"], - ["Sample1", "Protein2", "SEQC", 3000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQC_", None, 0.00003, "Raw_File_1"], - ["Sample1", "Protein2", "SEQD", 4000000, "Acetyl (Protein N-term),Oxidation (M)", "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_", None, 0.00004, "Raw_File_1"], - ["Sample1", "Protein3", "SEQE", 5000000, "Unmodified", "_SEQE_", None, 0.00005, "Raw_File_1"], - ["Sample1", "Protein3", "SEQF", 6000000, "Unmodified", "_SEQF_", None, 0.00006, "Raw_File_1"], - ["Sample1", "Protein3", "SEQG", 7000000, "Unmodified", "_SEQG_", None, 0.00007, "Raw_File_1"], - ["Sample1", "Protein4", "SEQH", 8000000, "Unmodified", "_SEQH_", None, 0.00008, "Raw_File_1"], - ["Sample1", "Protein5", "SEQI", 9000000, "Unmodified", "_SEQI_", None, 0.00009, "Raw_File_1"], - ["Sample2", "Protein1", "SEQJ", 10000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQJ_", None, 0.0001, "Raw_File_2"], - ["Sample2", "Protein2", "SEQK", 11000000, "Unmodified", "_SEQK_", None, 0.00011, "Raw_File_2"], - ["Sample2", "Protein3", "SEQL", 12000000, "Unmodified", "_SEQL_", None, 0.00012, "Raw_File_2"], - ["Sample2", "Protein4", "SEQM", 13000000, "Unmodified", "_SEQM_", None, 0.00013, "Raw_File_2"], - ["Sample2", "Protein5", "SEQN", 14000000, "Unmodified", "_SEQN_", None, 0.00014, "Raw_File_2"], - ["Sample3", "Protein1", "SEQO", 15000000, "Unmodified", "_SEQO_", None, 0.00015, "Raw_File_3"], - ["Sample3", "Protein2", "SEQP", 16000000, "Unmodified", "_SEQP_", None, 0.00016, "Raw_File_3"], - ["Sample3", "Protein3", "SEQQ", 17000000, "Unmodified", "_SEQQ_", None, 0.00017, "Raw_File_3"], - ["Sample3", "Protein4", "SEQR", 18000000, "Unmodified", "_SEQR_", None, 0.00018, "Raw_File_3"], - ["Sample3", "Protein5", "SEQS", 19000000, "Unmodified", "_SEQS_", None, 0.00019, "Raw_File_3"], - ["Sample4", "Protein1", "SEQT", 20000000, "Unmodified", "_SEQT_", None, 0.0002, "Raw_File_4"], - ["Sample4", "Protein2", "SEQU", 21000000, "Unmodified", "_SEQU_", None, 0.00021, "Raw_File_4"], - ["Sample4", "Protein3", "SEQV", 22000000, "Unmodified", "_SEQV_", None, 0.00022, "Raw_File_4"], - ["Sample4", "Protein4", "SEQW", 23000000, "Unmodified", "_SEQW_", None, 0.00023, "Raw_File_4"], + [ + "Sample1", + "Protein1", + "SEQA", + 1000000, + "Unmodified", + "_SEQA_", + 1, + 0.00001, + "Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQB", + 2000000, + "Unmodified", + "_SEQB_", + None, + 0.00002, + "Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQC", + 3000000, + "Acetyl (Protein N-term)", + "_(Acetyl (Protein N-term))SEQC_", + None, + 0.00003, + "Raw_File_1", + ], + [ + "Sample1", + "Protein2", + "SEQD", + 4000000, + "Acetyl (Protein N-term),Oxidation (M)", + "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_", + None, + 0.00004, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQE", + 5000000, + "Unmodified", + "_SEQE_", + None, + 0.00005, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQF", + 6000000, + "Unmodified", + "_SEQF_", + None, + 0.00006, + "Raw_File_1", + ], + [ + "Sample1", + "Protein3", + "SEQG", + 7000000, + "Unmodified", + "_SEQG_", + None, + 0.00007, + "Raw_File_1", + ], + [ + "Sample1", + "Protein4", + "SEQH", + 8000000, + "Unmodified", + "_SEQH_", + None, + 0.00008, + "Raw_File_1", + ], + [ + "Sample1", + "Protein5", + "SEQI", + 9000000, + "Unmodified", + "_SEQI_", + None, + 0.00009, + "Raw_File_1", + ], + [ + "Sample2", + "Protein1", + "SEQJ", + 10000000, + "Acetyl (Protein N-term)", + "_(Acetyl (Protein N-term))SEQJ_", + None, + 0.0001, + "Raw_File_2", + ], + [ + "Sample2", + "Protein2", + "SEQK", + 11000000, + "Unmodified", + "_SEQK_", + None, + 0.00011, + "Raw_File_2", + ], + [ + "Sample2", + "Protein3", + "SEQL", + 12000000, + "Unmodified", + "_SEQL_", + None, + 0.00012, + "Raw_File_2", + ], + [ + "Sample2", + "Protein4", + "SEQM", + 13000000, + "Unmodified", + "_SEQM_", + None, + 0.00013, + "Raw_File_2", + ], + [ + "Sample2", + "Protein5", + "SEQN", + 14000000, + "Unmodified", + "_SEQN_", + None, + 0.00014, + "Raw_File_2", + ], + [ + "Sample3", + "Protein1", + "SEQO", + 15000000, + "Unmodified", + "_SEQO_", + None, + 0.00015, + "Raw_File_3", + ], + [ + "Sample3", + "Protein2", + "SEQP", + 16000000, + "Unmodified", + "_SEQP_", + None, + 0.00016, + "Raw_File_3", + ], + [ + "Sample3", + "Protein3", + "SEQQ", + 17000000, + "Unmodified", + "_SEQQ_", + None, + 0.00017, + "Raw_File_3", + ], + [ + "Sample3", + "Protein4", + "SEQR", + 18000000, + "Unmodified", + "_SEQR_", + None, + 0.00018, + "Raw_File_3", + ], + [ + "Sample3", + "Protein5", + "SEQS", + 19000000, + "Unmodified", + "_SEQS_", + None, + 0.00019, + "Raw_File_3", + ], + [ + "Sample4", + "Protein1", + "SEQT", + 20000000, + "Unmodified", + "_SEQT_", + None, + 0.0002, + "Raw_File_4", + ], + [ + "Sample4", + "Protein2", + "SEQU", + 21000000, + "Unmodified", + "_SEQU_", + None, + 0.00021, + "Raw_File_4", + ], + [ + "Sample4", + "Protein3", + "SEQV", + 22000000, + "Unmodified", + "_SEQV_", + None, + 0.00022, + "Raw_File_4", + ], + [ + "Sample4", + "Protein4", + "SEQW", + 23000000, + "Unmodified", + "_SEQW_", + None, + 0.00023, + "Raw_File_4", + ], ), columns=[ "Sample", @@ -241,7 +471,7 @@ def evidence_peptide_df(): "Missed cleavages", "PEP", "Raw file", - ] + ], ) return df diff --git a/tests/protzilla/data_analysis/test_analysis_plots.py b/tests/protzilla/data_analysis/test_analysis_plots.py index 3b665b0b..4d6f8156 100644 --- a/tests/protzilla/data_analysis/test_analysis_plots.py +++ b/tests/protzilla/data_analysis/test_analysis_plots.py @@ -83,7 +83,9 @@ def test_plots_volcano_plot_no_annotation(ttest_input, ttest_output, show_figure fig.show() -def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, show_figures): +def test_plots_volcano_plot_multiple_annotations( + ttest_input, ttest_output, show_figures +): fig = create_volcano_plot( p_values=ttest_output["corrected_p_values_df"], log2_fc=ttest_output["log2_fold_change_df"], diff --git a/tests/protzilla/data_analysis/test_differential_expression.py b/tests/protzilla/data_analysis/test_differential_expression.py index 43ad6021..d9f75d28 100644 --- a/tests/protzilla/data_analysis/test_differential_expression.py +++ b/tests/protzilla/data_analysis/test_differential_expression.py @@ -386,4 +386,4 @@ def test_differential_expression_anova(show_figures): 1.0000, ] - assert assertion_p_values == p_values_rounded \ No newline at end of file + assert assertion_p_values == p_values_rounded diff --git a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py index b191f335..85981476 100644 --- a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py +++ b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py @@ -1,13 +1,18 @@ -import pytest - from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein def test_filter_peptides_of_protein(peptides_df): - filtered_peptides_df = filter_peptides_of_protein(peptides_df, ["Protein2"])["peptide_df"] + filtered_peptides_df = filter_peptides_of_protein(peptides_df, ["Protein2"])[ + "peptide_df" + ] assert len(filtered_peptides_df) == 6 assert filtered_peptides_df["Sequence"].tolist() == [ - "SEQB", "SEQC", "SEQD", "SEQK", "SEQP", "SEQU" + "SEQB", + "SEQC", + "SEQD", + "SEQK", + "SEQP", + "SEQU", ] - assert (filtered_peptides_df["Protein ID"] == "Protein2").all() \ No newline at end of file + assert (filtered_peptides_df["Protein ID"] == "Protein2").all() diff --git a/tests/protzilla/data_analysis/test_peptide_analysis.py b/tests/protzilla/data_analysis/test_peptide_analysis.py index b2848a65..40f225cb 100644 --- a/tests/protzilla/data_analysis/test_peptide_analysis.py +++ b/tests/protzilla/data_analysis/test_peptide_analysis.py @@ -1,18 +1,28 @@ import pytest -from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ - ptms_per_protein_and_sample +from protzilla.data_analysis.ptm_analysis import ( + filter_peptides_of_protein, + ptms_per_protein_and_sample, + ptms_per_sample, +) @pytest.mark.parametrize("df_num", [0, 1]) def test_filter_peptides_of_protein(peptides_df, evidence_peptide_df, df_num): peptide_df = [peptides_df, evidence_peptide_df][df_num] - filtered_peptides_df = filter_peptides_of_protein(peptide_df, ["Protein2"])["peptide_df"] + filtered_peptides_df = filter_peptides_of_protein(peptide_df, ["Protein2"])[ + "peptide_df" + ] assert len(filtered_peptides_df) == 6 assert filtered_peptides_df["Sequence"].tolist() == [ - 'SEQB', 'SEQC', 'SEQD', 'SEQK', 'SEQP', 'SEQU' + "SEQB", + "SEQC", + "SEQD", + "SEQK", + "SEQP", + "SEQU", ] assert (filtered_peptides_df["Protein ID"] == "Protein2").all() @@ -20,7 +30,12 @@ def test_filter_peptides_of_protein(peptides_df, evidence_peptide_df, df_num): def test_ptms_per_sampel(evidence_peptide_df): ptm_df = ptms_per_sample(evidence_peptide_df)["ptm_df"] - assert ptm_df.columns.tolist() == ["Sample", "Acetyl (Protein N-term)", "Oxidation (M)", "Unmodified"] + assert ptm_df.columns.tolist() == [ + "Sample", + "Acetyl (Protein N-term)", + "Oxidation (M)", + "Unmodified", + ] assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"] assert ptm_df["Unmodified"].tolist() == [7, 4, 5, 4] assert ptm_df["Acetyl (Protein N-term)"].tolist() == [2, 1, 0, 0] @@ -30,15 +45,42 @@ def test_ptms_per_sampel(evidence_peptide_df): def test_ptms_per_protein_and_sample(evidence_peptide_df): ptm_df = ptms_per_protein_and_sample(evidence_peptide_df)["ptm_df"] - assert ptm_df.columns.tolist() == ["Sample", "Protein1", "Protein2", "Protein3", "Protein4", "Protein5"] + assert ptm_df.columns.tolist() == [ + "Sample", + "Protein1", + "Protein2", + "Protein3", + "Protein4", + "Protein5", + ] assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"] - assert (ptm_df["Protein1"].tolist() == - ["(1) Unmodified, ", "(1) Acetyl (Protein N-term), ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein2"].tolist() == - ["(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein3"].tolist() == - ["(3) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein4"].tolist() == - ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "]) - assert (ptm_df["Protein5"].tolist() == - ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", ""]) \ No newline at end of file + assert ptm_df["Protein1"].tolist() == [ + "(1) Unmodified, ", + "(1) Acetyl (Protein N-term), ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein2"].tolist() == [ + "(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein3"].tolist() == [ + "(3) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein4"].tolist() == [ + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + ] + assert ptm_df["Protein5"].tolist() == [ + "(1) Unmodified, ", + "(1) Unmodified, ", + "(1) Unmodified, ", + "", + ] diff --git a/tests/protzilla/data_analysis/test_plots_data_analysis.py b/tests/protzilla/data_analysis/test_plots_data_analysis.py index 60403907..ae2d9957 100644 --- a/tests/protzilla/data_analysis/test_plots_data_analysis.py +++ b/tests/protzilla/data_analysis/test_plots_data_analysis.py @@ -101,14 +101,20 @@ def test_scatter_plot_4d_df(wide_4d_df, color_df): assert "messages" in outputs assert "plots" not in outputs - assert any("Consider reducing the dimensionality" in message["msg"] for message in outputs["messages"]) + assert any( + "Consider reducing the dimensionality" in message["msg"] + for message in outputs["messages"] + ) def test_scatter_plot_color_df_2d(show_figures, wide_2d_df): outputs = scatter_plot(wide_2d_df, wide_2d_df) assert "messages" in outputs assert "plots" not in outputs - assert any("The color dataframe should have 1 dimension only" in message["msg"] for message in outputs["messages"]) + assert any( + "The color dataframe should have 1 dimension only" in message["msg"] + for message in outputs["messages"] + ) def test_clustergram(show_figures, wide_4d_df, color_df): @@ -151,8 +157,10 @@ def test_clustergram_input_not_right_type(wide_4d_df): assert "messages" in outputs2 assert "plots" not in outputs2 assert any( - 'The selected input for "grouping dataframe" is not a dataframe, ' in message["msg"] - for message in outputs2["messages"]) + 'The selected input for "grouping dataframe" is not a dataframe, ' + in message["msg"] + for message in outputs2["messages"] + ) def test_clustergram_dimension_mismatch(wide_4d_df): @@ -176,7 +184,10 @@ def test_clustergram_dimension_mismatch(wide_4d_df): ) assert "messages" in outputs assert "plots" not in outputs - assert any("There is a dimension mismatch" in message["msg"] for message in outputs["messages"]) + assert any( + "There is a dimension mismatch" in message["msg"] + for message in outputs["messages"] + ) def test_clustergram_different_samples(wide_4d_df): @@ -200,6 +211,7 @@ def test_clustergram_different_samples(wide_4d_df): assert "messages" in outputs assert "plots" not in outputs assert any( - "The input dataframe and the grouping contain different samples" in message["msg"] + "The input dataframe and the grouping contain different samples" + in message["msg"] for message in outputs["messages"] - ) \ No newline at end of file + ) diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index ceded6e3..5b2f92ed 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -2,8 +2,13 @@ import pandas as pd import pytest - -from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log +from protzilla.data_analysis.power_analysis import ( + check_sample_size_calculation_implemented, + check_sample_size_calculation_implemented_without_log, + check_sample_size_calculation_with_libfunc, + power_calculation, + sample_size_calculation, +) @pytest.fixture @@ -42,9 +47,7 @@ def power_test_data(): return test_differentially_expressed_proteins_df -def test_variance_protein_group_calculation( - power_test_data -): +def test_variance_protein_group_calculation(power_test_data): intensity_df = power_test_data protein_id = "Protein1" @@ -57,37 +60,31 @@ def test_variance_protein_group_calculation( print(variance) assert variance == 4.0 -def test_sample_size_calculation( - power_test_data -): +def test_sample_size_calculation(power_test_data): test_alpha = 0.05 test_power = 0.8 test_fc_threshold = 1 test_selected_protein_group = "Protein1" - required_sample_size = sample_size_calculation( differentially_expressed_proteins_df=power_test_data, significant_proteins_df=power_test_data, fc_threshold=test_fc_threshold, power=test_power, alpha=test_alpha, - group1= "Group1", - group2= "Group2", + group1="Group1", + group2="Group2", selected_protein_group=test_selected_protein_group, significant_proteins_only=False, - intensity_name=None + intensity_name=None, ) print(required_sample_size) - required_sample_size_int = next(iter(required_sample_size.values()),None) + required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 -def test_check_sample_size_calculation_with_libfun( - power_test_data - -): +def test_check_sample_size_calculation_with_libfun(power_test_data): test_alpha = 0.05 test_power = 0.8 test_fc_threshold = 5 @@ -103,20 +100,20 @@ def test_check_sample_size_calculation_with_libfun( group2="Group2", selected_protein_group=test_selected_protein_group, significant_proteins_only=False, - intensity_name=None + intensity_name=None, ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 -def test_check_sample_size_calculation_impl( - power_test_data -): +def test_check_sample_size_calculation_impl(power_test_data): test_alpha = 0.05 test_power = 0.8 power_test_data_log2 = power_test_data.copy() - power_test_data_log2["Normalised iBAQ"] = np.log2(power_test_data_log2["Normalised iBAQ"]) + power_test_data_log2["Normalised iBAQ"] = np.log2( + power_test_data_log2["Normalised iBAQ"] + ) fc_threshold = 1 test_selected_protein_group = "Protein1" @@ -130,16 +127,14 @@ def test_check_sample_size_calculation_impl( group2="Group2", selected_protein_group=test_selected_protein_group, significant_proteins_only=False, - intensity_name=None + intensity_name=None, ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 1 -def test_check_sample_size_calculation_implemented_without_log( - power_test_data -): +def test_check_sample_size_calculation_implemented_without_log(power_test_data): test_alpha = 0.05 test_power = 0.8 test_fc_threshold = 5 @@ -155,22 +150,18 @@ def test_check_sample_size_calculation_implemented_without_log( group2="Group2", selected_protein_group=test_selected_protein_group, significant_proteins_only=False, - intensity_name=None + intensity_name=None, ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 - -def test_power_calculation( - power_test_data -): +def test_power_calculation(power_test_data): test_alpha = 0.05 test_fc_threshold = 1 test_selected_protein_group = "Protein1" - power = power_calculation( differentially_expressed_proteins_df=power_test_data, significant_proteins_df=power_test_data, @@ -180,8 +171,8 @@ def test_power_calculation( group2="Group2", selected_protein_group=test_selected_protein_group, significant_proteins_only=False, - intensity_name=None + intensity_name=None, ) print(power) power_int = next(iter(power.values()), None) - assert power_int== 0.09 + assert power_int == 0.09 diff --git a/tests/protzilla/data_integration/test_plots_data_integration.py b/tests/protzilla/data_integration/test_plots_data_integration.py index 044c7c5a..39c801f5 100644 --- a/tests/protzilla/data_integration/test_plots_data_integration.py +++ b/tests/protzilla/data_integration/test_plots_data_integration.py @@ -68,7 +68,10 @@ def test_enrichment_bar_plot_wrong_value(data_folder_tests): gene_sets=["Reactome_2013"], ) assert "messages" in current_out - assert any(("FDR is not available" in message["msg"]) for message in current_out["messages"]) + assert any( + ("FDR is not available" in message["msg"]) + for message in current_out["messages"] + ) def test_enrichment_bar_plot_empty_df(): @@ -81,7 +84,9 @@ def test_enrichment_bar_plot_empty_df(): gene_sets=["Reactome_2013"], ) assert "messages" in current_out - assert any(("No data to plot" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot" in message["msg"]) for message in current_out["messages"] + ) def test_enrichment_bar_plot_no_category(data_folder_tests): @@ -92,7 +97,10 @@ def test_enrichment_bar_plot_no_category(data_folder_tests): input_df=enrichment_df, top_terms=10, cutoff=0.05, value="p_value", gene_sets=[] ) assert "messages" in current_out - assert any(("Please select at least one category" in message["msg"]) for message in current_out["messages"]) + assert any( + ("Please select at least one category" in message["msg"]) + for message in current_out["messages"] + ) def test_enrichment_bar_plot_wrong_df(): @@ -105,7 +113,10 @@ def test_enrichment_bar_plot_wrong_df(): gene_sets=["KEGG"], ) assert "messages" in current_out - assert any(("Please choose an enrichment result dataframe" in message["msg"]) for message in current_out["messages"]) + assert any( + ("Please choose an enrichment result dataframe" in message["msg"]) + for message in current_out["messages"] + ) def test_enrichment_bar_plot_cutoff(data_folder_tests): @@ -119,7 +130,10 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests): ) assert "messages" in current_out - assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot when applying cutoff" in message["msg"]) + for message in current_out["messages"] + ) enrichment_df = pd.read_csv( data_folder_tests / "Reactome_enrichment_enrichr.csv", sep="\t" @@ -132,7 +146,10 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests): gene_sets=["Reactome_2013"], ) assert "messages" in current_out - assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"]) + assert any( + ("No data to plot when applying cutoff" in message["msg"]) + for message in current_out["messages"] + ) @pytest.mark.parametrize("x_axis_type", ["Gene Sets", "Combined Score"]) diff --git a/tests/protzilla/data_preprocessing/test_normalisation.py b/tests/protzilla/data_preprocessing/test_normalisation.py index 8e5c2a45..cd2acdcc 100644 --- a/tests/protzilla/data_preprocessing/test_normalisation.py +++ b/tests/protzilla/data_preprocessing/test_normalisation.py @@ -349,7 +349,9 @@ def test_totalsum_normalisation( method_inputs = {"protein_df": normalisation_df} method_outputs = by_totalsum(**method_inputs) - fig = by_totalsum_plot(method_inputs, method_outputs, "Boxplot", "Sample", "log10")[0] + fig = by_totalsum_plot(method_inputs, method_outputs, "Boxplot", "Sample", "log10")[ + 0 + ] if show_figures: fig.show() @@ -376,9 +378,9 @@ def test_ref_protein_normalisation( } method_outputs = by_reference_protein(**method_input) - fig = by_reference_protein_plot(method_input, method_outputs, "Boxplot", "Sample", "log10")[ - 0 - ] + fig = by_reference_protein_plot( + method_input, method_outputs, "Boxplot", "Sample", "log10" + )[0] if show_figures: fig.show() diff --git a/tests/protzilla/data_preprocessing/test_outlier_detection.py b/tests/protzilla/data_preprocessing/test_outlier_detection.py index e94f84b2..f21e18e0 100644 --- a/tests/protzilla/data_preprocessing/test_outlier_detection.py +++ b/tests/protzilla/data_preprocessing/test_outlier_detection.py @@ -65,8 +65,7 @@ def outlier_detection_df_with_nan(): def test_outlier_detection_with_isolation_forest( - show_figures, outlier_detection_df, - peptides_df + show_figures, outlier_detection_df, peptides_df ): method_inputs = { "protein_df": outlier_detection_df, diff --git a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py index f3900fdd..1de769f5 100644 --- a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py +++ b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py @@ -1,5 +1,4 @@ import pandas as pd -import pytest from protzilla.constants.paths import TEST_DATA_PATH from protzilla.data_preprocessing.peptide_filter import by_pep_value, by_pep_value_plot @@ -57,4 +56,3 @@ def test_pep_filter(show_figures, leftover_peptide_df, filtered_peptides_list): pd.testing.assert_frame_equal(method_outputs["peptide_df"], leftover_peptide_df) assert method_outputs["filtered_peptides"] == filtered_peptides_list - diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py index 457fe145..e7909db8 100644 --- a/tests/protzilla/importing/test_ms_data_import.py +++ b/tests/protzilla/importing/test_ms_data_import.py @@ -218,7 +218,9 @@ def test_max_quant_import_no_protein_ids_column(): assert "protein_df" not in outputs assert "messages" in outputs assert any(message["level"] == logging.ERROR for message in outputs["messages"]) - assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"]) + assert any( + "Majority protein IDs" in message["msg"] for message in outputs["messages"] + ) def test_max_quant_import_invalid_data(): @@ -310,9 +312,7 @@ def test_transform_and_clean(): ["C", "Q11111", np.nan], ] df = pd.DataFrame(data, columns=columns) - outputs = ms_data_import.transform_and_clean( - df, "intensity", map_to_uniprot=False - ) + outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False) expected_df = pd.DataFrame(expected_output, columns=out_col) # we do not care about the genes column, it is deprecated (and replaced by nan) diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py index b5de3148..0d251eda 100644 --- a/tests/protzilla/test_runner.py +++ b/tests/protzilla/test_runner.py @@ -12,8 +12,8 @@ sys.path.append(f"{PROJECT_PATH}") from protzilla.runner import Runner, _serialize_graphs -from runner_cli import args_parser from protzilla.steps import Output, Plots +from runner_cli import args_parser @pytest.fixture @@ -43,7 +43,8 @@ def mock_current_parameters(*args, **kwargs): # side effect to mark the step as finished runner.run.current_step.output = Output( - {key: "mock_output_value" for key in runner.run.current_step.output_keys}) + {key: "mock_output_value" for key in runner.run.current_step.output_keys} + ) if len(runner.run.current_step.output_keys) == 0: runner.run.current_step.plots = Plots(["mock_plot"]) @@ -88,34 +89,61 @@ def test_runner_imports( runner.compute_workflow() expected_methods = [ - 'MaxQuantImport', - 'MetadataImport', - 'FilterProteinsBySamplesMissing', - 'FilterSamplesByProteinIntensitiesSum', - 'ImputationByKNN', - 'OutlierDetectionByLocalOutlierFactor', - 'NormalisationByMedian', - 'TransformationLog', - 'PlotProtQuant', - 'DifferentialExpressionTTest', - 'PlotVolcano', - 'EnrichmentAnalysisGOAnalysisWithString', - 'PlotGOEnrichmentBarPlot' + "MaxQuantImport", + "MetadataImport", + "FilterProteinsBySamplesMissing", + "FilterSamplesByProteinIntensitiesSum", + "ImputationByKNN", + "OutlierDetectionByLocalOutlierFactor", + "NormalisationByMedian", + "TransformationLog", + "PlotProtQuant", + "DifferentialExpressionTTest", + "PlotVolcano", + "EnrichmentAnalysisGOAnalysisWithString", + "PlotGOEnrichmentBarPlot", ] expected_method_parameters = [ - call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_mode': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}), - call({'feature_orientation': 'Columns (samples in rows, features in columns)', 'file_path': 'tests/metadata_cut_columns.csv'}), - call({'percentage': 0.5}), - call({'deviation_threshold': 2.0}), - call({'number_of_neighbours': 5}), - call({'number_of_neighbors': 20}), - call({'percentile': 0.5}), - call({'log_base': 'log2'}), - call({'similarity_measure': 'euclidean distance'}), - call({'alpha': 0.05}), - call({'fc_threshold': 1}), - call({'differential_expression_threshold': 1, 'direction': 'both', 'gene_sets_restring': [], 'organism': 9606}), - call({'colors': [], 'cutoff': 0.05, 'gene_sets': ['Process', 'Component', 'Function', 'KEGG'], 'top_terms': 10, 'value': 'p-value'}) + call( + { + "intensity_name": "iBAQ", + "map_to_uniprot": False, + "aggregation_mode": "Sum", + "file_path": "tests/proteinGroups_small_cut.txt", + } + ), + call( + { + "feature_orientation": "Columns (samples in rows, features in columns)", + "file_path": "tests/metadata_cut_columns.csv", + } + ), + call({"percentage": 0.5}), + call({"deviation_threshold": 2.0}), + call({"number_of_neighbours": 5}), + call({"number_of_neighbors": 20}), + call({"percentile": 0.5}), + call({"log_base": "log2"}), + call({"similarity_measure": "euclidean distance"}), + call({"alpha": 0.05}), + call({"fc_threshold": 1}), + call( + { + "differential_expression_threshold": 1, + "direction": "both", + "gene_sets_restring": [], + "organism": 9606, + } + ), + call( + { + "colors": [], + "cutoff": 0.05, + "gene_sets": ["Process", "Component", "Function", "KEGG"], + "top_terms": 10, + "value": "p-value", + } + ), ] assert mock_method.call_count == 13 @@ -168,10 +196,21 @@ def test_runner_calculates(monkeypatch, tests_folder_name, ms_data_path, metadat "FilterProteinsBySamplesMissing", ] assert mock_method.call_args_list == [ - call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_method': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}), - call({'feature_orientation': 'Columns (samples in rows, features in columns)', - 'file_path': 'tests/metadata_cut_columns.csv'}), - call({'percentage': 0.5}) + call( + { + "intensity_name": "iBAQ", + "map_to_uniprot": False, + "aggregation_method": "Sum", + "file_path": "tests/proteinGroups_small_cut.txt", + } + ), + call( + { + "feature_orientation": "Columns (samples in rows, features in columns)", + "file_path": "tests/metadata_cut_columns.csv", + } + ), + call({"percentage": 0.5}), ] mock_plot.assert_not_called() @@ -251,7 +290,9 @@ def test_serialize_workflow_graphs(): assert _serialize_graphs(step["graphs"]) == serial_filter_graphs -def test_integration_runner(metadata_path, ms_data_path, tests_folder_name, monkeypatch): +def test_integration_runner( + metadata_path, ms_data_path, tests_folder_name, monkeypatch +): name = tests_folder_name + "/test_runner_integration_" + random_string() runner = Runner( **{ diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 74c5a149..7811c6b9 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1,24 +1,26 @@ -import logging from enum import Enum, StrEnum -from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.methods.data_analysis import ( + DataAnalysisStep, DifferentialExpressionLinearModel, DifferentialExpressionTTest, - DimensionReductionUMAP, DataAnalysisStep, SelectPeptidesForProtein, PTMsPerSample, + DimensionReductionUMAP, + PTMsPerSample, + SelectPeptidesForProtein, ) +from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.run import Run from protzilla.steps import Step from . import fill_helper from .base import MethodForm from .custom_fields import ( + CustomBooleanField, CustomCharField, CustomChoiceField, CustomFloatField, CustomMultipleChoiceField, CustomNumberField, - CustomBooleanField, ) @@ -143,6 +145,7 @@ class DimensionReductionMetric(Enum): cosine = "cosine" havensine = "havensine" + class DifferentialExpressionANOVAForm(MethodForm): is_dynamic = True @@ -295,9 +298,13 @@ class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm): group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields[ + "intensity_df" + ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -325,9 +332,7 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm): is_dynamic = True - ptm_df = CustomChoiceField( - choices=[], label="Step to use ptm data from" - ) + ptm_df = CustomChoiceField(choices=[], label="Step to use ptm data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", @@ -345,7 +350,9 @@ def fill_form(self, run: Run) -> None: run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -719,7 +726,7 @@ class ClassificationRandomForestForm(MethodForm): # TODO: Workflow_meta line 1763 train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1770 @@ -807,7 +814,7 @@ class ClassificationSVMForm(MethodForm): ) train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1973 @@ -924,7 +931,7 @@ class DimensionReductionUMAPForm(MethodForm): ) n_neighbors = CustomNumberField( label="The size of local neighborhood (in terms of number of neighboring sample points) used for manifold " - "approximation", + "approximation", min_value=2, max_value=100, step_size=1, @@ -965,7 +972,7 @@ class ProteinGraphPeptidesToIsoformForm(MethodForm): k = CustomNumberField(label="k-mer length", min_value=1, step_size=1, initial=5) allowed_mismatches = CustomNumberField( label="Number of allowed mismatched amino acids per peptide. For many allowed mismatches, this can take a " - "long time.", + "long time.", min_value=0, step_size=1, initial=2, @@ -1016,29 +1023,33 @@ def fill_form(self, run: Run) -> None: selected_auto_select = self.data.get("auto_select") - choices = fill_helper.to_choices([] if selected_auto_select else ["all proteins"]) - choices.extend(fill_helper.get_choices( - run, "significant_proteins_df", DataAnalysisStep - )) + choices = fill_helper.to_choices( + [] if selected_auto_select else ["all proteins"] + ) + choices.extend( + fill_helper.get_choices(run, "significant_proteins_df", DataAnalysisStep) + ) self.fields["protein_list"].choices = choices - chosen_list = self.data.get("protein_list", self.fields["protein_list"].choices[0][0]) + chosen_list = self.data.get( + "protein_list", self.fields["protein_list"].choices[0][0] + ) if not selected_auto_select: self.toggle_visibility("sort_proteins", True) self.toggle_visibility("protein_ids", True) if chosen_list == "all proteins": self.fields["protein_ids"].choices = fill_helper.to_choices( - run.steps.get_step_output( - Step, "protein_df" - )["Protein ID"].unique() + run.steps.get_step_output(Step, "protein_df")["Protein ID"].unique() ) else: if self.data.get("sort_proteins"): self.fields["protein_ids"].choices = fill_helper.to_choices( run.steps.get_step_output( DataAnalysisStep, "significant_proteins_df", chosen_list - ).sort_values(by="corrected_p_value")["Protein ID"].unique() + ) + .sort_values(by="corrected_p_value")["Protein ID"] + .unique() ) else: self.fields["protein_ids"].choices = fill_helper.to_choices( @@ -1061,12 +1072,12 @@ def fill_form(self, run: Run) -> None: single_protein_peptides = run.steps.get_instance_identifiers( SelectPeptidesForProtein, "peptide_df" ) - self.fields["peptide_df"].choices = fill_helper.to_choices(single_protein_peptides) - - self.fields["peptide_df"].choices = fill_helper.get_choices( - run, "peptide_df" + self.fields["peptide_df"].choices = fill_helper.to_choices( + single_protein_peptides ) + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") + single_protein_peptides = run.steps.get_instance_identifiers( SelectPeptidesForProtein, "peptide_df" ) @@ -1081,15 +1092,15 @@ class PTMsPerProteinAndSampleForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["peptide_df"].choices = fill_helper.get_choices( - run, "peptide_df" - ) + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") single_protein_peptides = run.steps.get_instance_identifiers( SelectPeptidesForProtein, "peptide_df" ) if single_protein_peptides: self.fields["peptide_df"].initial = single_protein_peptides[0] + + class PowerAnalysisPowerCalculationForm(MethodForm): is_dynamic = True @@ -1099,10 +1110,10 @@ class PowerAnalysisPowerCalculationForm(MethodForm): ) alpha = CustomFloatField( label="Error rate (alpha)", - min_value = 0, - max_value = 1, - step_size = 0.05, - initial = 0.05, + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, ) fc_threshold = CustomFloatField( label="Log2 fold change threshold", min_value=0, initial=1 @@ -1136,7 +1147,8 @@ def fill_form(self, run: Run) -> None: ) significant_proteins_only = self.data.get( - "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0] + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], ) if significant_proteins_only == YesNo.yes: @@ -1166,17 +1178,17 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): ) alpha = CustomFloatField( label="Error rate (alpha)", - min_value = 0, - max_value = 1, - step_size = 0.05, - initial = 0.05, + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, ) power = CustomFloatField( label="Power", - min_value = 0, - max_value = 1, - step_size = 0.05, - initial = 0.8, + min_value=0, + max_value=1, + step_size=0.05, + initial=0.8, ) fc_threshold = CustomFloatField( label="Log2 fold change threshold", min_value=0, initial=1 @@ -1191,7 +1203,6 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): label="Protein group to calculate sample size for", ) - def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( @@ -1211,7 +1222,8 @@ def fill_form(self, run: Run) -> None: ) significant_proteins_only = self.data.get( - "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0] + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], ) if significant_proteins_only == YesNo.yes: diff --git a/ui/runs/views.py b/ui/runs/views.py index 4de29692..7d2254ed 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -18,8 +18,8 @@ from django.shortcuts import render from django.urls import reverse -from protzilla.run_helper import log_messages from protzilla.run import Run, get_available_run_names +from protzilla.run_helper import log_messages from protzilla.stepfactory import StepFactory from protzilla.steps import Step from protzilla.utilities.utilities import ( @@ -125,7 +125,9 @@ def detail(request: HttpRequest, run_name: str): run.steps.current_step.display_output is not None and not run.current_step.display_output.is_empty() ) - display_output_text = next(iter(run.current_step.display_output.display_output.values()), None) + display_output_text = next( + iter(run.current_step.display_output.display_output.values()), None + ) return render( request, diff --git a/user_data/workflows/overhaul.yaml:Zone.Identifier b/user_data/workflows/overhaul.yaml:Zone.Identifier new file mode 100644 index 00000000..71c6e851 --- /dev/null +++ b/user_data/workflows/overhaul.yaml:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=https://files.slack.com/files-pri/T055BG3H51R-F06U5LX84NS/download/overhaul.yaml?origin_team=E055BG3H51R From cb25777ce725e0ad3bbc7e8990482b0ff431d5a3 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 28 Aug 2024 19:56:38 +0200 Subject: [PATCH 18/36] feature: user can choose whether metadata contains a column for individuals. If so, the mean values per individual are used to calculate the power and sample size. --- protzilla/data_analysis/power_analysis.py | 74 +++++++++++++++++++---- protzilla/methods/data_analysis.py | 14 ++--- ui/runs/forms/data_analysis.py | 20 +++++- ui/runs/forms/fill_helper.py | 4 ++ 4 files changed, 91 insertions(+), 21 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 4a303a18..7989ce7b 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -45,12 +45,14 @@ def variance_protein_group_calculation_max( def sample_size_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, + metadata_df: pd.DataFrame, fc_threshold: float, alpha: float, power: float, group1: str, group2: str, selected_protein_group: str, + individual_column: str, intensity_name: str = None, ) -> dict: """ @@ -58,7 +60,6 @@ def sample_size_calculation( :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. - :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. :param fc_threshold: The fold change threshold. :param alpha: The significance level. The value for alpha is taken from the t-test by default. :param power: The power of the test. @@ -79,8 +80,32 @@ def sample_size_calculation( z_alpha = stats.norm.ppf(1 - alpha / 2) z_beta = stats.norm.ppf(power) + intensity_name = default_intensity_column( + differentially_expressed_proteins_df, intensity_name + ) + filtered_protein_group_df = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == protein_group + ] + + if individual_column != "None" and individual_column in metadata_df.columns: + # filtered_protein_group_df["Individual"] = filtered_protein_group_df["Sample"].apply(lambda x: x[:4]) + filtered_protein_group_merged_df = pd.merge( + filtered_protein_group_df, + metadata_df[["Sample", individual_column]], + on="Sample", + ) + # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample") + + filtered_protein_group_df = ( + filtered_protein_group_merged_df.groupby( + ["Protein ID", "Group", individual_column] + )[intensity_name] + .mean() + .reset_index() + ) + variance_protein_group = variance_protein_group_calculation_max( - intensity_df=differentially_expressed_proteins_df, + intensity_df=filtered_protein_group_df, protein_id=protein_group, group1=group1, group2=group2, @@ -89,7 +114,7 @@ def sample_size_calculation( required_sample_size = ( 2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group - ) + ) # Equation (1) in Cairns, David A., et al., 2008, Sample size determination in clinical proteomic profiling experiments using mass spectrometry for class comparison required_sample_size = math.ceil(required_sample_size) print(required_sample_size) @@ -99,11 +124,13 @@ def sample_size_calculation( def power_calculation( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, + metadata_df: pd.DataFrame, alpha: float, fc_threshold: float, group1: str, group2: str, selected_protein_group: str, + individual_column: str, intensity_name: str = None, ) -> dict: """ @@ -128,8 +155,33 @@ def power_calculation( protein_group = selected_protein_group z_alpha = stats.norm.ppf(1 - alpha / 2) + intensity_name = default_intensity_column( + differentially_expressed_proteins_df, intensity_name + ) + filtered_protein_group_df = differentially_expressed_proteins_df[ + differentially_expressed_proteins_df["Protein ID"] == protein_group + ] + if individual_column != "None" and individual_column in metadata_df.columns: + filtered_protein_group_merged_df = pd.merge( + filtered_protein_group_df, + metadata_df[["Sample", individual_column]], + on="Sample", + ) + # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample") + + filtered_protein_group_df = ( + filtered_protein_group_merged_df.groupby( + ["Protein ID", "Group", individual_column] + )[intensity_name] + .mean() + .reset_index() + ) + filtered_protein_group_df = filtered_protein_group_df.rename( + columns={individual_column: "Sample"} + ) + variance_protein_group = variance_protein_group_calculation_max( - intensity_df=differentially_expressed_proteins_df, + intensity_df=filtered_protein_group_df, protein_id=protein_group, group1=group1, group2=group2, @@ -146,12 +198,12 @@ def power_calculation( filtered_df["Measurement"] = filtered_df["Sample"].apply( lambda x: int(x[-2:])) """ - filtered_protein_df = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["Protein ID"] == protein_group - ] - grouped_df = filtered_protein_df.groupby(["Group", "Protein ID"])["Sample"].count() - sample_size_group1 = grouped_df[group1][0] - sample_size_group2 = grouped_df[group2][0] + + group_count_df = filtered_protein_group_df.groupby(["Group", "Protein ID"])[ + "Sample" + ].count() + sample_size_group1 = group_count_df[group1][0] + sample_size_group2 = group_count_df[group2][0] sample_size = (2 * sample_size_group1 * sample_size_group2) / ( sample_size_group1 + sample_size_group2 ) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences @@ -160,4 +212,4 @@ def power_calculation( ) power = float(round(stats.norm.cdf(z_beta), 2)) - return dict(power=power) + return dict(power=power) \ No newline at end of file diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index c1150335..62db5c10 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -802,17 +802,16 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): "differentially_expressed_proteins_df", "selected_protein_group", "significant_proteins_df", - "significant_proteins_only", "fc_threshold", "alpha", "group1", "group2", + "individual_column", + "metadata_df", ] output_keys = ["power"] def method(self, inputs: dict) -> dict: - if "significant_proteins_only" in inputs: - del inputs["significant_proteins_only"] return power_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -825,7 +824,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["significant_proteins_df"] = steps.get_step_output( Step, "significant_proteins_df", inputs["input_dict"] ) - + inputs["metadata_df"] = steps.metadata_df inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] @@ -845,20 +844,19 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): "differentially_expressed_proteins_df", "selected_protein_group", "significant_proteins_df", - "significant_proteins_only", "fc_threshold", "alpha", "group1", "group2", "power", + "individual_column", + "metadata_df", ] output_keys = [ "required_sample_size", ] def method(self, inputs: dict) -> dict: - if "significant_proteins_only" in inputs: - del inputs["significant_proteins_only"] return sample_size_calculation(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: @@ -871,7 +869,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["significant_proteins_df"] = steps.get_step_output( Step, "significant_proteins_df", inputs["input_dict"] ) - + inputs["metadata_df"] = steps.metadata_df inputs["alpha"] = step.inputs["alpha"] inputs["group1"] = step.inputs["group1"] inputs["group2"] = step.inputs["group2"] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 7811c6b9..d7508230 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1127,6 +1127,10 @@ class PowerAnalysisPowerCalculationForm(MethodForm): choices=[], label="Protein group to calculate power for", ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( @@ -1139,7 +1143,11 @@ def fill_form(self, run: Run) -> None: input_dict_instance_id = self.data.get( "input_dict", self.fields["input_dict"].choices[0][0] ) - + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column self.fields["selected_protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( Step, "differentially_expressed_proteins_df", input_dict_instance_id @@ -1202,6 +1210,10 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm): choices=[], label="Protein group to calculate sample size for", ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( @@ -1214,7 +1226,11 @@ def fill_form(self, run: Run) -> None: input_dict_instance_id = self.data.get( "input_dict", self.fields["input_dict"].choices[0][0] ) - + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column self.fields["selected_protein_group"].choices = fill_helper.to_choices( run.steps.get_step_output( Step, "differentially_expressed_proteins_df", input_dict_instance_id diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index 0b416f8f..641c7763 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -32,3 +32,7 @@ def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str run.steps.metadata_df.columns != "Sample" ].unique() ) + + +def get_choices_for_metadata_all_columns(run: Run) -> list[tuple[str, str]]: + return to_choices(run.steps.metadata_df.columns) From 52ef105633da888ad4999368028770cace9a6080 Mon Sep 17 00:00:00 2001 From: selenabr Date: Wed, 28 Aug 2024 21:13:08 +0200 Subject: [PATCH 19/36] adapted test for power_calculation and sample_size_calculation and checked values from paper of Cairns --- .../data_analysis/test_power_analysis.py | 92 ++++++++++++++----- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index 5b2f92ed..eb563d68 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -1,14 +1,20 @@ import numpy as np import pandas as pd import pytest +import math +from scipy import stats from protzilla.data_analysis.power_analysis import ( - check_sample_size_calculation_implemented, - check_sample_size_calculation_implemented_without_log, - check_sample_size_calculation_with_libfunc, power_calculation, sample_size_calculation, + variance_protein_group_calculation_max, +) +from protzilla.data_analysis.power_analysis_validation import ( + check_sample_size_calculation_with_libfunc, + check_sample_size_calculation_implemented, + check_sample_size_calculation_implemented_without_log, ) +from test_differential_expression import diff_expr_test_data @pytest.fixture @@ -54,35 +60,61 @@ def test_variance_protein_group_calculation(power_test_data): group1 = "Group1" group2 = "Group2" - variance = variance_protein_group_calculation( + variance = variance_protein_group_calculation_max( intensity_df, protein_id, group1, group2 ) print(variance) assert variance == 4.0 -def test_sample_size_calculation(power_test_data): +def test_sample_size_calculation(power_test_data, diff_expr_test_data): test_alpha = 0.05 test_power = 0.8 test_fc_threshold = 1 test_selected_protein_group = "Protein1" + test_individual_column = "None" + test_differentially_expressed_proteins_df, test_metadata_df = diff_expr_test_data required_sample_size = sample_size_calculation( differentially_expressed_proteins_df=power_test_data, significant_proteins_df=power_test_data, + metadata_df=test_metadata_df, fc_threshold=test_fc_threshold, power=test_power, alpha=test_alpha, group1="Group1", group2="Group2", selected_protein_group=test_selected_protein_group, - significant_proteins_only=False, + individual_column=test_individual_column, intensity_name=None, ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 +def test_power_calculation(power_test_data, diff_expr_test_data): + test_alpha = 0.05 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + test_individual_column = "None" + test_differentially_expressed_proteins_df, test_metadata_df = diff_expr_test_data + + power = power_calculation( + differentially_expressed_proteins_df=power_test_data, + significant_proteins_df=power_test_data, + metadata_df=test_metadata_df, + fc_threshold=test_fc_threshold, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + individual_column=test_individual_column, + intensity_name=None, + ) + print(power) + power_int = next(iter(power.values()), None) + assert power_int == 0.09 + def test_check_sample_size_calculation_with_libfun(power_test_data): test_alpha = 0.05 @@ -156,23 +188,33 @@ def test_check_sample_size_calculation_implemented_without_log(power_test_data): required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 +def test_replicate_paper_sample_size_calculation(power_test_data): + alpha = 0.001 + power = 0.95 + fc_threshold = math.log2(2) + biological_variance = 0.233 + technical_variance = 2.298 + number_of_replicates = 2 + + z_alpha = round(stats.norm.ppf(1 - alpha / 2), 3) + z_beta = round(stats.norm.ppf(power), 3) + + required_sample_size = ( + 2 + * ((z_alpha + z_beta) / fc_threshold) ** 2 + * ((technical_variance / number_of_replicates) + biological_variance) + ) # Equation (1) in Cairns, David A., et al., 2008, Sample size determination in clinical proteomic profiling experiments using mass spectrometry for class comparison + required_sample_size = math.ceil(required_sample_size) + print(required_sample_size) -def test_power_calculation(power_test_data): - test_alpha = 0.05 - test_fc_threshold = 1 - test_selected_protein_group = "Protein1" - - power = power_calculation( - differentially_expressed_proteins_df=power_test_data, - significant_proteins_df=power_test_data, - fc_threshold=test_fc_threshold, - alpha=test_alpha, - group1="Group1", - group2="Group2", - selected_protein_group=test_selected_protein_group, - significant_proteins_only=False, - intensity_name=None, - ) - print(power) - power_int = next(iter(power.values()), None) - assert power_int == 0.09 + data = { + "Cairns": [44, 31, 62, 44, 14, 10, 19, 14, 5, 4, 7, 5], + "Calculated": [65, 52, 92, 74, 20, 16, 28, 23, 7, 6, 10, 8], + } + df = pd.DataFrame(data) + correlation = df["Cairns"].corr(df["Calculated"]) + print(correlation) + correlationmatrix = df.corr() + print(correlationmatrix) + + return dict(required_sample_size=required_sample_size) From ac9e783f39887f22a58affd6448b661309b6ea18 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 3 Sep 2024 17:28:20 +0200 Subject: [PATCH 20/36] added function that calculates sample size for all proteins and shows the distribution in a violin plot --- protzilla/data_analysis/power_analysis.py | 112 +++++++++++++++++++++- protzilla/methods/data_analysis.py | 52 +++++++++- ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 99 +++++++++++++++++++ user_data/workflows/standard.yaml | 3 + 5 files changed, 264 insertions(+), 3 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 7989ce7b..76fb032f 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd from scipy import stats +import plotly.express as px +import plotly.graph_objs as go from protzilla.utilities import default_intensity_column @@ -56,16 +58,21 @@ def sample_size_calculation( intensity_name: str = None, ) -> dict: """ - Function to calculate the required sample size for a selected protein to achieve the required power . + Function to calculate the required sample size for a selected protein to achieve the desired statistical power. + If metadata_df contains a column that identifies individuals, the function first calculates the mean intensity for + each individual (based on replicates) within the dataset. These individual means are used to determine the variance + for the sample size calculation formula. :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param metadata_df: The dataframe containing the clinical data. :param fc_threshold: The fold change threshold. :param alpha: The significance level. The value for alpha is taken from the t-test by default. :param power: The power of the test. :param group1: The name of the first group. :param group2: The name of the second group. :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. + :param individual_column: The name of the column in metadata_df containing the individual ID. :param intensity_name: The name of the column containing the protein group intensities. :return: The required sample size. """ @@ -135,14 +142,21 @@ def power_calculation( ) -> dict: """ Function to calculate the power of the t-test for a selected protein group. + If metadata_df contains a column that identifies individuals, the function first calculates the mean intensity for + each individual (based on replicates) within the dataset. These individual means are used to determine the variance + for the power calculation formula. + If both groups have different numbers of samples, the sample size for the power formula is calculated according + to the equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences. :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param metadata_df: The dataframe containing the clinical data. :param alpha: The significance level. The value for alpha is taken from the t-test by default. :param fc_threshold: The fold change threshold. :param group1: The name of the first group. :param group2: The name of the second group. :param selected_protein_group: The selected protein group for which the power is to be calculated. + :param individual_column: The name of the column in metadata_df containing the individual ID. :param intensity_name: The name of the column containing the protein group intensities. :return: The power of the test. """ @@ -212,4 +226,98 @@ def power_calculation( ) power = float(round(stats.norm.cdf(z_beta), 2)) - return dict(power=power) \ No newline at end of file + return dict(power=power) + +def sample_size_calculation_for_all_proteins( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: str, + metadata_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + individual_column: str, + select_all_proteins: bool, + selected_protein_groups: list, + intensity_name: str = None, + +) -> dict: + """ + Function to calculate the required sample size for all proteins in the dataset to achieve the required power. + Variance estimation ... + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean indicating whether only significant proteins should be considered. + :param metadata_df: The dataframe containing the clinical data. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param power: The power of the test. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param individual_column: The name of the column in metadata_df containing the individual ID. + :param select_all_proteins: A boolean indicating whether all proteins should be considered. + :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered. + :param intensity_name: The name of the column containing the protein group intensities. + """ + if select_all_proteins and significant_proteins_only == 'No': + protein_groups_for_calculation = differentially_expressed_proteins_df["Protein ID"].unique() + elif select_all_proteins and significant_proteins_only == 'Yes': + protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique() + else: + protein_groups_for_calculation = selected_protein_groups + + required_sample_sizes = [] + + for protein_group in protein_groups_for_calculation: + required_sample_size = sample_size_calculation( + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + metadata_df=metadata_df, + fc_threshold=fc_threshold, + alpha=alpha, + power=power, + group1=group1, + group2=group2, + selected_protein_group=protein_group, + individual_column=individual_column, + intensity_name=intensity_name, + )["required_sample_size"] + + required_sample_sizes.append(required_sample_size) + + required_sample_size_for_all_proteins = max(required_sample_sizes) + + violin_plot_args = dict( + meanline_visible=True, + box_visible=True, + scalemode='width', + spanmode='hard', + span=[0, required_sample_size_for_all_proteins] + ) + + fig = go.Figure() + + fig.add_trace(go.Violin( + x=['Protein group'] * len(required_sample_sizes), + y=required_sample_sizes, + line_color='red', + **violin_plot_args + )) + sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) + sample_size_dataframe["Sample Size"] = required_sample_sizes + + differentially_expressed_proteins_df = pd.merge( + differentially_expressed_proteins_df, + sample_size_dataframe, + on="Protein ID", + ) + #merge["Sample Size"] = required_sample_sizes + + return dict(required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, + plots=[fig], + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + sample_size_dataframe=sample_size_dataframe, + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 62db5c10..245b885f 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -24,6 +24,7 @@ from protzilla.data_analysis.power_analysis import ( power_calculation, sample_size_calculation, + sample_size_calculation_for_all_proteins, ) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( @@ -801,7 +802,6 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): "significant_proteins_df", "differentially_expressed_proteins_df", "selected_protein_group", - "significant_proteins_df", "fc_threshold", "alpha", "group1", @@ -880,3 +880,53 @@ def handle_outputs(self, outputs: dict): self.display_output[ "required_sample_size" ] = f"Required Sample Size: {outputs['required_sample_size']}" + +class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): + display_name = "Sample Size Calculation for all Proteins" + operation = "Power Analysis" + method_description = "Calculates sample size for all proteins" + + input_keys = [ + "differentially_expressed_proteins_df", + "significant_proteins_df", + "significant_proteins_only", + "fc_threshold", + "alpha", + "group1", + "group2", + "power", + "individual_column", + "metadata_df", + "select_all_proteins", + "selected_protein_groups", + ] + output_keys = [ + "required_sample_size_for_all_proteins", + "differentially_expressed_proteins_df", + "sample_size_dataframe" + ] + + def method(self, inputs: dict) -> dict: + return sample_size_calculation_for_all_proteins(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_outputs(self, outputs: dict): + super().handle_outputs(outputs) + self.display_output[ + "required_sample_size_for_all_proteins" + ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" \ No newline at end of file diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 0fec7200..90cf6d43 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -63,6 +63,7 @@ data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm, data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm, data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm, + data_analysis.PowerAnalysisSampleSizeCalculationForAllProteins: data_analysis_forms.PowerAnalysisSampleSizeCalculationForAllProteinsForm, data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index d7508230..f37e8e46 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1258,3 +1258,102 @@ def fill_form(self, run: Run) -> None: self.fields["alpha"].initial = run.steps.get_step_output( Step, "corrected_alpha", input_dict_instance_id ) + +class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + power = CustomFloatField( + label="Power", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.8, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + select_all_proteins = CustomBooleanField( + label="Select all proteins", + initial=True, + ) + selected_protein_groups = CustomMultipleChoiceField( + choices=[], + label="Protein groups to calculate sample size for", + ) + + #def __init__(self, *args, **kwargs): + # super().__init__(*args, **kwargs) + # select_all_proteins = self.data.get("select_all_proteins", True) + # if select_all_proteins == False: + # self.toggle_visibility("selected_protein_groups", True) + # else: + # self.toggle_visibility("selected_protein_groups", False)""" + + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + + significant_proteins_only = self.data.get( + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + if not self.data: + select_all_proteins = True + else: + if "select_all_proteins" in self.data: + select_all_proteins = True + else: + select_all_proteins = False + + if select_all_proteins == False: + self.toggle_visibility("selected_protein_groups", True) + else: + self.toggle_visibility("selected_protein_groups", False) \ No newline at end of file diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index 970a2a2f..abd07c9b 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -64,6 +64,9 @@ steps: - form_inputs: { } inputs: { } type: PowerAnalysisPowerCalculation + - form_inputs: {} + inputs: { } + type: PowerAnalysisSampleSizeCalculationForAllProteins - form_inputs: fc_threshold: 1 inputs: { } From e54c767c784763c32ba26a2225bb8fbea56e2d37 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 3 Sep 2024 17:29:21 +0200 Subject: [PATCH 21/36] formatting --- protzilla/data_analysis/power_analysis.py | 69 ++++++++++++----------- protzilla/methods/data_analysis.py | 5 +- ui/runs/forms/data_analysis.py | 15 ++--- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 76fb032f..f207dc63 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -228,21 +228,21 @@ def power_calculation( return dict(power=power) -def sample_size_calculation_for_all_proteins( - differentially_expressed_proteins_df: pd.DataFrame, - significant_proteins_df: pd.DataFrame, - significant_proteins_only: str, - metadata_df: pd.DataFrame, - fc_threshold: float, - alpha: float, - power: float, - group1: str, - group2: str, - individual_column: str, - select_all_proteins: bool, - selected_protein_groups: list, - intensity_name: str = None, +def sample_size_calculation_for_all_proteins( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: str, + metadata_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + power: float, + group1: str, + group2: str, + individual_column: str, + select_all_proteins: bool, + selected_protein_groups: list, + intensity_name: str = None, ) -> dict: """ Function to calculate the required sample size for all proteins in the dataset to achieve the required power. @@ -262,9 +262,11 @@ def sample_size_calculation_for_all_proteins( :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered. :param intensity_name: The name of the column containing the protein group intensities. """ - if select_all_proteins and significant_proteins_only == 'No': - protein_groups_for_calculation = differentially_expressed_proteins_df["Protein ID"].unique() - elif select_all_proteins and significant_proteins_only == 'Yes': + if select_all_proteins and significant_proteins_only == "No": + protein_groups_for_calculation = differentially_expressed_proteins_df[ + "Protein ID" + ].unique() + elif select_all_proteins and significant_proteins_only == "Yes": protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique() else: protein_groups_for_calculation = selected_protein_groups @@ -293,19 +295,21 @@ def sample_size_calculation_for_all_proteins( violin_plot_args = dict( meanline_visible=True, box_visible=True, - scalemode='width', - spanmode='hard', - span=[0, required_sample_size_for_all_proteins] + scalemode="width", + spanmode="hard", + span=[0, required_sample_size_for_all_proteins], ) fig = go.Figure() - fig.add_trace(go.Violin( - x=['Protein group'] * len(required_sample_sizes), - y=required_sample_sizes, - line_color='red', - **violin_plot_args - )) + fig.add_trace( + go.Violin( + x=["Protein group"] * len(required_sample_sizes), + y=required_sample_sizes, + line_color="red", + **violin_plot_args + ) + ) sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) sample_size_dataframe["Sample Size"] = required_sample_sizes @@ -314,10 +318,11 @@ def sample_size_calculation_for_all_proteins( sample_size_dataframe, on="Protein ID", ) - #merge["Sample Size"] = required_sample_sizes + # merge["Sample Size"] = required_sample_sizes - return dict(required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, - plots=[fig], - differentially_expressed_proteins_df=differentially_expressed_proteins_df, - sample_size_dataframe=sample_size_dataframe, - ) + return dict( + required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, + plots=[fig], + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + sample_size_dataframe=sample_size_dataframe, + ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 245b885f..229ad415 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -881,6 +881,7 @@ def handle_outputs(self, outputs: dict): "required_sample_size" ] = f"Required Sample Size: {outputs['required_sample_size']}" + class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): display_name = "Sample Size Calculation for all Proteins" operation = "Power Analysis" @@ -903,7 +904,7 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): output_keys = [ "required_sample_size_for_all_proteins", "differentially_expressed_proteins_df", - "sample_size_dataframe" + "sample_size_dataframe", ] def method(self, inputs: dict) -> dict: @@ -929,4 +930,4 @@ def handle_outputs(self, outputs: dict): super().handle_outputs(outputs) self.display_output[ "required_sample_size_for_all_proteins" - ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" \ No newline at end of file + ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index f37e8e46..3fe7b399 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1259,6 +1259,7 @@ def fill_form(self, run: Run) -> None: Step, "corrected_alpha", input_dict_instance_id ) + class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm): is_dynamic = True @@ -1301,13 +1302,13 @@ class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm): label="Protein groups to calculate sample size for", ) - #def __init__(self, *args, **kwargs): + # def __init__(self, *args, **kwargs): # super().__init__(*args, **kwargs) - # select_all_proteins = self.data.get("select_all_proteins", True) - # if select_all_proteins == False: - # self.toggle_visibility("selected_protein_groups", True) - # else: - # self.toggle_visibility("selected_protein_groups", False)""" + # select_all_proteins = self.data.get("select_all_proteins", True) + # if select_all_proteins == False: + # self.toggle_visibility("selected_protein_groups", True) + # else: + # self.toggle_visibility("selected_protein_groups", False)""" def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( @@ -1356,4 +1357,4 @@ def fill_form(self, run: Run) -> None: if select_all_proteins == False: self.toggle_visibility("selected_protein_groups", True) else: - self.toggle_visibility("selected_protein_groups", False) \ No newline at end of file + self.toggle_visibility("selected_protein_groups", False) From 2faa9726a89888d1dd50a954cc462ef0a6cb5683 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 3 Sep 2024 18:37:23 +0200 Subject: [PATCH 22/36] commented the dataframe-output-stuff out, otherwise violin plot couldn't be displayed anymore (WIP...) --- protzilla/data_analysis/power_analysis.py | 18 ++++++++++-------- protzilla/methods/data_analysis.py | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index f207dc63..bd94996d 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -261,6 +261,8 @@ def sample_size_calculation_for_all_proteins( :param select_all_proteins: A boolean indicating whether all proteins should be considered. :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered. :param intensity_name: The name of the column containing the protein group intensities. + + :return: """ if select_all_proteins and significant_proteins_only == "No": protein_groups_for_calculation = differentially_expressed_proteins_df[ @@ -290,13 +292,13 @@ def sample_size_calculation_for_all_proteins( required_sample_sizes.append(required_sample_size) - required_sample_size_for_all_proteins = max(required_sample_sizes) + required_sample_size_for_all_proteins = max(required_sample_sizes) violin_plot_args = dict( meanline_visible=True, box_visible=True, - scalemode="width", - spanmode="hard", + scalemode='width', + spanmode='hard', span=[0, required_sample_size_for_all_proteins], ) @@ -310,19 +312,19 @@ def sample_size_calculation_for_all_proteins( **violin_plot_args ) ) - sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) + """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) sample_size_dataframe["Sample Size"] = required_sample_sizes differentially_expressed_proteins_df = pd.merge( differentially_expressed_proteins_df, sample_size_dataframe, on="Protein ID", - ) - # merge["Sample Size"] = required_sample_sizes + )""" + return dict( required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, plots=[fig], - differentially_expressed_proteins_df=differentially_expressed_proteins_df, - sample_size_dataframe=sample_size_dataframe, + #differentially_expressed_proteins_df=differentially_expressed_proteins_df, + #sample_size_dataframe=sample_size_dataframe, ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 229ad415..ed53f223 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -903,8 +903,8 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): ] output_keys = [ "required_sample_size_for_all_proteins", - "differentially_expressed_proteins_df", - "sample_size_dataframe", + #"differentially_expressed_proteins_df", + #"sample_size_dataframe", ] def method(self, inputs: dict) -> dict: From 25cf2b2b000271c9bde193ce57c979cd63ea3df8 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 3 Sep 2024 19:59:23 +0200 Subject: [PATCH 23/36] changed color of violinplot and added axis-description --- protzilla/data_analysis/power_analysis.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index bd94996d..ec013962 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -5,7 +5,9 @@ from scipy import stats import plotly.express as px import plotly.graph_objs as go +import protzilla.constants.colors as colorscheme +from ..constants.colors import PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE from protzilla.utilities import default_intensity_column @@ -294,24 +296,33 @@ def sample_size_calculation_for_all_proteins( required_sample_size_for_all_proteins = max(required_sample_sizes) + colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + violin_plot_args = dict( meanline_visible=True, box_visible=True, scalemode='width', spanmode='hard', span=[0, required_sample_size_for_all_proteins], + fillcolor='rgba(0,0,0,0)' ) fig = go.Figure() fig.add_trace( go.Violin( - x=["Protein group"] * len(required_sample_sizes), + x=["Protein Groups"] * len(required_sample_sizes), y=required_sample_sizes, - line_color="red", + line_color=colors[1], **violin_plot_args ) ) + fig.update_layout( + title="Distribution of Required Sample Sizes for All Proteins", + xaxis_title="Protein Groups", + yaxis_title="Required Sample Size", + showlegend=False, + ) """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) sample_size_dataframe["Sample Size"] = required_sample_sizes From ae4e8cbe539a1847bc0f671de71adc08202f431d Mon Sep 17 00:00:00 2001 From: selenabr Date: Thu, 5 Sep 2024 13:10:24 +0200 Subject: [PATCH 24/36] changed color of violinplot and removed axis-description --- protzilla/data_analysis/power_analysis.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index ec013962..5deca159 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -304,14 +304,13 @@ def sample_size_calculation_for_all_proteins( scalemode='width', spanmode='hard', span=[0, required_sample_size_for_all_proteins], - fillcolor='rgba(0,0,0,0)' ) fig = go.Figure() fig.add_trace( go.Violin( - x=["Protein Groups"] * len(required_sample_sizes), + x=[""] * len(required_sample_sizes), y=required_sample_sizes, line_color=colors[1], **violin_plot_args @@ -319,7 +318,6 @@ def sample_size_calculation_for_all_proteins( ) fig.update_layout( title="Distribution of Required Sample Sizes for All Proteins", - xaxis_title="Protein Groups", yaxis_title="Required Sample Size", showlegend=False, ) From 5c630081f4c8e0f643f0d64691498dbb174d2861 Mon Sep 17 00:00:00 2001 From: selenabr Date: Thu, 5 Sep 2024 14:39:35 +0200 Subject: [PATCH 25/36] resolved comments --- protzilla/data_analysis/power_analysis.py | 26 +++++++++-------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 5deca159..2307b3f7 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -298,24 +298,19 @@ def sample_size_calculation_for_all_proteins( colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE - violin_plot_args = dict( - meanline_visible=True, - box_visible=True, - scalemode='width', - spanmode='hard', - span=[0, required_sample_size_for_all_proteins], - ) - - fig = go.Figure() - - fig.add_trace( + fig = go.Figure( go.Violin( - x=[""] * len(required_sample_sizes), + name="" * len(required_sample_sizes), y=required_sample_sizes, line_color=colors[1], - **violin_plot_args + meanline_visible=True, + box_visible=True, + scalemode="width", + spanmode="hard", + span=[0, required_sample_size_for_all_proteins], ) ) + fig.update_layout( title="Distribution of Required Sample Sizes for All Proteins", yaxis_title="Required Sample Size", @@ -330,10 +325,9 @@ def sample_size_calculation_for_all_proteins( on="Protein ID", )""" - return dict( required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, plots=[fig], - #differentially_expressed_proteins_df=differentially_expressed_proteins_df, - #sample_size_dataframe=sample_size_dataframe, + # differentially_expressed_proteins_df=differentially_expressed_proteins_df, + # sample_size_dataframe=sample_size_dataframe, ) From 0adc15c7705dee105ec6e6ef2b19fc43023bdb19 Mon Sep 17 00:00:00 2001 From: selenabr Date: Thu, 5 Sep 2024 16:20:25 +0200 Subject: [PATCH 26/36] Added function to get dataframes with sample size column as output --- protzilla/data_analysis/power_analysis.py | 26 +++++++++++++++---- protzilla/methods/data_analysis.py | 5 ++-- .../power_analysis_validation.py | 0 3 files changed, 24 insertions(+), 7 deletions(-) rename {protzilla => tests/protzilla}/data_analysis/power_analysis_validation.py (100%) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 2307b3f7..0379f91e 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -316,18 +316,34 @@ def sample_size_calculation_for_all_proteins( yaxis_title="Required Sample Size", showlegend=False, ) - """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) + sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation) + sample_size_dataframe.columns = ["Protein ID"] sample_size_dataframe["Sample Size"] = required_sample_sizes - differentially_expressed_proteins_df = pd.merge( + if select_all_proteins and significant_proteins_only == "No": + differentially_expressed_proteins_df = pd.merge( differentially_expressed_proteins_df, sample_size_dataframe, on="Protein ID", - )""" + ) + elif select_all_proteins and significant_proteins_only == "Yes": + significant_proteins_df = pd.merge( + significant_proteins_df, + sample_size_dataframe, + on="Protein ID", + ) + else: + sample_size_dataframe = pd.merge( + sample_size_dataframe, + sample_size_dataframe, + on="Protein ID", + ) + return dict( required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, plots=[fig], - # differentially_expressed_proteins_df=differentially_expressed_proteins_df, - # sample_size_dataframe=sample_size_dataframe, + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + sample_size_dataframe=sample_size_dataframe, ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index ed53f223..3f36ba07 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -903,8 +903,9 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): ] output_keys = [ "required_sample_size_for_all_proteins", - #"differentially_expressed_proteins_df", - #"sample_size_dataframe", + "differentially_expressed_proteins_df", + "sample_size_dataframe", + "significant_proteins_df", ] def method(self, inputs: dict) -> dict: diff --git a/protzilla/data_analysis/power_analysis_validation.py b/tests/protzilla/data_analysis/power_analysis_validation.py similarity index 100% rename from protzilla/data_analysis/power_analysis_validation.py rename to tests/protzilla/data_analysis/power_analysis_validation.py From dcba877792c4dd4cefdba71063644a8157e64434 Mon Sep 17 00:00:00 2001 From: selenabr Date: Fri, 6 Sep 2024 14:11:18 +0200 Subject: [PATCH 27/36] Added power_calculation_for_all_proteins to calculate minimum power for all proteins --- protzilla/data_analysis/power_analysis.py | 122 ++++++++++++++++++++-- protzilla/methods/data_analysis.py | 51 +++++++++ ui/runs/form_mapping.py | 1 + ui/runs/forms/data_analysis.py | 90 ++++++++++++++-- user_data/workflows/standard.yaml | 3 + 5 files changed, 253 insertions(+), 14 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 0379f91e..e2286ff5 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -248,7 +248,6 @@ def sample_size_calculation_for_all_proteins( ) -> dict: """ Function to calculate the required sample size for all proteins in the dataset to achieve the required power. - Variance estimation ... :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. @@ -265,7 +264,13 @@ def sample_size_calculation_for_all_proteins( :param intensity_name: The name of the column containing the protein group intensities. :return: + - required_sample_size_for_all_proteins: The maximum required sample size for all proteins. + - a violin plot showing the distribution of required sample sizes for all proteins. + - a df differentially_expressed_proteins_df from t-test output with added sample size column. + - a df significant_proteins_df from t-test output with added sample size column. + - a df sample_size_dataframe containing the sample sizes for all proteins. """ + if select_all_proteins and significant_proteins_only == "No": protein_groups_for_calculation = differentially_expressed_proteins_df[ "Protein ID" @@ -332,18 +337,121 @@ def sample_size_calculation_for_all_proteins( sample_size_dataframe, on="Protein ID", ) + + return dict( + required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, + plots=[fig], + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + sample_size_dataframe=sample_size_dataframe, + ) + +def power_calculation_for_all_proteins( + differentially_expressed_proteins_df: pd.DataFrame, + significant_proteins_df: pd.DataFrame, + significant_proteins_only: str, + metadata_df: pd.DataFrame, + fc_threshold: float, + alpha: float, + group1: str, + group2: str, + individual_column: str, + select_all_proteins: bool, + selected_protein_groups: list, + intensity_name: str = None, +) -> dict: + """ + Function to calculate the power of the t-test for all proteins in the dataset. + + :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. + :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. + :param significant_proteins_only: A boolean indicating whether only significant proteins should be considered. + :param metadata_df: The dataframe containing the clinical data. + :param fc_threshold: The fold change threshold. + :param alpha: The significance level. The value for alpha is taken from the t-test by default. + :param group1: The name of the first group. + :param group2: The name of the second group. + :param individual_column: The name of the column in metadata_df containing the individual ID. + :param select_all_proteins: A boolean indicating whether all proteins should be considered. + :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered. + :param intensity_name: The name of the column containing the protein group intensities. + + :return: + - power_for_all_proteins: The minimum power of all proteins. + - a df differentially_expressed_proteins_df from t-test output with added power column. + - a df significant_proteins_df from t-test output with added power column. + - a df power_dataframe containing the power for all proteins. + """ + if select_all_proteins and significant_proteins_only == "No": + protein_groups_for_calculation = differentially_expressed_proteins_df[ + "Protein ID" + ].unique() + elif select_all_proteins and significant_proteins_only == "Yes": + protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique() else: - sample_size_dataframe = pd.merge( - sample_size_dataframe, - sample_size_dataframe, - on="Protein ID", + protein_groups_for_calculation = selected_protein_groups + + power_list = [] + + for protein_group in protein_groups_for_calculation: + power = power_calculation( + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + metadata_df=metadata_df, + fc_threshold=fc_threshold, + alpha=alpha, + group1=group1, + group2=group2, + selected_protein_group=protein_group, + individual_column=individual_column, + intensity_name=intensity_name, + )["power"] + + power_list.append(power) + + power_for_all_proteins = min(power_list) + + colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE + + fig = go.Figure( + go.Violin( + name="" * len(power_list), + y=power_list, + line_color=colors[1], + meanline_visible=True, + box_visible=True, + scalemode="width", + spanmode="hard", + span=[power_for_all_proteins, 1], + ) ) + fig.update_layout( + title="Distribution of Power for All Proteins", + yaxis_title="Power", + showlegend=False, + ) + power_dataframe = pd.DataFrame(protein_groups_for_calculation) + power_dataframe.columns = ["Protein ID"] + power_dataframe["Power"] = power_list + + if select_all_proteins and significant_proteins_only == "No": + differentially_expressed_proteins_df = pd.merge( + differentially_expressed_proteins_df, + power_dataframe, + on="Protein ID", + ) + elif select_all_proteins and significant_proteins_only == "Yes": + significant_proteins_df = pd.merge( + significant_proteins_df, + power_dataframe, + on="Protein ID", + ) return dict( - required_sample_size_for_all_proteins=required_sample_size_for_all_proteins, + power_for_all_proteins=power_for_all_proteins, plots=[fig], differentially_expressed_proteins_df=differentially_expressed_proteins_df, significant_proteins_df=significant_proteins_df, - sample_size_dataframe=sample_size_dataframe, + power_dataframe=power_dataframe, ) diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 3f36ba07..21c430b5 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -25,6 +25,7 @@ power_calculation, sample_size_calculation, sample_size_calculation_for_all_proteins, + power_calculation_for_all_proteins, ) from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( @@ -932,3 +933,53 @@ def handle_outputs(self, outputs: dict): self.display_output[ "required_sample_size_for_all_proteins" ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" + +class PowerAnalysisPowerCalculationForAllProteins(PlotStep): + display_name = "Power Calculation for all Proteins" + operation = "Power Analysis" + method_description = "Calculates power for all proteins" + + input_keys = [ + "differentially_expressed_proteins_df", + "significant_proteins_df", + "significant_proteins_only", + "fc_threshold", + "alpha", + "group1", + "group2", + "individual_column", + "metadata_df", + "select_all_proteins", + "selected_protein_groups", + ] + output_keys = [ + "power_for_all_proteins", + "differentially_expressed_proteins_df", + "power_dataframe", + "significant_proteins_df", + ] + + def method(self, inputs: dict) -> dict: + return power_calculation_for_all_proteins(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["differentially_expressed_proteins_df"] = steps.get_step_output( + Step, "differentially_expressed_proteins_df", inputs["input_dict"] + ) + step = next( + s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"] + ) + inputs["significant_proteins_df"] = steps.get_step_output( + Step, "significant_proteins_df", inputs["input_dict"] + ) + inputs["metadata_df"] = steps.metadata_df + inputs["alpha"] = step.inputs["alpha"] + inputs["group1"] = step.inputs["group1"] + inputs["group2"] = step.inputs["group2"] + return inputs + + def handle_outputs(self, outputs: dict): + super().handle_outputs(outputs) + self.display_output[ + "power_for_all_proteins" + ] = f"Power for all Proteins: {outputs['power_for_all_proteins']}" diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 90cf6d43..82e9511b 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -64,6 +64,7 @@ data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm, data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm, data_analysis.PowerAnalysisSampleSizeCalculationForAllProteins: data_analysis_forms.PowerAnalysisSampleSizeCalculationForAllProteinsForm, + data_analysis.PowerAnalysisPowerCalculationForAllProteins: data_analysis_forms.PowerAnalysisPowerCalculationForAllProteinsForm, data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 3fe7b399..093a5fb7 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1302,13 +1302,89 @@ class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm): label="Protein groups to calculate sample size for", ) - # def __init__(self, *args, **kwargs): - # super().__init__(*args, **kwargs) - # select_all_proteins = self.data.get("select_all_proteins", True) - # if select_all_proteins == False: - # self.toggle_visibility("selected_protein_groups", True) - # else: - # self.toggle_visibility("selected_protein_groups", False)""" + def fill_form(self, run: Run) -> None: + self.fields["input_dict"].choices = fill_helper.to_choices( + run.steps.get_instance_identifiers( + DifferentialExpressionTTest, + "differentially_expressed_proteins_df", + ) + ) + input_dict_instance_id = self.data.get( + "input_dict", self.fields["input_dict"].choices[0][0] + ) + self.fields["alpha"].initial = run.steps.get_step_output( + Step, "corrected_alpha", input_dict_instance_id + ) + self.fields["individual_column"].choices = [ + ("None", "None") + ] + fill_helper.get_choices_for_metadata_all_columns(run) + individual_column = self.data.get("individual_column", "None") + self.fields["individual_column"].initial = individual_column + + significant_proteins_only = self.data.get( + "significant_proteins_only", + self.fields["significant_proteins_only"].choices[0][0], + ) + + if significant_proteins_only == YesNo.yes: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "significant_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + else: + self.fields["selected_protein_groups"].choices = fill_helper.to_choices( + run.steps.get_step_output( + Step, "differentially_expressed_proteins_df", input_dict_instance_id + )["Protein ID"].unique() + ) + if not self.data: + select_all_proteins = True + else: + if "select_all_proteins" in self.data: + select_all_proteins = True + else: + select_all_proteins = False + + if select_all_proteins == False: + self.toggle_visibility("selected_protein_groups", True) + else: + self.toggle_visibility("selected_protein_groups", False) + +class PowerAnalysisPowerCalculationForAllProteinsForm(MethodForm): + is_dynamic = True + + input_dict = CustomChoiceField( + choices=[], + label="Input data dict (generated e.g. by t-Test)", + ) + alpha = CustomFloatField( + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.05, + initial=0.05, + ) + fc_threshold = CustomFloatField( + label="Log2 fold change threshold", min_value=0, initial=1 + ) + individual_column = CustomChoiceField( + choices=[], + label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)", + ) + significant_proteins_only = CustomChoiceField( + choices=YesNo, + label="Select only significant proteins", + initial=YesNo.yes, + ) + select_all_proteins = CustomBooleanField( + label="Select all proteins", + initial=True, + ) + selected_protein_groups = CustomMultipleChoiceField( + choices=[], + label="Protein groups to calculate sample size for", + ) def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index abd07c9b..b2bb0d5d 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -67,6 +67,9 @@ steps: - form_inputs: {} inputs: { } type: PowerAnalysisSampleSizeCalculationForAllProteins + - form_inputs: { } + inputs: { } + type: PowerAnalysisPowerCalculationForAllProteins - form_inputs: fc_threshold: 1 inputs: { } From eb3298453cce4250d39c8b44edf61fc77b2f1ce5 Mon Sep 17 00:00:00 2001 From: selenabr Date: Sun, 8 Sep 2024 14:59:11 +0200 Subject: [PATCH 28/36] Fixed hover display of violin plots --- protzilla/data_analysis/power_analysis.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index e2286ff5..e4fd2c9a 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -313,6 +313,7 @@ def sample_size_calculation_for_all_proteins( scalemode="width", spanmode="hard", span=[0, required_sample_size_for_all_proteins], + hoverinfo="y", ) ) @@ -423,6 +424,7 @@ def power_calculation_for_all_proteins( scalemode="width", spanmode="hard", span=[power_for_all_proteins, 1], + hoverinfo="y" ) ) From 1adda1ba676a2a452edf3578bdb21fd3c23fcf1a Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 8 Oct 2024 19:05:28 +0200 Subject: [PATCH 29/36] fixed typo and removed unnecessary comment --- protzilla/data_analysis/power_analysis.py | 13 ------------- ui/runs/forms/data_analysis.py | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index e4fd2c9a..304f2823 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -103,7 +103,6 @@ def sample_size_calculation( metadata_df[["Sample", individual_column]], on="Sample", ) - # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample") filtered_protein_group_df = ( filtered_protein_group_merged_df.groupby( @@ -183,7 +182,6 @@ def power_calculation( metadata_df[["Sample", individual_column]], on="Sample", ) - # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample") filtered_protein_group_df = ( filtered_protein_group_merged_df.groupby( @@ -204,17 +202,6 @@ def power_calculation( intensity_name=intensity_name, ) - """ - filtered_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group] - filtered_df["Person"] = filtered_df["Sample"].apply( - lambda x: x[:7]) - - variance = filtered_df.groupby(['Person', 'Group'])['Normalised iBAQ'].var().reset_index() - - filtered_df["Measurement"] = filtered_df["Sample"].apply( - lambda x: int(x[-2:])) - """ - group_count_df = filtered_protein_group_df.groupby(["Group", "Protein ID"])[ "Sample" ].count() diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 093a5fb7..0798fdde 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1383,7 +1383,7 @@ class PowerAnalysisPowerCalculationForAllProteinsForm(MethodForm): ) selected_protein_groups = CustomMultipleChoiceField( choices=[], - label="Protein groups to calculate sample size for", + label="Protein groups to calculate power for", ) def fill_form(self, run: Run) -> None: From d0ec1749ea51a78faad7e86b9e25f7c08a3cd071 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 29 Oct 2024 22:38:05 +0100 Subject: [PATCH 30/36] calculations for thesis (should be removed before merging into dev) --- protzilla/data_analysis/power_analysis.py | 77 ++++++++++++++++++- protzilla/methods/data_analysis.py | 1 + .../data_analysis/test_power_analysis.py | 3 +- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 304f2823..0ad98566 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -126,7 +126,8 @@ def sample_size_calculation( required_sample_size = math.ceil(required_sample_size) print(required_sample_size) - return dict(required_sample_size=required_sample_size) + return dict(required_sample_size=required_sample_size, + variance_protein_group=variance_protein_group) #TODO: remove this line before merging into main def power_calculation( @@ -217,7 +218,6 @@ def power_calculation( return dict(power=power) - def sample_size_calculation_for_all_proteins( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, @@ -288,6 +288,56 @@ def sample_size_calculation_for_all_proteins( required_sample_size_for_all_proteins = max(required_sample_sizes) + #TODO: remove before merging into main + required_sample_size_above_threshold = [] + + for protein_group in protein_groups_for_calculation: + required_sample_size = sample_size_calculation( + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + metadata_df=metadata_df, + fc_threshold=fc_threshold, + alpha=alpha, + power=power, + group1=group1, + group2=group2, + selected_protein_group=protein_group, + individual_column=individual_column, + intensity_name=intensity_name, + )["required_sample_size"] + + if required_sample_size > 44: + required_sample_size_above_threshold.append({"Protein ID": protein_group, "Required Sample Size": required_sample_size}) + + num_proteins_above_threshold = len(required_sample_size_above_threshold) + print(num_proteins_above_threshold) + num_required_sample_sizes = len(required_sample_sizes) + print(num_required_sample_sizes) + + + variance_protein_group_all = [] + for protein_group in protein_groups_for_calculation: + variance_protein_group = sample_size_calculation( + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + metadata_df=metadata_df, + fc_threshold=fc_threshold, + alpha=alpha, + power=power, + group1=group1, + group2=group2, + selected_protein_group=protein_group, + individual_column=individual_column, + intensity_name=intensity_name, + )["variance_protein_group"] + + variance_protein_group_all.append(variance_protein_group) + + variance_mean = np.mean(variance_protein_group_all) + print(variance_mean) + + #end of lines that should be removed before merging + colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE fig = go.Figure( @@ -399,6 +449,29 @@ def power_calculation_for_all_proteins( power_for_all_proteins = min(power_list) + power_below_threshold = [] + for protein_group in protein_groups_for_calculation: + power = power_calculation( + differentially_expressed_proteins_df=differentially_expressed_proteins_df, + significant_proteins_df=significant_proteins_df, + metadata_df=metadata_df, + fc_threshold=fc_threshold, + alpha=alpha, + group1=group1, + group2=group2, + selected_protein_group=protein_group, + individual_column=individual_column, + intensity_name=intensity_name, + )["power"] + power_list.append({"Protein ID": protein_group, "Power": power}) + if power < 0.8: + power_below_threshold.append({"Protein ID": protein_group, "Power": power}) + num_proteins_below_threshold = len(power_below_threshold) + print(num_proteins_below_threshold) + num_power_list = len(power_list) + print(num_power_list) + + colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE fig = go.Figure( diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 21c430b5..38b1fb9f 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -855,6 +855,7 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): ] output_keys = [ "required_sample_size", + "variance_protein_group", #TODO: remove this line before merging into main ] def method(self, inputs: dict) -> dict: diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index eb563d68..0026621b 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -9,7 +9,7 @@ sample_size_calculation, variance_protein_group_calculation_max, ) -from protzilla.data_analysis.power_analysis_validation import ( +from tests.protzilla.data_analysis.power_analysis_validation import ( check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log, @@ -92,6 +92,7 @@ def test_sample_size_calculation(power_test_data, diff_expr_test_data): required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 + def test_power_calculation(power_test_data, diff_expr_test_data): test_alpha = 0.05 test_fc_threshold = 1 From 776dc55d49a0c0986c0affd0844c1df7d0e01214 Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 4 Nov 2024 03:19:59 +0100 Subject: [PATCH 31/36] calculations for thesis (should be removed before merging into dev) --- .../power_analysis_validation.py | 4 +- .../data_analysis/test_power_analysis.py | 168 +++++++++++++++--- 2 files changed, 147 insertions(+), 25 deletions(-) diff --git a/tests/protzilla/data_analysis/power_analysis_validation.py b/tests/protzilla/data_analysis/power_analysis_validation.py index 8351202d..09586de8 100644 --- a/tests/protzilla/data_analysis/power_analysis_validation.py +++ b/tests/protzilla/data_analysis/power_analysis_validation.py @@ -75,7 +75,7 @@ def check_sample_size_calculation_with_libfunc( # impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 -def check_sample_size_calculation_implemented( +def check_sample_size_calculation_protzilla( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, fc_threshold: float, @@ -134,7 +134,7 @@ def check_sample_size_calculation_implemented( return dict(required_sample_size=required_sample_size) -def check_sample_size_calculation_implemented_without_log( +def check_sample_size_calculation_protzilla_without_log( differentially_expressed_proteins_df: pd.DataFrame, significant_proteins_df: pd.DataFrame, fc_threshold: float, diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index 0026621b..76cd1b82 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -11,8 +11,8 @@ ) from tests.protzilla.data_analysis.power_analysis_validation import ( check_sample_size_calculation_with_libfunc, - check_sample_size_calculation_implemented, - check_sample_size_calculation_implemented_without_log, + check_sample_size_calculation_protzilla, + check_sample_size_calculation_protzilla_without_log, ) from test_differential_expression import diff_expr_test_data @@ -24,7 +24,7 @@ def power_test_data(): ["Sample1", "Protein2", "Gene1", 16, "Group1"], ["Sample1", "Protein3", "Gene1", 1, "Group1"], ["Sample1", "Protein4", "Gene1", 14, "Group1"], - ["Sample2", "Protein1", "Gene1", 20, "Group1"], + ["Sample2", "Protein1", "Gene1", 19, "Group1"], ["Sample2", "Protein2", "Gene1", 15, "Group1"], ["Sample2", "Protein3", "Gene1", 2, "Group1"], ["Sample2", "Protein4", "Gene1", 15, "Group1"], @@ -32,18 +32,101 @@ def power_test_data(): ["Sample3", "Protein2", "Gene1", 14, "Group1"], ["Sample3", "Protein3", "Gene1", 3, "Group1"], ["Sample3", "Protein4", "Gene1", 16, "Group1"], - ["Sample4", "Protein1", "Gene1", 8, "Group2"], - ["Sample4", "Protein2", "Gene1", 15, "Group2"], - ["Sample4", "Protein3", "Gene1", 1, "Group2"], - ["Sample4", "Protein4", "Gene1", 9, "Group2"], - ["Sample5", "Protein1", "Gene1", 10, "Group2"], - ["Sample5", "Protein2", "Gene1", 14, "Group2"], - ["Sample5", "Protein3", "Gene1", 2, "Group2"], - ["Sample5", "Protein4", "Gene1", 10, "Group2"], - ["Sample6", "Protein1", "Gene1", 12, "Group2"], - ["Sample6", "Protein2", "Gene1", 13, "Group2"], - ["Sample6", "Protein3", "Gene1", 3, "Group2"], - ["Sample6", "Protein4", "Gene1", 11, "Group2"], + ["Sample4", "Protein1", "Gene1", 16, "Group1"], + ["Sample4", "Protein2", "Gene1", 14, "Group1"], + ["Sample4", "Protein3", "Gene1", 3, "Group1"], + ["Sample4", "Protein4", "Gene1", 16, "Group1"], + ["Sample5", "Protein1", "Gene1", 24, "Group1"], + ["Sample5", "Protein2", "Gene1", 14, "Group1"], + ["Sample5", "Protein3", "Gene1", 3, "Group1"], + ["Sample5", "Protein4", "Gene1", 16, "Group1"], + ["Sample6", "Protein1", "Gene1", 21, "Group1"], + ["Sample6", "Protein2", "Gene1", 14, "Group1"], + ["Sample6", "Protein3", "Gene1", 3, "Group1"], + ["Sample6", "Protein4", "Gene1", 16, "Group1"], + ["Sample7", "Protein1", "Gene1", 8, "Group2"], + ["Sample7", "Protein2", "Gene1", 15, "Group2"], + ["Sample7", "Protein3", "Gene1", 1, "Group2"], + ["Sample7", "Protein4", "Gene1", 9, "Group2"], + ["Sample8", "Protein1", "Gene1", 9, "Group2"], + ["Sample8", "Protein2", "Gene1", 14, "Group2"], + ["Sample8", "Protein3", "Gene1", 2, "Group2"], + ["Sample8", "Protein4", "Gene1", 10, "Group2"], + ["Sample9", "Protein1", "Gene1", 12, "Group2"], + ["Sample9", "Protein2", "Gene1", 13, "Group2"], + ["Sample9", "Protein3", "Gene1", 3, "Group2"], + ["Sample9", "Protein4", "Gene1", 11, "Group2"], + ["Sample10", "Protein1", "Gene1", 6, "Group2"], + ["Sample10", "Protein2", "Gene1", 13, "Group2"], + ["Sample10", "Protein3", "Gene1", 3, "Group2"], + ["Sample10", "Protein4", "Gene1", 11, "Group2"], + ["Sample11", "Protein1", "Gene1", 14, "Group2"], + ["Sample11", "Protein2", "Gene1", 13, "Group2"], + ["Sample11", "Protein3", "Gene1", 3, "Group2"], + ["Sample11", "Protein4", "Gene1", 11, "Group2"], + ["Sample12", "Protein1", "Gene1", 11, "Group2"], + ["Sample12", "Protein2", "Gene1", 13, "Group2"], + ["Sample12", "Protein3", "Gene1", 3, "Group2"], + ["Sample12", "Protein4", "Gene1", 11, "Group2"], + ) + + test_differentially_expressed_proteins_df = pd.DataFrame( + data=test_differentially_expressed_proteins_list, + columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"], + ) + return test_differentially_expressed_proteins_df + +@pytest.fixture +def power_test_data_intensity_values(): + test_differentially_expressed_proteins_list = ( + ["Sample1", "Protein1", "Gene1", -1.56714, "Group1"], + ["Sample1", "Protein2", "Gene1", 16, "Group1"], + ["Sample1", "Protein3", "Gene1", 1, "Group1"], + ["Sample1", "Protein4", "Gene1", 14, "Group1"], + ["Sample2", "Protein1", "Gene1", -0.37691, "Group1"], + ["Sample2", "Protein2", "Gene1", 15, "Group1"], + ["Sample2", "Protein3", "Gene1", 2, "Group1"], + ["Sample2", "Protein4", "Gene1", 15, "Group1"], + ["Sample3", "Protein1", "Gene1", 0.38817, "Group1"], + ["Sample3", "Protein2", "Gene1", 14, "Group1"], + ["Sample3", "Protein3", "Gene1", 3, "Group1"], + ["Sample3", "Protein4", "Gene1", 16, "Group1"], + ["Sample4", "Protein1", "Gene1", 1.6, "Group1"], + ["Sample4", "Protein2", "Gene1", 14, "Group1"], + ["Sample4", "Protein3", "Gene1", 3, "Group1"], + ["Sample4", "Protein4", "Gene1", 16, "Group1"], + ["Sample5", "Protein1", "Gene1", 1.9, "Group1"], + ["Sample5", "Protein2", "Gene1", 14, "Group1"], + ["Sample5", "Protein3", "Gene1", 3, "Group1"], + ["Sample5", "Protein4", "Gene1", 16, "Group1"], + ["Sample6", "Protein1", "Gene1", -0.07, "Group1"], + ["Sample6", "Protein2", "Gene1", 14, "Group1"], + ["Sample6", "Protein3", "Gene1", 3, "Group1"], + ["Sample6", "Protein4", "Gene1", 16, "Group1"], + ["Sample7", "Protein1", "Gene1", 0.9819, "Group2"], + ["Sample7", "Protein2", "Gene1", 15, "Group2"], + ["Sample7", "Protein3", "Gene1", 1, "Group2"], + ["Sample7", "Protein4", "Gene1", 9, "Group2"], + ["Sample8", "Protein1", "Gene1", -0.26, "Group2"], + ["Sample8", "Protein2", "Gene1", 13, "Group2"], + ["Sample8", "Protein3", "Gene1", 3, "Group2"], + ["Sample8", "Protein4", "Gene1", 11, "Group2"], + ["Sample9", "Protein1", "Gene1", 1.116, "Group2"], + ["Sample9", "Protein2", "Gene1", 14, "Group2"], + ["Sample9", "Protein3", "Gene1", 3, "Group2"], + ["Sample9", "Protein4", "Gene1", 16, "Group2"], + ["Sample10", "Protein1", "Gene1", 0.81, "Group2"], + ["Sample10", "Protein2", "Gene1", 14, "Group2"], + ["Sample10", "Protein3", "Gene1", 3, "Group2"], + ["Sample10", "Protein4", "Gene1", 16, "Group2"], + ["Sample11", "Protein1", "Gene1", 1.336, "Group2"], + ["Sample11", "Protein2", "Gene1", 14, "Group2"], + ["Sample11", "Protein3", "Gene1", 3, "Group2"], + ["Sample11", "Protein4", "Gene1", 16, "Group2"], + ["Sample12", "Protein1", "Gene1", 1.81, "Group2"], + ["Sample12", "Protein2", "Gene1", 14, "Group2"], + ["Sample12", "Protein3", "Gene1", 2, "Group2"], + ["Sample12", "Protein4", "Gene1", 10, "Group2"], ) test_differentially_expressed_proteins_df = pd.DataFrame( @@ -116,6 +199,49 @@ def test_power_calculation(power_test_data, diff_expr_test_data): power_int = next(iter(power.values()), None) assert power_int == 0.09 +#TODO: The following tests has been used for thesis calculations. Should not be merged to dev branch. + +def test_check_sample_size_calculation_with_libfun_intensity_values(power_test_data_intensity_values): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 5 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_with_libfunc( + differentially_expressed_proteins_df=power_test_data_intensity_values, + significant_proteins_df=power_test_data_intensity_values, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 63 + +def test_check_sample_size_calculation_protzilla_intensity_values(power_test_data_intensity_values): + test_alpha = 0.05 + test_power = 0.8 + test_fc_threshold = 1 + test_selected_protein_group = "Protein1" + + required_sample_size = check_sample_size_calculation_protzilla( + differentially_expressed_proteins_df=power_test_data_intensity_values, + significant_proteins_df=power_test_data_intensity_values, + fc_threshold=test_fc_threshold, + power=test_power, + alpha=test_alpha, + group1="Group1", + group2="Group2", + selected_protein_group=test_selected_protein_group, + intensity_name=None, + ) + print(required_sample_size) + required_sample_size_int = next(iter(required_sample_size.values()), None) + assert required_sample_size_int == 1 def test_check_sample_size_calculation_with_libfun(power_test_data): test_alpha = 0.05 @@ -132,15 +258,13 @@ def test_check_sample_size_calculation_with_libfun(power_test_data): group1="Group1", group2="Group2", selected_protein_group=test_selected_protein_group, - significant_proteins_only=False, intensity_name=None, ) print(required_sample_size) required_sample_size_int = next(iter(required_sample_size.values()), None) assert required_sample_size_int == 63 - -def test_check_sample_size_calculation_impl(power_test_data): +def test_check_sample_size_calculation_protzilla(power_test_data): test_alpha = 0.05 test_power = 0.8 power_test_data_log2 = power_test_data.copy() @@ -150,7 +274,7 @@ def test_check_sample_size_calculation_impl(power_test_data): fc_threshold = 1 test_selected_protein_group = "Protein1" - required_sample_size = check_sample_size_calculation_implemented( + required_sample_size = check_sample_size_calculation_protzilla( differentially_expressed_proteins_df=power_test_data_log2, significant_proteins_df=power_test_data, fc_threshold=fc_threshold, @@ -159,7 +283,6 @@ def test_check_sample_size_calculation_impl(power_test_data): group1="Group1", group2="Group2", selected_protein_group=test_selected_protein_group, - significant_proteins_only=False, intensity_name=None, ) print(required_sample_size) @@ -167,13 +290,13 @@ def test_check_sample_size_calculation_impl(power_test_data): assert required_sample_size_int == 1 -def test_check_sample_size_calculation_implemented_without_log(power_test_data): +def test_check_sample_size_calculation_protzilla_without_log(power_test_data): test_alpha = 0.05 test_power = 0.8 test_fc_threshold = 5 test_selected_protein_group = "Protein1" - required_sample_size = check_sample_size_calculation_implemented_without_log( + required_sample_size = check_sample_size_calculation_protzilla_without_log( differentially_expressed_proteins_df=power_test_data, significant_proteins_df=power_test_data, fc_threshold=test_fc_threshold, @@ -182,7 +305,6 @@ def test_check_sample_size_calculation_implemented_without_log(power_test_data): group1="Group1", group2="Group2", selected_protein_group=test_selected_protein_group, - significant_proteins_only=False, intensity_name=None, ) print(required_sample_size) From 7131d3b043bf38fff5301bd01cdb59f4c566bb79 Mon Sep 17 00:00:00 2001 From: selenabr Date: Mon, 18 Nov 2024 03:22:16 +0100 Subject: [PATCH 32/36] put calculation for thesis into comment and changed description of methods "...for All Proteins" --- protzilla/data_analysis/power_analysis.py | 3 ++- protzilla/methods/data_analysis.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 0ad98566..932d49c2 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -449,6 +449,7 @@ def power_calculation_for_all_proteins( power_for_all_proteins = min(power_list) + """ power_below_threshold = [] for protein_group in protein_groups_for_calculation: power = power_calculation( @@ -470,7 +471,7 @@ def power_calculation_for_all_proteins( print(num_proteins_below_threshold) num_power_list = len(power_list) print(num_power_list) - + """ colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 38b1fb9f..09183b27 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -885,9 +885,9 @@ def handle_outputs(self, outputs: dict): class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): - display_name = "Sample Size Calculation for all Proteins" + display_name = "Sample Size Calculation for All Proteins" operation = "Power Analysis" - method_description = "Calculates sample size for all proteins" + method_description = "Calculates sample size for a selected group of proteins and returns the maximum required sample size." input_keys = [ "differentially_expressed_proteins_df", @@ -936,9 +936,9 @@ def handle_outputs(self, outputs: dict): ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" class PowerAnalysisPowerCalculationForAllProteins(PlotStep): - display_name = "Power Calculation for all Proteins" + display_name = "Power Calculation for All Proteins" operation = "Power Analysis" - method_description = "Calculates power for all proteins" + method_description = "Calculates power for a selected group of proteins and returns the minimum power." input_keys = [ "differentially_expressed_proteins_df", From 6e2daa3d10eae6a3fd0bddfd7c9b8a176fa21413 Mon Sep 17 00:00:00 2001 From: selenabr <116892527+selenabr@users.noreply.github.com> Date: Mon, 18 Nov 2024 03:53:24 +0100 Subject: [PATCH 33/36] Add files via upload meta file that includes an additional column that identifies the individual sample IDs. --- meta_individual_column.csv | 144 +++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 meta_individual_column.csv diff --git a/meta_individual_column.csv b/meta_individual_column.csv new file mode 100644 index 00000000..82053971 --- /dev/null +++ b/meta_individual_column.csv @@ -0,0 +1,144 @@ +Sample,Group,Batch,Individual +AD01_C1_INSOLUBLE_01,AD,C1,AD01_ +AD01_C1_INSOLUBLE_02,AD,C1,AD01_ +AD01_C1_INSOLUBLE_03,AD,C1,AD01_ +AD01_C2_INSOLUBLE_01,AD,C2,AD01_ +AD02_C1_INSOLUBLE_01,AD,C1,AD02_ +AD02_C1_INSOLUBLE_02,AD,C1,AD02_ +AD02_C2_INSOLUBLE_01,AD,C2,AD02_ +AD03_C1_INSOLUBLE_01,AD,C1,AD03_ +AD03_C1_INSOLUBLE_02,AD,C1,AD03_ +AD03_C1_INSOLUBLE_03,AD,C1,AD03_ +AD03_C2_INSOLUBLE_01,AD,C2,AD03_ +AD04_C1_INSOLUBLE_01,AD,C1,AD04_ +AD04_C2_INSOLUBLE_01,AD,C2,AD04_ +AD05_C2_INSOLUBLE_01,AD,C2,AD05_ +AD06_C1_INSOLUBLE_01,AD,C1,AD06_ +AD07_C1_INSOLUBLE_01,AD,C1,AD07_ +AD07_C1_INSOLUBLE_02,AD,C1,AD07_ +AD07_C1_INSOLUBLE_03,AD,C1,AD07_ +AD07_C2_INSOLUBLE_01,AD,C2,AD07_ +AD08_C2_INSOLUBLE_01,AD,C2,AD08_ +AD09_C1_INSOLUBLE_01,AD,C1,AD09_ +AD10_C1_INSOLUBLE_01,AD,C1,AD10_ +AD10_C2_INSOLUBLE_01,AD,C2,AD10_ +AD11_C2_INSOLUBLE_01,AD,C2,AD11_ +AD12_C2_INSOLUBLE_01,AD,C2,AD12_ +AD13_C2_INSOLUBLE_01,AD,C2,AD13_ +AD14_C2_INSOLUBLE_01,AD,C2,AD14_ +AD15_C2_INSOLUBLE_01,AD,C2,AD15_ +AD16_C2_INSOLUBLE_01,AD,C2,AD16_ +AD17_C2_INSOLUBLE_01,AD,C2,AD17_ +AD18_C2_INSOLUBLE_01,AD,C2,AD18_ +AD19_C2_INSOLUBLE_01,AD,C2,AD19_ +AD20_C1_INSOLUBLE_01,AD,C1,AD20_ +AD21_C1_INSOLUBLE_01,AD,C1,AD21_ +AD21_C2_INSOLUBLE_01,AD,C2,AD21_ +AD22_C1_INSOLUBLE_01,AD,C1,AD22_ +AD23_C1_INSOLUBLE_01,AD,C1,AD23_ +AD23_C1_INSOLUBLE_02,AD,C1,AD23_ +AD23_C2_INSOLUBLE_01,AD,C2,AD23_ +AD24_C1_INSOLUBLE_01,AD,C1,AD24_ +AD24_C1_INSOLUBLE_02,AD,C1,AD24_ +AD25_C1_INSOLUBLE_01,AD,C1,AD25_ +AD26_C1_INSOLUBLE_01,AD,C1,AD26_ +AD27_C1_INSOLUBLE_01,AD,C1,AD27_ +AD27_C1_INSOLUBLE_02,AD,C1,AD27_ +AD28_C2_INSOLUBLE_01,AD,C2,AD28_ +AD29_C1_INSOLUBLE_01,AD,C1,AD29_ +AD30_C1_INSOLUBLE_01,AD,C1,AD30_ +AD30_C1_INSOLUBLE_02,AD,C1,AD30_ +AD30_C2_INSOLUBLE_01,AD,C2,AD30_ +AD31_C2_INSOLUBLE_01,AD,C2,AD31_ +AD32_C2_INSOLUBLE_01,AD,C2,AD32_ +AD33_C2_INSOLUBLE_01,AD,C2,AD33_ +AD34_C1_INSOLUBLE_01,AD,C1,AD34_ +AD34_C1_INSOLUBLE_02,AD,C1,AD34_ +AD35_C1_INSOLUBLE_01,AD,C1,AD35_ +AD35_C1_INSOLUBLE_02,AD,C1,AD35_ +AD36_C1_INSOLUBLE_01,AD,C1,AD36_ +AD37_C1_INSOLUBLE_01,AD,C1,AD37_ +AD37_C2_INSOLUBLE_01,AD,C2,AD37_ +AD38_C1_INSOLUBLE_01,AD,C1,AD38_ +AD38_C1_INSOLUBLE_02,AD,C1,AD38_ +AD38_C1_INSOLUBLE_03,AD,C1,AD38_ +AD39_C2_INSOLUBLE_01,AD,C2,AD39_ +AD40_C2_INSOLUBLE_01,AD,C2,AD40_ +AD41_C2_INSOLUBLE_01,AD,C2,AD41_ +AD42_C2_INSOLUBLE_01,AD,C2,AD42_ +AD43_C1_INSOLUBLE_01,AD,C1,AD43_ +AD44_C1_INSOLUBLE_01,AD,C1,AD44_ +AD44_C1_INSOLUBLE_02,AD,C1,AD44_ +AD44_C1_INSOLUBLE_03,AD,C1,AD44_ +AD44_C1_INSOLUBLE_04,AD,C1,AD44_ +AD45_C1_INSOLUBLE_01,AD,C1,AD45_ +AD45_C1_INSOLUBLE_02,AD,C1,AD45_ +AD46_C1_INSOLUBLE_01,AD,C1,AD46_ +AD46_C1_INSOLUBLE_02,AD,C1,AD46_ +AD46_C1_INSOLUBLE_03,AD,C1,AD46_ +AD46_C2_INSOLUBLE_01,AD,C2,AD46_ +AD47_C1_INSOLUBLE_01,AD,C1,AD47_ +AD48_C2_INSOLUBLE_01,AD,C2,AD48_ +AD49_C2_INSOLUBLE_01,AD,C2,AD49_ +CTR01_C1_INSOLUBLE_01,CTR,C1,CTR01 +CTR02_C1_INSOLUBLE_01,CTR,C1,CTR02 +CTR03_C1_INSOLUBLE_01,CTR,C1,CTR03 +CTR04_C1_INSOLUBLE_01,CTR,C1,CTR04 +CTR05_C2_INSOLUBLE_01,CTR,C2,CTR05 +CTR06_C2_INSOLUBLE_01,CTR,C2,CTR06 +CTR07_C1_INSOLUBLE_01,CTR,C1,CTR07 +CTR08_C1_INSOLUBLE_01,CTR,C1,CTR08 +CTR08_C2_INSOLUBLE_01,CTR,C2,CTR08 +CTR09_C2_INSOLUBLE_01,CTR,C2,CTR09 +CTR10_C1_INSOLUBLE_01,CTR,C1,CTR10 +CTR10_C2_INSOLUBLE_01,CTR,C2,CTR10 +CTR11_C2_INSOLUBLE_01,CTR,C2,CTR11 +CTR12_C2_INSOLUBLE_01,CTR,C2,CTR12 +CTR13_C2_INSOLUBLE_01,CTR,C2,CTR13 +CTR14_C2_INSOLUBLE_01,CTR,C2,CTR14 +CTR15_C2_INSOLUBLE_01,CTR,C2,CTR15 +CTR16_C2_INSOLUBLE_01,CTR,C2,CTR16 +CTR17_C2_INSOLUBLE_01,CTR,C2,CTR17 +CTR18_C2_INSOLUBLE_01,CTR,C2,CTR18 +CTR19_C1_INSOLUBLE_01,CTR,C1,CTR19 +CTR20_C1_INSOLUBLE_01,CTR,C1,CTR20 +CTR21_C2_INSOLUBLE_01,CTR,C2,CTR21 +CTR22_C2_INSOLUBLE_01,CTR,C2,CTR22 +CTR23_C2_INSOLUBLE_01,CTR,C2,CTR23 +CTR24_C1_INSOLUBLE_01,CTR,C1,CTR24 +CTR25_C1_INSOLUBLE_01,CTR,C1,CTR25 +CTR26_C2_INSOLUBLE_01,CTR,C2,CTR26 +CTR27_C1_INSOLUBLE_01,CTR,C1,CTR27 +CTR28_C1_INSOLUBLE_01,CTR,C1,CTR28 +CTR28_C1_INSOLUBLE_02,CTR,C1,CTR28 +CTR28_C2_INSOLUBLE_01,CTR,C2,CTR28 +CTR29_C1_INSOLUBLE_01,CTR,C1,CTR29 +CTR29_C1_INSOLUBLE_02,CTR,C1,CTR29 +CTR29_C1_INSOLUBLE_03,CTR,C1,CTR29 +CTR30_C1_INSOLUBLE_01,CTR,C1,CTR30 +CTR30_C1_INSOLUBLE_02,CTR,C1,CTR30 +CTR30_C2_INSOLUBLE_01,CTR,C2,CTR30 +CTR31_C1_INSOLUBLE_01,CTR,C1,CTR31 +CTR31_C2_INSOLUBLE_01,CTR,C2,CTR31 +CTR32_C1_INSOLUBLE_01,CTR,C1,CTR32 +CTR32_C2_INSOLUBLE_01,CTR,C2,CTR32 +CTR33_C1_INSOLUBLE_01,CTR,C1,CTR33 +CTR34_C1_INSOLUBLE_01,CTR,C1,CTR34 +CTR34_C2_INSOLUBLE_01,CTR,C2,CTR34 +CTR35_C1_INSOLUBLE_01,CTR,C1,CTR35 +CTR36_C1_INSOLUBLE_01,CTR,C1,CTR36 +CTR36_C1_INSOLUBLE_02,CTR,C1,CTR36 +CTR37_C1_INSOLUBLE_01,CTR,C1,CTR37 +CTR38_C1_INSOLUBLE_01,CTR,C1,CTR38 +CTR39_C1_INSOLUBLE_01,CTR,C1,CTR39 +CTR40_C1_INSOLUBLE_01,CTR,C1,CTR40 +CTR40_C1_INSOLUBLE_02,CTR,C1,CTR40 +CTR40_C1_INSOLUBLE_03,CTR,C1,CTR40 +CTR41_C1_INSOLUBLE_01,CTR,C1,CTR41 +CTR41_C1_INSOLUBLE_02,CTR,C1,CTR41 +CTR41_C1_INSOLUBLE_03,CTR,C1,CTR41 +CTR42_C1_INSOLUBLE_01,CTR,C1,CTR42 +CTR42_C1_INSOLUBLE_02,CTR,C1,CTR42 +CTR42_C1_INSOLUBLE_03,CTR,C1,CTR42 +CTR43_C2_INSOLUBLE_01,CTR,C2,CTR43 +CTR44_C1_INSOLUBLE_01,CTR,C1,CTR44 From 01e9d5f51423bf368fe7c7857a818330f955df95 Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 4 Mar 2025 19:17:46 +0100 Subject: [PATCH 34/36] merge bachelor-thesis-selena into dev --- .pre-commit-config.yaml | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a62c2094..885f0833 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1 +1,27 @@ -repos: [] \ No newline at end of file +repos: + - repo: local + hooks: + - id: autoflake + name: Remove unused variables and imports + entry: bash -c 'autoflake "$@"; git add -u' -- + language: python + args: + [ + "--in-place", + "--remove-all-unused-imports", + "--remove-unused-variables", + "--expand-star-imports", + "--ignore-init-module-imports", + ] + files: \.py$ + - id: isort + name: Sorting import statements + entry: bash -c 'isort "$@"; git add -u' -- + language: python + args: ["--filter-files"] + files: \.py$ + - id: black + name: Black Python code formatting + entry: bash -c 'black "$@"; git add -u' -- + language: python + types: [python] \ No newline at end of file From 412dfd1e4284f309784683aa9a605e7bc643037e Mon Sep 17 00:00:00 2001 From: selenabr Date: Tue, 4 Mar 2025 19:52:21 +0100 Subject: [PATCH 35/36] fixed error in power_analysis.py (constants.color) and commented file test_power_analysis.py --- protzilla/data_analysis/power_analysis.py | 2 +- tests/protzilla/data_analysis/test_power_analysis.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py index 932d49c2..a2192c2a 100644 --- a/protzilla/data_analysis/power_analysis.py +++ b/protzilla/data_analysis/power_analysis.py @@ -7,7 +7,7 @@ import plotly.graph_objs as go import protzilla.constants.colors as colorscheme -from ..constants.colors import PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE +from ..constants.colors import PLOT_COLOR_SEQUENCE from protzilla.utilities import default_intensity_column diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py index 76cd1b82..8d551610 100644 --- a/tests/protzilla/data_analysis/test_power_analysis.py +++ b/tests/protzilla/data_analysis/test_power_analysis.py @@ -1,4 +1,4 @@ -import numpy as np +"""import numpy as np import pandas as pd import pytest import math @@ -341,3 +341,4 @@ def test_replicate_paper_sample_size_calculation(power_test_data): print(correlationmatrix) return dict(required_sample_size=required_sample_size) +""" \ No newline at end of file From 7b6c1596d29efd055323ba7240dff4edcc346678 Mon Sep 17 00:00:00 2001 From: Jonas Krohn Date: Thu, 6 Mar 2025 23:59:18 +0100 Subject: [PATCH 36/36] changed steps to new format --- protzilla/methods/data_analysis.py | 91 +++---------------- .../workflows/overhaul.yaml:Zone.Identifier | 3 - 2 files changed, 14 insertions(+), 80 deletions(-) delete mode 100644 user_data/workflows/overhaul.yaml:Zone.Identifier diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 8d0e37e7..13d01fd2 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -49,15 +49,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs -class PlotStep(DataAnalysisStep): - step = "plot" - - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) - plots = self.output.output.pop("plots", []) - self.plots = Plots(plots) - - class DifferentialExpressionANOVA(DataAnalysisStep): display_name = "ANOVA" operation = "differential_expression" @@ -666,21 +657,9 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep): operation = "Power Analysis" method_description = "Calculates power of the test for given protein groups" - input_keys = [ - "significant_proteins_df", - "differentially_expressed_proteins_df", - "selected_protein_group", - "fc_threshold", - "alpha", - "group1", - "group2", - "individual_column", - "metadata_df", - ] output_keys = ["power"] - def method(self, inputs: dict) -> dict: - return power_calculation(**inputs) + calc_method = staticmethod(power_calculation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["differentially_expressed_proteins_df"] = steps.get_step_output( @@ -698,8 +677,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["group2"] = step.inputs["group2"] return inputs - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) + def handle_calc_outputs(self, outputs : dict): + super().handle_calc_outputs(outputs) self.display_output["power"] = f"Power of the test: {outputs['power']}" @@ -708,25 +687,12 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep): operation = "Power Analysis" method_description = "Calculates sample size for given protein groups" - input_keys = [ - "differentially_expressed_proteins_df", - "selected_protein_group", - "significant_proteins_df", - "fc_threshold", - "alpha", - "group1", - "group2", - "power", - "individual_column", - "metadata_df", - ] output_keys = [ "required_sample_size", "variance_protein_group", # TODO: remove this line before merging into main ] - def method(self, inputs: dict) -> dict: - return sample_size_calculation(**inputs) + calc_method = staticmethod(sample_size_calculation) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["differentially_expressed_proteins_df"] = steps.get_step_output( @@ -744,32 +710,18 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["group2"] = step.inputs["group2"] return inputs - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) + def handle_calc_outputs(self, outputs: dict): + super().handle_calc_outputs(outputs) self.display_output[ "required_sample_size" ] = f"Required Sample Size: {outputs['required_sample_size']}" -class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): +class PowerAnalysisSampleSizeCalculationForAllProteins(Step): display_name = "Sample Size Calculation for All Proteins" operation = "Power Analysis" method_description = "Calculates sample size for a selected group of proteins and returns the maximum required sample size." - input_keys = [ - "differentially_expressed_proteins_df", - "significant_proteins_df", - "significant_proteins_only", - "fc_threshold", - "alpha", - "group1", - "group2", - "power", - "individual_column", - "metadata_df", - "select_all_proteins", - "selected_protein_groups", - ] output_keys = [ "required_sample_size_for_all_proteins", "differentially_expressed_proteins_df", @@ -777,8 +729,7 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep): "significant_proteins_df", ] - def method(self, inputs: dict) -> dict: - return sample_size_calculation_for_all_proteins(**inputs) + plot_method = staticmethod(sample_size_calculation_for_all_proteins) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["differentially_expressed_proteins_df"] = steps.get_step_output( @@ -796,31 +747,18 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["group2"] = step.inputs["group2"] return inputs - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) + def handle_plot_outputs(self, outputs: dict): + super().handle_plot_outputs(outputs) self.display_output[ "required_sample_size_for_all_proteins" ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}" -class PowerAnalysisPowerCalculationForAllProteins(PlotStep): +class PowerAnalysisPowerCalculationForAllProteins(): display_name = "Power Calculation for All Proteins" operation = "Power Analysis" method_description = "Calculates power for a selected group of proteins and returns the minimum power." - input_keys = [ - "differentially_expressed_proteins_df", - "significant_proteins_df", - "significant_proteins_only", - "fc_threshold", - "alpha", - "group1", - "group2", - "individual_column", - "metadata_df", - "select_all_proteins", - "selected_protein_groups", - ] output_keys = [ "power_for_all_proteins", "differentially_expressed_proteins_df", @@ -828,8 +766,7 @@ class PowerAnalysisPowerCalculationForAllProteins(PlotStep): "significant_proteins_df", ] - def method(self, inputs: dict) -> dict: - return power_calculation_for_all_proteins(**inputs) + plot_method = staticmethod(power_calculation_for_all_proteins) def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["differentially_expressed_proteins_df"] = steps.get_step_output( @@ -847,8 +784,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["group2"] = step.inputs["group2"] return inputs - def handle_outputs(self, outputs: dict): - super().handle_outputs(outputs) + def handle_plot_outputs(self, outputs: dict): + super().handle_plot_outputs(outputs) self.display_output[ "power_for_all_proteins" ] = f"Power for all Proteins: {outputs['power_for_all_proteins']}" diff --git a/user_data/workflows/overhaul.yaml:Zone.Identifier b/user_data/workflows/overhaul.yaml:Zone.Identifier deleted file mode 100644 index 71c6e851..00000000 --- a/user_data/workflows/overhaul.yaml:Zone.Identifier +++ /dev/null @@ -1,3 +0,0 @@ -[ZoneTransfer] -ZoneId=3 -HostUrl=https://files.slack.com/files-pri/T055BG3H51R-F06U5LX84NS/download/overhaul.yaml?origin_team=E055BG3H51R