From 73e13a2c7bd76f2cfc85387375d273ae33d93476 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 5 Jun 2024 17:12:18 +0200
Subject: [PATCH 01/36] added sample size calculation in
 protzilla/methods/data_analysis.py and ui/runs/forms/data_analysis.py

---
 protzilla/data_analysis/power_analysis.py | 31 ++++++++++++
 protzilla/methods/data_analysis.py        | 49 +++++++++++++++++++
 ui/runs/form_mapping.py                   |  2 +
 ui/runs/forms/data_analysis.py            | 59 ++++++++++++++++++++++-
 user_data/workflows/standard.yaml         |  6 +++
 5 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 protzilla/data_analysis/power_analysis.py

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
new file mode 100644
index 00000000..5246ccad
--- /dev/null
+++ b/protzilla/data_analysis/power_analysis.py
@@ -0,0 +1,31 @@
+import logging
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+from statsmodels.stats.power import TTestIndPower
+
+
+def sample_size_calculation(
+    significant_proteins_df: pd.DataFrame,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    intensity_name: str = None
+) -> pd.DataFrame:
+    """
+    Function to calculate the required sample size.
+
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param alpha: The significance level.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The required sample size.
+    """
+    power_analysis_results = []
+
+
+
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 4dca2149..35d2c331 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -15,6 +15,7 @@
     prot_quant_plot,
     scatter_plot,
 )
+from protzilla.data_analysis.power_analysis import sample_size_calculation
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager
@@ -599,3 +600,51 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["peptide_df"] = steps.peptide_df
         inputs["isoform_df"] = steps.isoform_df
         return inputs
+
+class PowerAnalysisPowerCalculation(DataAnalysisStep):
+    display_name = "Power Calculation"
+    operation = "Power Analysis"
+    method_description = "post-hoc Power Calculation"
+
+    input_keys = [
+        "significant_proteins_df"
+    ]
+
+class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
+    display_name = "Sample Size Calculation"
+    operation = "Power Analysis"
+    method_description = "(apriori) Sample Size Calculation"
+
+    input_keys = [
+        "significant_proteins_df",
+        "alpha",
+        "group1",
+        "group2",
+        "effect_size",
+        "power",
+        "intensity_name",
+        "log2_fc",
+    ]
+    output_keys = []
+
+    def method(self, inputs: dict) -> dict:
+        return sample_size_calculation(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["significant_proteins_df"] = steps.get_step_output(
+            Step, "significant_proteins_df", inputs["input_dict"]
+        )
+        step = next(
+            s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"]
+        )
+        inputs["alpha"] = step.inputs["alpha"]
+        inputs["group1"] = step.inputs["group1"]
+        inputs["group2"] = step.inputs["group2"]
+        inputs["significant_"]
+        inputs["effect_size"] = step.inputs["effect_size"]
+        inputs["power"] = step.inputs["power"]
+        inputs["intensity_name"] = step.inputs["intensity_name"]
+        inputs["log2_fc"] = steps.get_step_output(
+            Step, "log2_fold_change_df", inputs["input_dict"]
+        )
+        return inputs
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 13431322..5f574427 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -58,6 +58,8 @@
     data_analysis.DimensionReductionUMAP: data_analysis_forms.DimensionReductionUMAPForm,
     data_analysis.ProteinGraphPeptidesToIsoform: data_analysis_forms.ProteinGraphPeptidesToIsoformForm,
     data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm,
+    data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm,
+    data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm,
     data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
     data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
     data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 353d7cff..3b4508de 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -140,7 +140,6 @@ class DimensionReductionMetric(Enum):
     cosine = "cosine"
     havensine = "havensine"
 
-
 class DifferentialExpressionANOVAForm(MethodForm):
     is_dynamic = True
 
@@ -881,3 +880,61 @@ class ProteinGraphVariationGraphForm(MethodForm):
         label="Protein ID", initial="Enter the Uniprot-ID of the protein"
     )
     # TODO: workflow_meta line 2291 - 2295
+
+class PowerAnalysisPowerCalculationForm(MethodForm):
+    t_test_results = CustomChoiceField(
+        choices=[],
+        label="T-test results",
+    )
+    #fill alpha dynamic from t-test
+    alpha = CustomFloatField(
+        label="Error rate (alpha)",
+        min_value = 0,
+        max_value = 1,
+        step_size = 0.05,
+        initial = 0.05,
+    )
+    def fill_form(self, run: Run) -> None:
+        self.fields["t_test_results"].choices = get_t_test_results(run)
+
+class PowerAnalysisSampleSizeCalculationForm(MethodForm):
+    is_dynamic = True
+
+    input_dict = CustomChoiceField(
+        choices=[],
+        label="Input data dict (generated e.g. by t-Test)",
+    )
+    effect_size = CustomNumberField(
+        label="Effect size", min_value=0, initial=0.5
+    )
+    #fill alpha dynamic from t-test
+    alpha = CustomFloatField(
+        label="Error rate (alpha)",
+        min_value = 0,
+        max_value = 1,
+        step_size = 0.05,
+        initial = 0.05,
+    )
+    power = CustomFloatField(
+        label="Power",
+        min_value = 0,
+        max_value = 1,
+        step_size = 0.05,
+        initial = 0.8,
+    )
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_dict"].choices = fill_helper.to_choices(
+            run.steps.get_instance_identifiers(
+                DifferentialExpressionTTest | DifferentialExpressionLinearModel,
+                "differentially_expressed_proteins_df",
+            )
+        )
+
+        input_dict_instance_id = self.data.get(
+            "input_dict", self.fields["input_dict"].choices[0][0]
+        )
+
+        self.fields["alpha"].initial = run.steps.get_step_output(
+            Step, "corrected_alpha", input_dict_instance_id
+        )
diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml
index d1608481..b7bebac1 100644
--- a/user_data/workflows/standard.yaml
+++ b/user_data/workflows/standard.yaml
@@ -57,6 +57,12 @@ steps:
       alpha: 0.05
     inputs: { }
     type: DifferentialExpressionTTest
+  - form_inputs: { }
+    inputs: { }
+    type: PowerAnalysisPowerCalculation
+  - form_inputs: { }
+    inputs: { }
+    type: PowerAnalysisSampleSizeCalculation
   - form_inputs:
       fc_threshold: 1
     inputs: { }

From f133c87f38ebc83ae80e8a75a3f5968be1bee441 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 17 Jun 2024 11:34:22 +0200
Subject: [PATCH 02/36] enabled possibility to choose one protein for
 calculation dependent on significance from t-test

---
 protzilla/methods/data_analysis.py | 16 +++++-------
 ui/runs/forms/data_analysis.py     | 40 +++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 35d2c331..0ae399b5 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -613,16 +613,16 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
 class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     display_name = "Sample Size Calculation"
     operation = "Power Analysis"
-    method_description = "(apriori) Sample Size Calculation"
+    method_description = "Calculates sample size for protein groups"
 
     input_keys = [
-        "significant_proteins_df",
+        "corrected_p_values_df",
+        "selected_protein_group",
+        "significant_proteins_only"
         "alpha",
         "group1",
         "group2",
-        "effect_size",
         "power",
-        "intensity_name",
         "log2_fc",
     ]
     output_keys = []
@@ -631,8 +631,8 @@ def method(self, inputs: dict) -> dict:
         return sample_size_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["significant_proteins_df"] = steps.get_step_output(
-            Step, "significant_proteins_df", inputs["input_dict"]
+        inputs["corrected_p_values_df"] = steps.get_step_output(
+            Step, "corrected_p_values_df", inputs["input_dict"]
         )
         step = next(
             s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"]
@@ -640,10 +640,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
-        inputs["significant_"]
-        inputs["effect_size"] = step.inputs["effect_size"]
-        inputs["power"] = step.inputs["power"]
-        inputs["intensity_name"] = step.inputs["intensity_name"]
         inputs["log2_fc"] = steps.get_step_output(
             Step, "log2_fold_change_df", inputs["input_dict"]
         )
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 3b4508de..7d96efc2 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -33,7 +33,7 @@ class MultipleTestingCorrectionMethod(Enum):
     bonferroni = "Bonferroni"
 
 
-class YesNo(Enum):
+class YesNo(StrEnum):
     yes = "Yes"
     no = "No"
 
@@ -904,10 +904,6 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         choices=[],
         label="Input data dict (generated e.g. by t-Test)",
     )
-    effect_size = CustomNumberField(
-        label="Effect size", min_value=0, initial=0.5
-    )
-    #fill alpha dynamic from t-test
     alpha = CustomFloatField(
         label="Error rate (alpha)",
         min_value = 0,
@@ -922,11 +918,20 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         step_size = 0.05,
         initial = 0.8,
     )
+    selected_protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group to calculate sample size for",
+    )
+    significant_proteins_only = CustomChoiceField(
+        choices=YesNo,
+        label="Select only significant proteins",
+        initial = YesNo.yes,
+    )
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(
-                DifferentialExpressionTTest | DifferentialExpressionLinearModel,
+                DifferentialExpressionTTest,
                 "differentially_expressed_proteins_df",
             )
         )
@@ -935,6 +940,29 @@ def fill_form(self, run: Run) -> None:
             "input_dict", self.fields["input_dict"].choices[0][0]
         )
 
+        self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                Step, "differentially_expressed_proteins_df", input_dict_instance_id
+            )["Protein ID"].unique()
+        )
+
+        significant_proteins_only = self.data.get(
+            "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0]
+        )
+
+        if significant_proteins_only == YesNo.yes:
+            self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "significant_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        else:
+            self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "differentially_expressed_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+
         self.fields["alpha"].initial = run.steps.get_step_output(
             Step, "corrected_alpha", input_dict_instance_id
         )

From 49c7f0e87d2c7ef6a23a094cb9520ae270980566 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 18 Jun 2024 22:24:25 +0200
Subject: [PATCH 03/36] fixed errors with missing inputs

---
 protzilla/methods/data_analysis.py | 19 +++++++++++++------
 ui/runs/forms/data_analysis.py     | 12 ++++++++----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 0ae399b5..4e66e4a4 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -616,14 +616,16 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     method_description = "Calculates sample size for protein groups"
 
     input_keys = [
-        "corrected_p_values_df",
+        "differentially_expressed_proteins_df",
+        "metadata_df",
         "selected_protein_group",
-        "significant_proteins_only"
+        "significant_proteins_df",
+        "significant_proteins_only",
+        "fc_threshold",
         "alpha",
         "group1",
         "group2",
         "power",
-        "log2_fc",
     ]
     output_keys = []
 
@@ -631,16 +633,21 @@ def method(self, inputs: dict) -> dict:
         return sample_size_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["corrected_p_values_df"] = steps.get_step_output(
-            Step, "corrected_p_values_df", inputs["input_dict"]
+        inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
+            Step, "differentially_expressed_proteins_df", inputs["input_dict"]
         )
         step = next(
             s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"]
         )
+        inputs["significant_proteins_df"] = steps.get_step_output(
+            Step, "significant_proteins_df", inputs["input_dict"]
+        )
+
+        inputs["metadata_df"] = steps.metadata_df
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
-        inputs["log2_fc"] = steps.get_step_output(
+        inputs["fc_threshold"] = steps.get_step_output(
             Step, "log2_fold_change_df", inputs["input_dict"]
         )
         return inputs
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 7d96efc2..736b817f 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -918,15 +918,19 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         step_size = 0.05,
         initial = 0.8,
     )
-    selected_protein_group = CustomChoiceField(
-        choices=[],
-        label="Protein group to calculate sample size for",
+    fc_threshold = CustomNumberField(
+        label="Log2 fold change threshold", min_value=0, initial=1
     )
     significant_proteins_only = CustomChoiceField(
         choices=YesNo,
         label="Select only significant proteins",
-        initial = YesNo.yes,
+        initial=YesNo.yes,
     )
+    selected_protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group to calculate sample size for",
+    )
+
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(

From 6d8c9a8eda8d6a0470bc117baa24a18916708c43 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 18 Jun 2024 22:25:10 +0200
Subject: [PATCH 04/36] added variance calculation and testing function and
 edited sample size calculation function

---
 protzilla/data_analysis/power_analysis.py     | 72 ++++++++++++++++++-
 .../data_analysis/test_power_analysis.py      | 26 +++++++
 2 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 tests/protzilla/data_analysis/test_power_analysis.py

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 5246ccad..88a91227 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -6,16 +6,64 @@
 from statsmodels.stats.power import TTestIndPower
 
 
+def variance_protein_group_calculation(
+    intensity_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    protein_id: str,
+    group1: str,
+    group2: str,
+    intensity_name: str = None,
+) -> float:
+    """
+    Function to calculate the variance of a protein group for the two classes and return the maximum variance.
+
+    :param intensity_df: The dataframe containing the protein group intensities.
+    :param metadata_df: The dataframe containing the metadata.
+    :param protein_id: The protein ID.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The variance of the protein group.
+    """
+
+    if intensity_name is None:
+        intensity_name = "Intensity"
+
+    protein_group = intensity_df[intensity_df["Protein ID"] == protein_id]
+
+    protein_group = pd.merge(
+        left=protein_group,
+        right=metadata_df[["Sample", "Group"]],
+        on="Sample",
+        copy=False,
+    )
+
+
+    group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values
+    group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values
+
+    variance_group1 = np.var(group1_intensities, ddof=1)
+    variance_group2 = np.var(group2_intensities, ddof=1)
+
+    max_variance = max(variance_group1, variance_group2)
+
+    return max_variance
+
 def sample_size_calculation(
+    differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    metadata_df: pd.DataFrame,
+    fc_threshold: float,
     alpha: float,
     power: float,
     group1: str,
     group2: str,
+    selected_protein_group: str,
     intensity_name: str = None
 ) -> pd.DataFrame:
     """
-    Function to calculate the required sample size.
+    Function to calculate the required sample size for each significant protein to achieve the required power .
 
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
     :param alpha: The significance level.
@@ -25,7 +73,27 @@ def sample_size_calculation(
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The required sample size.
     """
-    power_analysis_results = []
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+    protein_group = selected_protein_group
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+    z_beta = stats.norm.ppf(power)
+
+    variance_protein_group = variance_protein_group_calculation(
+        intensity_df=differentially_expressed_proteins_df,
+        metadata_df=metadata_df,
+        protein_id=protein_group,
+        group1=group1,
+        group2=group2,
+        intensity_name=intensity_name,
+    )
+
+    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group)
+
+    print(required_sample_size)
+
+    return required_sample_size
 
 
 
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
new file mode 100644
index 00000000..4d9bbbde
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation
+from tests.protzilla.data_analysis.test_differential_expression import diff_expr_test_data
+
+def test_variance_protein_group_calculation(
+        diff_expr_test_data
+):
+    intensity_df, metadata_df = diff_expr_test_data
+
+    protein_id = "Protein1"
+    group1 = "Group1"
+    group2 = "Group2"
+
+    variance = variance_protein_group_calculation(
+        intensity_df, metadata_df, protein_id, group1, group2
+    )
+
+    assert variance == 4.0
+    print(variance)
+
+
+
+

From 0b95cf09db052f8b5c8104667d7baa595b605e51 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 19 Jun 2024 19:22:07 +0200
Subject: [PATCH 05/36] fixed some errors

---
 protzilla/data_analysis/power_analysis.py | 18 +++++-------------
 protzilla/methods/data_analysis.py        |  7 +++----
 ui/runs/forms/data_analysis.py            |  2 +-
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 88a91227..5e170758 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pandas as pd
+import math
 from scipy import stats
 from statsmodels.stats.power import TTestIndPower
 
@@ -27,18 +28,9 @@ def variance_protein_group_calculation(
     """
 
     if intensity_name is None:
-        intensity_name = "Intensity"
-
+        intensity_name = "Normalised iBAQ"
     protein_group = intensity_df[intensity_df["Protein ID"] == protein_id]
 
-    protein_group = pd.merge(
-        left=protein_group,
-        right=metadata_df[["Sample", "Group"]],
-        on="Sample",
-        copy=False,
-    )
-
-
     group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values
     group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values
 
@@ -61,7 +53,7 @@ def sample_size_calculation(
     group2: str,
     selected_protein_group: str,
     intensity_name: str = None
-) -> pd.DataFrame:
+) -> float:
     """
     Function to calculate the required sample size for each significant protein to achieve the required power .
 
@@ -90,10 +82,10 @@ def sample_size_calculation(
     )
 
     required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group)
-
+    required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
-    return required_sample_size
+    return dict(required_sample_size=required_sample_size)
 
 
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 4e66e4a4..af3db103 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -627,7 +627,9 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
         "group2",
         "power",
     ]
-    output_keys = []
+    output_keys = [
+        "required_sample_size",
+    ]
 
     def method(self, inputs: dict) -> dict:
         return sample_size_calculation(**inputs)
@@ -647,7 +649,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
-        inputs["fc_threshold"] = steps.get_step_output(
-            Step, "log2_fold_change_df", inputs["input_dict"]
-        )
         return inputs
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 736b817f..650084e0 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -918,7 +918,7 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         step_size = 0.05,
         initial = 0.8,
     )
-    fc_threshold = CustomNumberField(
+    fc_threshold = CustomFloatField(
         label="Log2 fold change threshold", min_value=0, initial=1
     )
     significant_proteins_only = CustomChoiceField(

From 22c293d007a313739a7881e595ba032097421608 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Thu, 20 Jun 2024 17:33:18 +0200
Subject: [PATCH 06/36] added output field for result

---
 protzilla/methods/data_analysis.py |  4 ++++
 protzilla/steps.py                 | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index af3db103..1b06e98a 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -650,3 +650,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
         return inputs
+
+    def handle_outputs(self, outputs: dict):
+        super().handle_outputs(outputs)
+        self.display_output["required_sample_size"] = outputs["required_sample_size"]
\ No newline at end of file
diff --git a/protzilla/steps.py b/protzilla/steps.py
index 4122c451..4673e961 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None):
         self.messages: Messages = Messages([])
         self.output: Output = Output()
         self.plots: Plots = Plots()
+        self.display_output: DisplayOutput = DisplayOutput()
         self.instance_identifier = instance_identifier
 
         if self.instance_identifier is None:
@@ -306,6 +307,19 @@ def export(self, format_):
                     exports.append(BytesIO(base64.b64decode(plot)))
         return exports
 
+class DisplayOutput:
+
+    def __init__(self, display_output: dict = None):
+        if display_output is None:
+            display_output = []
+        self.display_output = display_output
+    def __iter__(self):
+        return iter(self.display_output)
+    def __repr__(self):
+        return f"DisplayOutput: {self.display_output}"
+    def __contains__(self, key):
+        return key in self.display_output
+
 
 class StepManager:
     def __repr__(self):

From b22b6e742f65a19b225222f7f29381e1f077b3c4 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Fri, 21 Jun 2024 20:24:11 +0200
Subject: [PATCH 07/36] further implementation of output field for result

---
 protzilla/methods/data_analysis.py  |  3 +--
 protzilla/steps.py                  | 10 +++++++++-
 ui/runs/templates/runs/details.html |  7 +++++++
 ui/runs/views.py                    |  8 ++++++++
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index cc49995c..995622f9 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -21,7 +21,7 @@
 from protzilla.data_analysis.power_analysis import sample_size_calculation
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
-from protzilla.steps import Plots, Step, StepManager
+from protzilla.steps import Plots, Step, StepManager, DisplayOutput
 
 
 class DataAnalysisStep(Step):
@@ -672,7 +672,6 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     output_keys = [
         "required_sample_size",
     ]
-
     def method(self, inputs: dict) -> dict:
         return sample_size_calculation(**inputs)
 
diff --git a/protzilla/steps.py b/protzilla/steps.py
index 447f17b7..95c596d2 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -315,7 +315,7 @@ class DisplayOutput:
 
     def __init__(self, display_output: dict = None):
         if display_output is None:
-            display_output = []
+            display_output = {}
         self.display_output = display_output
     def __iter__(self):
         return iter(self.display_output)
@@ -323,6 +323,14 @@ def __repr__(self):
         return f"DisplayOutput: {self.display_output}"
     def __contains__(self, key):
         return key in self.display_output
+    def __getitem__(self, key):
+        return self.display_output[key]
+    def __setitem__(self, key, value):
+        self.display_output[key] = value
+    def is_empty(self) -> bool:
+        return len(self.display_output) == 0
+
+
 
 
 class StepManager:
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index 5809d356..a930f486 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -211,6 +211,13 @@ <h3>{{ display_name }}</h3>
                                 {% endif %}
                             </div>
                         {% endif %}
+                        {% if display_output %}
+                            <div>
+                                <label for="display_output">Outputs:</label>
+                                <textarea class="form-control" id="display_output" rows="5" readonly>{{ display_output_result }}
+                                </textarea>
+                            </div>
+                        {% endif %}
                     </div>
                 {% else %}
                     <p>You are at the end of the run. Go back to add more steps of the same section, or add steps of
diff --git a/ui/runs/views.py b/ui/runs/views.py
index 87aa685d..2c3dc241 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -121,6 +121,12 @@ def detail(request: HttpRequest, run_name: str):
         and Path(run.current_outputs["graph_path"]).exists()
     )
 
+    display_output_form = (
+        run.steps.current_step.display_output is not None
+        and not run.current_step.display_output.is_empty()
+    )
+    display_output_text = f"{run.current_step.display_output}"
+
     return render(
         request,
         "runs/details.html",
@@ -156,6 +162,8 @@ def detail(request: HttpRequest, run_name: str):
             method_form=method_form,
             is_form_dynamic=method_form.is_dynamic,
             plot_form=plot_form,
+            display_output=display_output_form,
+            display_output_result=display_output_text,
         ),
     )
 

From c6a2f3bc9c4e4abc6dc7fb2c8caab30f91da3f4f Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Sun, 23 Jun 2024 02:43:34 +0200
Subject: [PATCH 08/36] display display_output in output field

---
 protzilla/methods/data_analysis.py  | 2 +-
 ui/runs/templates/runs/details.html | 4 ++--
 ui/runs/views.py                    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 995622f9..238ba01a 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -694,4 +694,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 
     def handle_outputs(self, outputs: dict):
         super().handle_outputs(outputs)
-        self.display_output["required_sample_size"] = outputs["required_sample_size"]
\ No newline at end of file
+        self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}"
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index a930f486..361875f7 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -213,8 +213,8 @@ <h3>{{ display_name }}</h3>
                         {% endif %}
                         {% if display_output %}
                             <div>
-                                <label for="display_output">Outputs:</label>
-                                <textarea class="form-control" id="display_output" rows="5" readonly>{{ display_output_result }}
+                                <label for="display_output"></label>
+                                <textarea class="form-control" id="display_output" rows="1" width="100%" style="resize: none" readonly>{{ display_output_result }}
                                 </textarea>
                             </div>
                         {% endif %}
diff --git a/ui/runs/views.py b/ui/runs/views.py
index 2c3dc241..c7fa965b 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str):
         run.steps.current_step.display_output is not None
         and not run.current_step.display_output.is_empty()
     )
-    display_output_text = f"{run.current_step.display_output}"
+    display_output_text = next(iter(run.current_step.display_output.display_output.values()))
 
     return render(
         request,

From 032286c946f25d44f70ff66bd85bfb88a44f1e24 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 25 Jun 2024 13:25:53 +0200
Subject: [PATCH 09/36] display_output field displayed in the same size and
 position as the other fields

---
 ui/runs/static/runs/style.css       |  7 +++++++
 ui/runs/templates/runs/details.html | 15 ++++++++-------
 ui/runs/views.py                    |  2 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css
index 63d66a0b..477e0f11 100644
--- a/ui/runs/static/runs/style.css
+++ b/ui/runs/static/runs/style.css
@@ -75,3 +75,10 @@ html, body {
 #gsea_enrichment_plot_img {
     width: 800px;
 }
+
+.display-output-textarea {
+    display: flex;
+    width: 100%;
+    height: auto;
+    resize: none;
+}
\ No newline at end of file
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index 361875f7..84ec3cfd 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -209,13 +209,14 @@ <h3>{{ display_name }}</h3>
                                         </div>
                                     </form>
                                 {% endif %}
-                            </div>
-                        {% endif %}
-                        {% if display_output %}
-                            <div>
-                                <label for="display_output"></label>
-                                <textarea class="form-control" id="display_output" rows="1" width="100%" style="resize: none" readonly>{{ display_output_result }}
-                                </textarea>
+                                {% if display_output %}
+                                    <div class="mb-5">
+                                        <label for="display_output"></label>
+                                            <textarea class="form-control display-output-textarea" id="display_output" rows="1"
+                                                      readonly>{{ display_output_result }}
+                                            </textarea>
+                                    </div>
+                                {% endif %}
                             </div>
                         {% endif %}
                     </div>
diff --git a/ui/runs/views.py b/ui/runs/views.py
index c7fa965b..4de29692 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -125,7 +125,7 @@ def detail(request: HttpRequest, run_name: str):
         run.steps.current_step.display_output is not None
         and not run.current_step.display_output.is_empty()
     )
-    display_output_text = next(iter(run.current_step.display_output.display_output.values()))
+    display_output_text = next(iter(run.current_step.display_output.display_output.values()), None)
 
     return render(
         request,

From e90fab3447ea88a42c5b64e4f34718861800acef Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 25 Jun 2024 21:53:03 +0200
Subject: [PATCH 10/36] test function for sample_size_calculation

---
 protzilla/data_analysis/power_analysis.py     |  4 -
 protzilla/methods/data_analysis.py            |  2 -
 .../data_analysis/test_power_analysis.py      | 76 +++++++++++++++++--
 ui/runs/templates/runs/details.html           |  2 +-
 4 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 5e170758..c05b7b67 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -9,7 +9,6 @@
 
 def variance_protein_group_calculation(
     intensity_df: pd.DataFrame,
-    metadata_df: pd.DataFrame,
     protein_id: str,
     group1: str,
     group2: str,
@@ -19,7 +18,6 @@ def variance_protein_group_calculation(
     Function to calculate the variance of a protein group for the two classes and return the maximum variance.
 
     :param intensity_df: The dataframe containing the protein group intensities.
-    :param metadata_df: The dataframe containing the metadata.
     :param protein_id: The protein ID.
     :param group1: The name of the first group.
     :param group2: The name of the second group.
@@ -45,7 +43,6 @@ def sample_size_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
     significant_proteins_only: bool,
-    metadata_df: pd.DataFrame,
     fc_threshold: float,
     alpha: float,
     power: float,
@@ -74,7 +71,6 @@ def sample_size_calculation(
 
     variance_protein_group = variance_protein_group_calculation(
         intensity_df=differentially_expressed_proteins_df,
-        metadata_df=metadata_df,
         protein_id=protein_group,
         group1=group1,
         group2=group2,
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 238ba01a..728faa46 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -659,7 +659,6 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
 
     input_keys = [
         "differentially_expressed_proteins_df",
-        "metadata_df",
         "selected_protein_group",
         "significant_proteins_df",
         "significant_proteins_only",
@@ -686,7 +685,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
             Step, "significant_proteins_df", inputs["input_dict"]
         )
 
-        inputs["metadata_df"] = steps.metadata_df
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index 4d9bbbde..ec0bc1b8 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -2,24 +2,88 @@
 import pandas as pd
 import pytest
 
+
 from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation
-from tests.protzilla.data_analysis.test_differential_expression import diff_expr_test_data
+
+
+@pytest.fixture
+def power_test_data():
+    test_differentially_expressed_proteins_list = (
+        ["Sample1", "Protein1", "Gene1", 20, "Group1"],
+        ["Sample1", "Protein2", "Gene1", 16, "Group1"],
+        ["Sample1", "Protein3", "Gene1", 1, "Group1"],
+        ["Sample1", "Protein4", "Gene1", 14, "Group1"],
+        ["Sample2", "Protein1", "Gene1", 20, "Group1"],
+        ["Sample2", "Protein2", "Gene1", 15, "Group1"],
+        ["Sample2", "Protein3", "Gene1", 2, "Group1"],
+        ["Sample2", "Protein4", "Gene1", 15, "Group1"],
+        ["Sample3", "Protein1", "Gene1", 22, "Group1"],
+        ["Sample3", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample3", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample3", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample4", "Protein1", "Gene1", 8, "Group2"],
+        ["Sample4", "Protein2", "Gene1", 15, "Group2"],
+        ["Sample4", "Protein3", "Gene1", 1, "Group2"],
+        ["Sample4", "Protein4", "Gene1", 9, "Group2"],
+        ["Sample5", "Protein1", "Gene1", 10, "Group2"],
+        ["Sample5", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample5", "Protein3", "Gene1", 2, "Group2"],
+        ["Sample5", "Protein4", "Gene1", 10, "Group2"],
+        ["Sample6", "Protein1", "Gene1", 12, "Group2"],
+        ["Sample6", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample6", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample6", "Protein4", "Gene1", 11, "Group2"],
+    )
+
+    test_differentially_expressed_proteins_df = pd.DataFrame(
+        data=test_differentially_expressed_proteins_list,
+        columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"],
+    )
+    return test_differentially_expressed_proteins_df
+
 
 def test_variance_protein_group_calculation(
-        diff_expr_test_data
+        power_test_data
 ):
-    intensity_df, metadata_df = diff_expr_test_data
+    intensity_df = power_test_data
 
     protein_id = "Protein1"
     group1 = "Group1"
     group2 = "Group2"
 
     variance = variance_protein_group_calculation(
-        intensity_df, metadata_df, protein_id, group1, group2
+        intensity_df, protein_id, group1, group2
     )
-
-    assert variance == 4.0
     print(variance)
+    assert variance == 4.0
+
+def test_sample_size_calculation(
+        power_test_data
+
+):
+    test_alpha = 0.05
+    test_power = 0.8
+    test_fc_threshold = 1
+    test_selected_protein_group = "Protein1"
+
+
+    required_sample_size = sample_size_calculation(
+        differentially_expressed_proteins_df=power_test_data,
+        significant_proteins_df=power_test_data,
+        fc_threshold=test_fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1= "Group1",
+        group2= "Group2",
+        selected_protein_group=test_selected_protein_group,
+        significant_proteins_only=False,
+        intensity_name=None
+    )
+    print(required_sample_size)
+    required_sample_size_int = next(iter(required_sample_size.values()),None)
+    assert required_sample_size_int == 63
+
+
 
 
 
diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html
index 84ec3cfd..e7884f99 100644
--- a/ui/runs/templates/runs/details.html
+++ b/ui/runs/templates/runs/details.html
@@ -212,7 +212,7 @@ <h3>{{ display_name }}</h3>
                                 {% if display_output %}
                                     <div class="mb-5">
                                         <label for="display_output"></label>
-                                            <textarea class="form-control display-output-textarea" id="display_output" rows="1"
+                                            <textarea class="form-control display-output-textarea" id="display_output"
                                                       readonly>{{ display_output_result }}
                                             </textarea>
                                     </div>

From d3cf9d89ec64e7f250ac25d22d14124919373f98 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 26 Jun 2024 10:54:01 +0200
Subject: [PATCH 11/36] edited description of function

---
 protzilla/data_analysis/power_analysis.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index c05b7b67..b3f63665 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -52,13 +52,17 @@ def sample_size_calculation(
     intensity_name: str = None
 ) -> float:
     """
-    Function to calculate the required sample size for each significant protein to achieve the required power .
+    Function to calculate the required sample size for a selected protein to achieve the required power .
 
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param alpha: The significance level.
+    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
     :param power: The power of the test.
     :param group1: The name of the first group.
     :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The required sample size.
     """

From 3ce4ae1d7b7d34a0dae705b273a284ce71b83140 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 8 Jul 2024 06:38:21 +0200
Subject: [PATCH 12/36] check if implemented function of Paper (Cairns et al.,
 2009) and library-function of Sample Size Calculation have the same result

---
 .../data_analysis/test_power_analysis.py      | 80 ++++++++++++++++++-
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index ec0bc1b8..27550248 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -3,13 +3,13 @@
 import pytest
 
 
-from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation
+from protzilla.data_analysis.power_analysis import sample_size_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
 
 
 @pytest.fixture
 def power_test_data():
     test_differentially_expressed_proteins_list = (
-        ["Sample1", "Protein1", "Gene1", 20, "Group1"],
+        ["Sample1", "Protein1", "Gene1", 18, "Group1"],
         ["Sample1", "Protein2", "Gene1", 16, "Group1"],
         ["Sample1", "Protein3", "Gene1", 1, "Group1"],
         ["Sample1", "Protein4", "Gene1", 14, "Group1"],
@@ -84,6 +84,82 @@ def test_sample_size_calculation(
     assert required_sample_size_int == 63
 
 
+def test_check_sample_size_calculation_with_libfun(power_test_data):
+    """
+    The statsmodels-based cross-check must report 3 required samples for Protein1.
+    """
+    test_alpha = 0.05
+    test_power = 0.8
+    test_fc_threshold = 5
+    test_selected_protein_group = "Protein1"
+
+    required_sample_size = check_sample_size_calculation_with_libfunc(
+        differentially_expressed_proteins_df=power_test_data,
+        significant_proteins_df=power_test_data,
+        fc_threshold=test_fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        significant_proteins_only=False,
+        intensity_name=None
+    )
+    # Effect size on log2 intensities is about 4.39; solve_power returns about 2.27, rounded up to 3.
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
+    assert required_sample_size_int == 3
+
+def test_check_sample_size_calculation_impl(power_test_data):
+    """
+    The closed-form formula on log2-transformed intensities must report 1 required sample for Protein1.
+    """
+    test_alpha = 0.05
+    test_power = 0.8
+    fc_threshold = 1
+    test_selected_protein_group = "Protein1"
+    power_test_data_log2 = power_test_data.copy()
+    power_test_data_log2["Normalised iBAQ"] = np.log2(power_test_data_log2["Normalised iBAQ"])
+
+    required_sample_size = check_sample_size_calculation_implemented(
+        differentially_expressed_proteins_df=power_test_data_log2,
+        significant_proteins_df=power_test_data,
+        fc_threshold=fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        significant_proteins_only=False,
+        intensity_name=None
+    )
+    # Mean difference ~1.015, pooled variance ~0.0534: 2 * (2.80 / 1.015) ** 2 * 0.0534 ~ 0.81, rounded up to 1.
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
+    assert required_sample_size_int == 1
+
+def test_check_sample_size_calculation_implemented_without_log(power_test_data):
+    """
+    The closed-form formula on untransformed intensities must report 1 required sample for Protein1.
+    """
+    test_alpha = 0.05
+    test_power = 0.8
+    test_fc_threshold = 5
+    test_selected_protein_group = "Protein1"
+
+    required_sample_size = check_sample_size_calculation_implemented_without_log(
+        differentially_expressed_proteins_df=power_test_data,
+        significant_proteins_df=power_test_data,
+        fc_threshold=test_fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        significant_proteins_only=False,
+        intensity_name=None
+    )
+    # Mean difference 10, pooled variance 4: 2 * (2.80 / 10) ** 2 * 4 ~ 0.63, rounded up to 1.
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
+    assert required_sample_size_int == 1
 
 
 

From f78b0b928638a984376b0069547b1dffcc54e474 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 8 Jul 2024 06:39:21 +0200
Subject: [PATCH 13/36] power calculation and test of library-function and
 implemented paper-function

---
 protzilla/data_analysis/power_analysis.py | 201 +++++++++++++++++++++-
 protzilla/methods/data_analysis.py        |  41 ++++-
 ui/runs/forms/data_analysis.py            |  59 ++++++-
 3 files changed, 291 insertions(+), 10 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index b3f63665..a87ec0aa 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -7,7 +7,7 @@
 from statsmodels.stats.power import TTestIndPower
 
 
-def variance_protein_group_calculation(
+def variance_protein_group_calculation_max(
     intensity_df: pd.DataFrame,
     protein_id: str,
     group1: str,
@@ -73,7 +73,7 @@ def sample_size_calculation(
     z_alpha = stats.norm.ppf(1 - alpha / 2)
     z_beta = stats.norm.ppf(power)
 
-    variance_protein_group = variance_protein_group_calculation(
+    variance_protein_group = variance_protein_group_calculation_max(
         intensity_df=differentially_expressed_proteins_df,
         protein_id=protein_group,
         group1=group1,
@@ -87,5 +87,202 @@ def sample_size_calculation(
 
     return dict(required_sample_size=required_sample_size)
 
+def check_sample_size_calculation_with_libfunc(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> dict:
+    """
+    Cross-check of the sample size calculation using statsmodels' TTestIndPower.
+
+    The effect size is the absolute difference of the log2-transformed group means divided by the pooled
+    standard deviation; fc_threshold, significant_proteins_only and intensity_name are accepted but unused.
+    :param differentially_expressed_proteins_df: The t-test output; the columns "Protein ID", "Group" and "Normalised iBAQ" are read.
+    :param significant_proteins_df: The significant proteins from t-test output (only used for validation).
+    :param alpha: The significance level of the two-sided test.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :return: A dict with the key "required_sample_size" holding the solved nobs1, rounded up.
+    :raises ValueError: If the protein group is unknown or the pooled standard deviation is not positive.
+    """
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+
+    group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values)
+    group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values)
+    variance_group1 = np.var(group1_intensities, ddof=1)
+    variance_group2 = np.var(group2_intensities, ddof=1)
+
+    sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2)
+    # A zero or NaN pooled standard deviation leaves the effect size undefined; fail with a clear message.
+    if not sd_pooled > 0:
+        raise ValueError("Cannot compute an effect size: the pooled standard deviation is not positive.")
+    # The test is two-sided, so only the magnitude of the standardised difference matters.
+    effect_size = abs(group1_intensities.mean() - group2_intensities.mean()) / sd_pooled
+
+    # nobs1=None makes solve_power solve for the number of observations in the first sample.
+    required_sample_size = TTestIndPower().solve_power(
+        effect_size=effect_size,
+        alpha=alpha,
+        power=power, nobs1=None, ratio=1.0, alternative='two-sided')
+
+    return dict(required_sample_size=math.ceil(required_sample_size))
+    #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014
+
+    #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
+def check_sample_size_calculation_implemented(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> dict:
+    """
+    Required sample size from the closed-form formula, for comparison with the library result.
+
+    Computes n = 2 * ((z_alpha + z_beta) / delta) ** 2 * pooled_variance, where delta is the observed absolute
+    difference of the group means of "Normalised iBAQ" (used as passed in); the fc_threshold argument is not used.
+    :param differentially_expressed_proteins_df: The t-test output; the columns "Protein ID", "Group" and "Normalised iBAQ" are read.
+    :param significant_proteins_df: The significant proteins from t-test output (only used for validation).
+    :param alpha: The significance level of the two-sided test.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :return: A dict with the key "required_sample_size" holding the result rounded up to an integer.
+    :raises ValueError: If the protein group is unknown or the group means do not differ.
+    """
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+    z_beta = stats.norm.ppf(power)
+    protein_group = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+
+    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
+    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+    # The observed mean difference is the effect to detect; kept in its own name instead of overwriting fc_threshold.
+    mean_diff = abs(group1_intensities.mean() - group2_intensities.mean())
+    if not mean_diff > 0:
+        raise ValueError("The group means do not differ; the required sample size is undefined.")
+
+    pooled_variance = (np.var(group1_intensities, ddof=1) + np.var(group2_intensities, ddof=1)) / 2
+    # n = 2 * ((z_alpha + z_beta) / delta) ** 2 * sigma ** 2
+    required_sample_size = 2 * ((z_alpha + z_beta) / mean_diff) ** 2 * pooled_variance
+
+    return dict(required_sample_size=math.ceil(required_sample_size))
+
+def check_sample_size_calculation_implemented_without_log(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> dict:
+    """
+    Required sample size from the closed-form formula, applied to untransformed intensities.
+
+    The computation is the same as in check_sample_size_calculation_implemented, which uses the
+    "Normalised iBAQ" values exactly as they are passed in, so this function delegates to it.
+    The two names only record whether the caller hands over raw or log2-transformed values.
+    All arguments are forwarded unchanged.
+
+    :param differentially_expressed_proteins_df: The t-test output; the columns "Protein ID", "Group" and "Normalised iBAQ" are read.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: A dict with the key "required_sample_size".
+    :raises ValueError: If the selected protein group is in neither dataframe.
+    """
+
+    # The body used to be a verbatim copy of check_sample_size_calculation_implemented;
+    # delegating keeps the two variants from drifting apart.
+    return check_sample_size_calculation_implemented(
+        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        significant_proteins_df=significant_proteins_df,
+        significant_proteins_only=significant_proteins_only,
+        fc_threshold=fc_threshold,
+        alpha=alpha,
+        power=power,
+        group1=group1,
+        group2=group2,
+        selected_protein_group=selected_protein_group,
+        intensity_name=intensity_name,
+    )
+
+def power_calculation(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    alpha: float,
+    fc_threshold: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> dict:
+    """
+    Function to calculate the power of the t-test for a selected protein group.
+
+    Normal approximation: power = Phi(fc_threshold * sqrt(n / (2 * variance)) - z_alpha), with n the smaller
+    number of distinct samples of group1/group2 that have rows for the selected protein group.
+    :param differentially_expressed_proteins_df: The t-test output; the columns "Protein ID", "Group" and "Sample" are read.
+    :param significant_proteins_df: The significant proteins from t-test output (only used for validation).
+    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user (unused here).
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param fc_threshold: The fold change threshold.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the power is to be calculated.
+    :param intensity_name: The name of the intensity column, forwarded to the variance calculation.
+    :return: A dict with the key "power" holding the power as a float.
+    :raises ValueError: If the selected protein group is in neither dataframe.
+    """
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+    variance_protein_group = variance_protein_group_calculation_max(
+        intensity_df=differentially_expressed_proteins_df, protein_id=selected_protein_group,
+        group1=group1, group2=group2, intensity_name=intensity_name,
+    )
+    # Count distinct samples of the two compared groups for this protein only, so other rows cannot inflate n.
+    protein_rows = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+    samples_per_group = protein_rows[protein_rows["Group"].isin([group1, group2])].groupby("Group")["Sample"].nunique()
+    # The helper already returns a variance (no squaring); the smaller group is used as the conservative n.
+    z_beta = fc_threshold * np.sqrt(samples_per_group.min() / (2 * variance_protein_group)) - z_alpha
+
+    return dict(power=float(stats.norm.cdf(z_beta)))
+
 
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 24319c9a..c37b524d 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -21,7 +21,7 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.power_analysis import sample_size_calculation
+from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.methods.data_preprocessing import TransformationLog
 from protzilla.steps import Plots, Step, StepManager, DisplayOutput
@@ -764,16 +764,49 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PowerAnalysisPowerCalculation(DataAnalysisStep):
     display_name = "Power Calculation"
     operation = "Power Analysis"
-    method_description = "post-hoc Power Calculation"
+    method_description = "Calculates power of the test for given protein groups"
 
     input_keys = [
-        "significant_proteins_df"
+        "significant_proteins_df",
+        "differentially_expressed_proteins_df",
+        "selected_protein_group",
+        "significant_proteins_df",
+        "significant_proteins_only",
+        "fc_threshold",
+        "alpha",
+        "group1",
+        "group2",
     ]
+    output_keys = ["power",]
+
+    def method(self, inputs: dict) -> dict:
+        return power_calculation(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
+            Step, "differentially_expressed_proteins_df", inputs["input_dict"]
+        )
+        step = next(
+            s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"]
+        )
+        inputs["significant_proteins_df"] = steps.get_step_output(
+            Step, "significant_proteins_df", inputs["input_dict"]
+        )
+
+        inputs["alpha"] = step.inputs["alpha"]
+        inputs["group1"] = step.inputs["group1"]
+        inputs["group2"] = step.inputs["group2"]
+        return inputs
+
+    def handle_outputs(self, outputs: dict):
+        super().handle_outputs(outputs)
+        self.display_output["power"] = f"Power of the test: {outputs['power']}"
+
 
 class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     display_name = "Sample Size Calculation"
     operation = "Power Analysis"
-    method_description = "Calculates sample size for protein groups"
+    method_description = "Calculates sample size for given protein groups"
 
     input_keys = [
         "differentially_expressed_proteins_df",
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 1a6b6d34..74c5a149 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1091,11 +1091,12 @@ def fill_form(self, run: Run) -> None:
         if single_protein_peptides:
             self.fields["peptide_df"].initial = single_protein_peptides[0]
 class PowerAnalysisPowerCalculationForm(MethodForm):
-    t_test_results = CustomChoiceField(
+    is_dynamic = True
+
+    input_dict = CustomChoiceField(
         choices=[],
-        label="T-test results",
+        label="Input data dict (generated e.g. by t-Test)",
     )
-    #fill alpha dynamic from t-test
     alpha = CustomFloatField(
         label="Error rate (alpha)",
         min_value = 0,
@@ -1103,8 +1104,58 @@ class PowerAnalysisPowerCalculationForm(MethodForm):
         step_size = 0.05,
         initial = 0.05,
     )
+    fc_threshold = CustomFloatField(
+        label="Log2 fold change threshold", min_value=0, initial=1
+    )
+    significant_proteins_only = CustomChoiceField(
+        choices=YesNo,
+        label="Select only significant proteins",
+        initial=YesNo.yes,
+    )
+    selected_protein_group = CustomChoiceField(
+        choices=[],
+        label="Protein group to calculate power for",
+    )
+
     def fill_form(self, run: Run) -> None:
-        self.fields["t_test_results"].choices = get_t_test_results(run)
+        self.fields["input_dict"].choices = fill_helper.to_choices(
+            run.steps.get_instance_identifiers(
+                DifferentialExpressionTTest,
+                "differentially_expressed_proteins_df",
+            )
+        )
+
+        input_dict_instance_id = self.data.get(
+            "input_dict", self.fields["input_dict"].choices[0][0]
+        )
+
+        self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+            run.steps.get_step_output(
+                Step, "differentially_expressed_proteins_df", input_dict_instance_id
+            )["Protein ID"].unique()
+        )
+
+        significant_proteins_only = self.data.get(
+            "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0]
+        )
+
+        if significant_proteins_only == YesNo.yes:
+            self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "significant_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        else:
+            self.fields["selected_protein_group"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "differentially_expressed_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+
+        self.fields["alpha"].initial = run.steps.get_step_output(
+            Step, "corrected_alpha", input_dict_instance_id
+        )
+
 
 class PowerAnalysisSampleSizeCalculationForm(MethodForm):
     is_dynamic = True

From e3dd1c35e5bb91d9d4648c9946bcb605cb9f1227 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 21 Aug 2024 00:56:51 +0200
Subject: [PATCH 14/36] added test for power_calculation method

---
 protzilla/data_analysis/power_analysis.py     | 52 +++++++++++++++++--
 .../data_analysis/test_power_analysis.py      | 25 ++++++++-
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index a87ec0aa..92e76ba9 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -240,7 +240,7 @@ def check_sample_size_calculation_implemented_without_log(
 
     return dict(required_sample_size=required_sample_size)
 
-def power_calculation(
+def power_calculation_test(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
     significant_proteins_only: bool,
@@ -278,11 +278,55 @@ def power_calculation(
         group2=group2,
         intensity_name=intensity_name,
     )
-    sample_size = differentially_expressed_proteins_df.groupby('Group')['Sample'].count()
-    z_beta = fc_threshold * np.sqrt(sample_size/(2*variance_protein_group**2))-z_alpha
-    power = stats.norm.cdf(z_beta)
+    sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count())
+    z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
+    power = round(stats.norm.cdf(z_beta), 2)
 
     return dict(power=power)
 
 
+def power_calculation(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: bool,
+    alpha: float,
+    fc_threshold: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> float:
+
+    """
+    Function to calculate the power of the t-test for a selected protein group.
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param fc_threshold: The fold change threshold.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the power is to be calculated.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The power of the test.
+    """
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+    protein_group = selected_protein_group
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+
+    variance_protein_group = variance_protein_group_calculation_max(
+        intensity_df=differentially_expressed_proteins_df,
+        protein_id=protein_group,
+        group1=group1,
+        group2=group2,
+        intensity_name=intensity_name,
+    )
+    sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count())
+    z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
+    power = round(stats.norm.cdf(z_beta), 2)
+
+    return dict(power=power)
+
 
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index 27550248..f83f7bf6 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -3,7 +3,7 @@
 import pytest
 
 
-from protzilla.data_analysis.power_analysis import sample_size_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
+from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation_test, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
 
 
 @pytest.fixture
@@ -134,7 +134,7 @@ def test_check_sample_size_calculation_impl(
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
-    assert required_sample_size_int == 63
+    assert required_sample_size_int == 1
 
 def test_check_sample_size_calculation_implemented_without_log(
         power_test_data
@@ -163,4 +163,25 @@ def test_check_sample_size_calculation_implemented_without_log(
 
 
 
+def test_power_calculation(
+        power_test_data
+):
+    test_alpha = 0.05
+    test_fc_threshold = 1
+    test_selected_protein_group = "Protein1"
+
 
+    power = power_calculation_test(
+        differentially_expressed_proteins_df=power_test_data,
+        significant_proteins_df=power_test_data,
+        fc_threshold=test_fc_threshold,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        significant_proteins_only=False,
+        intensity_name=None
+    )
+    print(power)
+    power_int = next(iter(power.values()), None)
+    assert power_int== 0.09

From 2e3de5ad1525e0f042b15df3868cbac62dbaac44 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 21 Aug 2024 20:29:25 +0200
Subject: [PATCH 15/36] fixed constructor error

---
 protzilla/data_analysis/power_analysis.py     | 48 +------------------
 protzilla/methods/data_analysis.py            |  2 +-
 .../data_analysis/test_power_analysis.py      |  4 +-
 user_data/workflows/standard.yaml             |  4 +-
 4 files changed, 7 insertions(+), 51 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 92e76ba9..80d542a2 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -240,51 +240,6 @@ def check_sample_size_calculation_implemented_without_log(
 
     return dict(required_sample_size=required_sample_size)
 
-def power_calculation_test(
-    differentially_expressed_proteins_df: pd.DataFrame,
-    significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
-    alpha: float,
-    fc_threshold: float,
-    group1: str,
-    group2: str,
-    selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
-
-    """
-    Function to calculate the power of the t-test for a selected protein group.
-
-    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
-    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
-    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
-    :param fc_threshold: The fold change threshold.
-    :param group1: The name of the first group.
-    :param group2: The name of the second group.
-    :param selected_protein_group: The selected protein group for which the power is to be calculated.
-    :param intensity_name: The name of the column containing the protein group intensities.
-    :return: The power of the test.
-    """
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
-        raise ValueError("Please select a valid protein group.")
-    protein_group = selected_protein_group
-    z_alpha = stats.norm.ppf(1 - alpha / 2)
-
-    variance_protein_group = variance_protein_group_calculation_max(
-        intensity_df=differentially_expressed_proteins_df,
-        protein_id=protein_group,
-        group1=group1,
-        group2=group2,
-        intensity_name=intensity_name,
-    )
-    sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count())
-    z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
-    power = round(stats.norm.cdf(z_beta), 2)
-
-    return dict(power=power)
-
-
 def power_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -323,9 +278,10 @@ def power_calculation(
         group2=group2,
         intensity_name=intensity_name,
     )
+
     sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count())
     z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
-    power = round(stats.norm.cdf(z_beta), 2)
+    power = float(round(stats.norm.cdf(z_beta), 2))
 
     return dict(power=power)
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index c37b524d..d5b2ccb4 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -777,7 +777,7 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
         "group1",
         "group2",
     ]
-    output_keys = ["power",]
+    output_keys = ["power"]
 
     def method(self, inputs: dict) -> dict:
         return power_calculation(**inputs)
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index f83f7bf6..ceded6e3 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -3,7 +3,7 @@
 import pytest
 
 
-from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation_test, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
+from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
 
 
 @pytest.fixture
@@ -171,7 +171,7 @@ def test_power_calculation(
     test_selected_protein_group = "Protein1"
 
 
-    power = power_calculation_test(
+    power = power_calculation(
         differentially_expressed_proteins_df=power_test_data,
         significant_proteins_df=power_test_data,
         fc_threshold=test_fc_threshold,
diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml
index 3c27017d..970a2a2f 100644
--- a/user_data/workflows/standard.yaml
+++ b/user_data/workflows/standard.yaml
@@ -60,10 +60,10 @@ steps:
     type: DifferentialExpressionTTest
   - form_inputs: { }
     inputs: { }
-    type: PowerAnalysisPowerCalculation
+    type: PowerAnalysisSampleSizeCalculation
   - form_inputs: { }
     inputs: { }
-    type: PowerAnalysisSampleSizeCalculation
+    type: PowerAnalysisPowerCalculation
   - form_inputs:
       fc_threshold: 1
     inputs: { }

From a46a074eeb12539ee39fabb21191f55679445bdd Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Fri, 23 Aug 2024 21:23:28 +0200
Subject: [PATCH 16/36] sample size calculation for different group sizes
 (Cohen 1988) and moved validation methods to separate file

---
 protzilla/data_analysis/power_analysis.py     | 172 ++----------------
 .../power_analysis_validation.py              | 154 ++++++++++++++++
 2 files changed, 170 insertions(+), 156 deletions(-)
 create mode 100644 protzilla/data_analysis/power_analysis_validation.py

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 80d542a2..ec163dd5 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -87,159 +87,6 @@ def sample_size_calculation(
 
     return dict(required_sample_size=required_sample_size)
 
-def check_sample_size_calculation_with_libfunc(
-    differentially_expressed_proteins_df: pd.DataFrame,
-    significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
-    fc_threshold: float,
-    alpha: float,
-    power: float,
-    group1: str,
-    group2: str,
-    selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
-    """
-    Function to calculate the required sample size for a selected protein to achieve the required power .
-
-    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
-    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
-    :param fc_threshold: The fold change threshold.
-    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
-    :param power: The power of the test.
-    :param group1: The name of the first group.
-    :param group2: The name of the second group.
-    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
-    :param intensity_name: The name of the column containing the protein group intensities.
-    :return: The required sample size.
-    """
-
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
-        raise ValueError("Please select a valid protein group.")
-
-    protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values)
-    group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values)
-    variance_group1 = np.var(group1_intensities, ddof=1)
-    variance_group2 = np.var(group2_intensities, ddof=1)
-
-    sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2)
-    mean_diff = abs(group1_intensities.mean() - group2_intensities.mean())
-    effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled
-
-    obj = TTestIndPower()
-    required_sample_size = obj.solve_power(
-        effect_size=effect_size,
-        alpha=alpha,
-        power=power, nobs1=None, ratio=1.0, alternative='two-sided')
-    print(required_sample_size)
-
-    required_sample_size = math.ceil(required_sample_size)
-
-    return dict(required_sample_size=required_sample_size)
-    #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014
-
-    #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
-def check_sample_size_calculation_implemented(
-    differentially_expressed_proteins_df: pd.DataFrame,
-    significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
-    fc_threshold: float,
-    alpha: float,
-    power: float,
-    group1: str,
-    group2: str,
-    selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
-    """
-    Function to calculate the required sample size for a selected protein to achieve the required power .
-
-    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
-    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
-    :param fc_threshold: The fold change threshold.
-    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
-    :param power: The power of the test.
-    :param group1: The name of the first group.
-    :param group2: The name of the second group.
-    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
-    :param intensity_name: The name of the column containing the protein group intensities.
-    :return: The required sample size.
-    """
-
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
-        raise ValueError("Please select a valid protein group.")
-
-    z_alpha = stats.norm.ppf(1 - alpha / 2)
-    z_beta = stats.norm.ppf(power)
-    protein_group = differentially_expressed_proteins_df[
-        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
-    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
-    fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
-    variance_group1 = np.var(group1_intensities, ddof=1)
-    variance_group2 = np.var(group2_intensities, ddof=1)
-
-    pooled_variance = (variance_group1 + variance_group2) / 2
-    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
-    required_sample_size = math.ceil(required_sample_size)
-    print(required_sample_size)
-
-    return dict(required_sample_size=required_sample_size)
-
-def check_sample_size_calculation_implemented_without_log(
-    differentially_expressed_proteins_df: pd.DataFrame,
-    significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
-    fc_threshold: float,
-    alpha: float,
-    power: float,
-    group1: str,
-    group2: str,
-    selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
-    """
-    Function to calculate the required sample size for a selected protein to achieve the required power .
-
-    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
-    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
-    :param fc_threshold: The fold change threshold.
-    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
-    :param power: The power of the test.
-    :param group1: The name of the first group.
-    :param group2: The name of the second group.
-    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
-    :param intensity_name: The name of the column containing the protein group intensities.
-    :return: The required sample size.
-    """
-
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
-        raise ValueError("Please select a valid protein group.")
-
-    z_alpha = stats.norm.ppf(1 - alpha / 2)
-    z_beta = stats.norm.ppf(power)
-    protein_group = differentially_expressed_proteins_df[
-        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
-    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
-    fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
-    variance_group1 = np.var(group1_intensities, ddof=1)
-    variance_group2 = np.var(group2_intensities, ddof=1)
-
-    pooled_variance = (variance_group1 + variance_group2) / 2
-    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
-    required_sample_size = math.ceil(required_sample_size)
-    print(required_sample_size)
-
-    return dict(required_sample_size=required_sample_size)
-
 def power_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -257,7 +104,6 @@ def power_calculation(
 
     :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
     :param alpha: The significance level. The value for alpha is taken from the t-test by default.
     :param fc_threshold: The fold change threshold.
     :param group1: The name of the first group.
@@ -279,8 +125,22 @@ def power_calculation(
         intensity_name=intensity_name,
     )
 
-    sample_size = min(differentially_expressed_proteins_df.groupby(['Group', 'Protein ID'])['Sample'].count())
-    z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
+    """
+    filtered_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group]
+    filtered_df["Person"] = filtered_df["Sample"].apply(
+        lambda x: x[:7])
+
+    variance = filtered_df.groupby(['Person', 'Group'])['Normalised iBAQ'].var().reset_index()
+
+    filtered_df["Measurement"] = filtered_df["Sample"].apply(
+        lambda x: int(x[-2:]))
+    """
+    filtered_protein_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group]
+    grouped_df= filtered_protein_df.groupby(['Group', 'Protein ID'])['Sample'].count()
+    sample_size_group1 =  grouped_df[group1][0]
+    sample_size_group2 =  grouped_df[group2][0]
+    sample_size = (2 * sample_size_group1 * sample_size_group2) / (sample_size_group1 + sample_size_group2) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences
+    z_beta = fc_threshold * np.sqrt(sample_size /  (2 * variance_protein_group)) - z_alpha
     power = float(round(stats.norm.cdf(z_beta), 2))
 
     return dict(power=power)
diff --git a/protzilla/data_analysis/power_analysis_validation.py b/protzilla/data_analysis/power_analysis_validation.py
new file mode 100644
index 00000000..40d517d3
--- /dev/null
+++ b/protzilla/data_analysis/power_analysis_validation.py
@@ -0,0 +1,154 @@
+import numpy as np
+import pandas as pd
+import math
+from scipy import stats
+from statsmodels.stats.power import TTestIndPower
+
+
+
+def check_sample_size_calculation_with_libfunc(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> float:
+    """
+    Function to calculate the required sample size for a selected protein to achieve the required power .
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The required sample size.
+    """
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+
+    group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values)
+    group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values)
+    variance_group1 = np.var(group1_intensities, ddof=1)
+    variance_group2 = np.var(group2_intensities, ddof=1)
+
+    sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2)
+    mean_diff = abs(group1_intensities.mean() - group2_intensities.mean())
+    effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled
+
+    obj = TTestIndPower()
+    required_sample_size = obj.solve_power(
+        effect_size=effect_size,
+        alpha=alpha,
+        power=power, nobs1=None, ratio=1.0, alternative='two-sided')
+    print(required_sample_size)
+
+    required_sample_size = math.ceil(required_sample_size)
+
+    return dict(required_sample_size=required_sample_size)
+    #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014
+
+    #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
+def check_sample_size_calculation_implemented(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> float:
+    """
+    Function to calculate the required sample size for a selected protein to achieve the required power .
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The required sample size.
+    """
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+    z_beta = stats.norm.ppf(power)
+    protein_group = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+
+    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
+    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+    fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
+    variance_group1 = np.var(group1_intensities, ddof=1)
+    variance_group2 = np.var(group2_intensities, ddof=1)
+
+    pooled_variance = (variance_group1 + variance_group2) / 2
+    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
+    required_sample_size = math.ceil(required_sample_size)
+    print(required_sample_size)
+
+    return dict(required_sample_size=required_sample_size)
+
+def check_sample_size_calculation_implemented_without_log(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    selected_protein_group: str,
+    intensity_name: str = None
+) -> float:
+    """
+    Function to calculate the required sample size for a selected protein to achieve the required power .
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: The required sample size.
+    """
+
+    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+        raise ValueError("Please select a valid protein group.")
+
+    z_alpha = stats.norm.ppf(1 - alpha / 2)
+    z_beta = stats.norm.ppf(power)
+    protein_group = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
+
+    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
+    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+    fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
+    variance_group1 = np.var(group1_intensities, ddof=1)
+    variance_group2 = np.var(group2_intensities, ddof=1)
+
+    pooled_variance = (variance_group1 + variance_group2) / 2
+    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
+    required_sample_size = math.ceil(required_sample_size)
+    print(required_sample_size)
+
+    return dict(required_sample_size=required_sample_size)
\ No newline at end of file

From 3446be375e4c4cee0c0481c037db449faf72e02d Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 26 Aug 2024 11:05:29 +0200
Subject: [PATCH 17/36] code formatting, resolved comments (output not a float,
 significant_proteins_only, intensity_name)

---
 .../differential_expression_mann_whitney.py   |  71 +++--
 protzilla/data_analysis/power_analysis.py     |  67 +++--
 .../power_analysis_validation.py              | 101 +++++--
 protzilla/data_analysis/ptm_analysis.py       |  49 +--
 protzilla/data_integration/di_plots.py        |   1 -
 .../data_preprocessing/filter_proteins.py     |   5 +-
 protzilla/data_preprocessing/normalisation.py |  38 +--
 .../data_preprocessing/outlier_detection.py   |  30 +-
 .../data_preprocessing/peptide_filter.py      |   4 +-
 .../data_preprocessing/transformation.py      |   4 +-
 protzilla/importing/ms_data_import.py         |  67 ++++-
 protzilla/importing/peptide_import.py         |   4 +-
 protzilla/methods/data_analysis.py            |  94 ++++--
 protzilla/methods/data_preprocessing.py       |   2 +-
 protzilla/methods/importing.py                |   4 +-
 protzilla/steps.py                            |  10 +-
 protzilla/utilities/transform_dfs.py          |   4 +-
 tests/conftest.py                             | 278 ++++++++++++++++--
 .../data_analysis/test_analysis_plots.py      |   4 +-
 .../test_differential_expression.py           |   2 +-
 .../test_filter_peptites_of_protein.py        |  15 +-
 .../data_analysis/test_peptide_analysis.py    |  74 ++++-
 .../data_analysis/test_plots_data_analysis.py |  26 +-
 .../data_analysis/test_power_analysis.py      |  59 ++--
 .../test_plots_data_integration.py            |  29 +-
 .../data_preprocessing/test_normalisation.py  |  10 +-
 .../test_outlier_detection.py                 |   3 +-
 .../test_peptide_preprocessing.py             |   2 -
 .../importing/test_ms_data_import.py          |   8 +-
 tests/protzilla/test_runner.py                | 107 ++++---
 ui/runs/forms/data_analysis.py                | 102 ++++---
 ui/runs/views.py                              |   6 +-
 .../workflows/overhaul.yaml:Zone.Identifier   |   3 +
 33 files changed, 903 insertions(+), 380 deletions(-)
 create mode 100644 user_data/workflows/overhaul.yaml:Zone.Identifier

diff --git a/protzilla/data_analysis/differential_expression_mann_whitney.py b/protzilla/data_analysis/differential_expression_mann_whitney.py
index 89041261..9c699e58 100644
--- a/protzilla/data_analysis/differential_expression_mann_whitney.py
+++ b/protzilla/data_analysis/differential_expression_mann_whitney.py
@@ -4,19 +4,22 @@
 import pandas as pd
 from scipy import stats
 
-from protzilla.data_analysis.differential_expression_helper import _map_log_base, apply_multiple_testing_correction
+from protzilla.data_analysis.differential_expression_helper import (
+    _map_log_base,
+    apply_multiple_testing_correction,
+)
 from protzilla.utilities.transform_dfs import long_to_wide
 
 
 def mann_whitney_test_on_intensity_data(
-        intensity_df: pd.DataFrame,
-        metadata_df: pd.DataFrame,
-        grouping: str,
-        group1: str,
-        group2: str,
-        log_base: str = None,
-        alpha=0.05,
-        multiple_testing_correction_method: str = "",
+    intensity_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    grouping: str,
+    group1: str,
+    group2: str,
+    log_base: str = None,
+    alpha=0.05,
+    multiple_testing_correction_method: str = "",
 ) -> dict:
     wide_df = long_to_wide(intensity_df)
 
@@ -31,13 +34,24 @@ def mann_whitney_test_on_intensity_data(
         multiple_testing_correction_method=multiple_testing_correction_method,
         columns_name="Protein ID",
     )
-    differentially_expressed_proteins_df = pd.merge(intensity_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left")
+    differentially_expressed_proteins_df = pd.merge(
+        intensity_df,
+        outputs["differential_expressed_columns_df"],
+        on="Protein ID",
+        how="left",
+    )
     differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[
-        differentially_expressed_proteins_df["Protein ID"].isin(outputs["differential_expressed_columns_df"]["Protein ID"])
+        differentially_expressed_proteins_df["Protein ID"].isin(
+            outputs["differential_expressed_columns_df"]["Protein ID"]
+        )
     ]
-    significant_proteins_df = pd.merge(intensity_df, outputs["significant_columns_df"], on="Protein ID", how="left")
+    significant_proteins_df = pd.merge(
+        intensity_df, outputs["significant_columns_df"], on="Protein ID", how="left"
+    )
     significant_proteins_df = significant_proteins_df.loc[
-        significant_proteins_df["Protein ID"].isin(outputs["significant_columns_df"]["Protein ID"])
+        significant_proteins_df["Protein ID"].isin(
+            outputs["significant_columns_df"]["Protein ID"]
+        )
     ]
 
     return dict(
@@ -50,16 +64,17 @@ def mann_whitney_test_on_intensity_data(
         messages=outputs["messages"],
     )
 
+
 def mann_whitney_test_on_columns(
-        df: pd.DataFrame,
-        metadata_df: pd.DataFrame,
-        grouping: str,
-        group1: str,
-        group2: str,
-        log_base: str = None,
-        alpha=0.05,
-        multiple_testing_correction_method: str = "",
-        columns_name: str = "Protein ID",
+    df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    grouping: str,
+    group1: str,
+    group2: str,
+    log_base: str = None,
+    alpha=0.05,
+    multiple_testing_correction_method: str = "",
+    columns_name: str = "Protein ID",
 ) -> dict:
     """
     Perform Mann-Whitney U test on all columns of the data frame.
@@ -104,7 +119,9 @@ def mann_whitney_test_on_columns(
     for column in data_columns:
         group1_data = df_with_groups[df_with_groups[grouping] == group1][column]
         group2_data = df_with_groups[df_with_groups[grouping] == group2][column]
-        u_statistic, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided")
+        u_statistic, p_value = stats.mannwhitneyu(
+            group1_data, group2_data, alternative="two-sided"
+        )
 
         if not np.isnan(p_value):
             log2_fold_change = (
@@ -149,9 +166,13 @@ def mann_whitney_test_on_columns(
 
     significant_columns_df = combined_df[
         combined_df["corrected_p_value"] <= corrected_alpha
-        ]
+    ]
 
-    messages = [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")] if invalid_columns else []
+    messages = (
+        [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")]
+        if invalid_columns
+        else []
+    )
 
     return dict(
         differential_expressed_columns_df=combined_df,
diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index ec163dd5..4a303a18 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -1,10 +1,10 @@
-import logging
+import math
 
 import numpy as np
 import pandas as pd
-import math
 from scipy import stats
-from statsmodels.stats.power import TTestIndPower
+
+from protzilla.utilities import default_intensity_column
 
 
 def variance_protein_group_calculation_max(
@@ -24,13 +24,15 @@ def variance_protein_group_calculation_max(
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The variance of the protein group.
     """
-
-    if intensity_name is None:
-        intensity_name = "Normalised iBAQ"
+    intensity_name = default_intensity_column(intensity_df, intensity_name)
     protein_group = intensity_df[intensity_df["Protein ID"] == protein_id]
 
-    group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values
-    group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values
+    group1_intensities = protein_group[protein_group["Group"] == group1][
+        intensity_name
+    ].values
+    group2_intensities = protein_group[protein_group["Group"] == group2][
+        intensity_name
+    ].values
 
     variance_group1 = np.var(group1_intensities, ddof=1)
     variance_group2 = np.var(group2_intensities, ddof=1)
@@ -39,18 +41,18 @@ def variance_protein_group_calculation_max(
 
     return max_variance
 
+
 def sample_size_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
     fc_threshold: float,
     alpha: float,
     power: float,
     group1: str,
     group2: str,
     selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
+    intensity_name: str = None,
+) -> dict:
     """
     Function to calculate the required sample size for a selected protein to achieve the required power .
 
@@ -67,7 +69,11 @@ def sample_size_calculation(
     :return: The required sample size.
     """
 
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+    if (
+        selected_protein_group not in significant_proteins_df["Protein ID"].values
+        and selected_protein_group
+        not in differentially_expressed_proteins_df["Protein ID"].values
+    ):
         raise ValueError("Please select a valid protein group.")
     protein_group = selected_protein_group
     z_alpha = stats.norm.ppf(1 - alpha / 2)
@@ -81,24 +87,25 @@ def sample_size_calculation(
         intensity_name=intensity_name,
     )
 
-    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group)
+    required_sample_size = (
+        2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group
+    )
     required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
     return dict(required_sample_size=required_sample_size)
 
+
 def power_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
-    significant_proteins_only: bool,
     alpha: float,
     fc_threshold: float,
     group1: str,
     group2: str,
     selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
-
+    intensity_name: str = None,
+) -> dict:
     """
     Function to calculate the power of the t-test for a selected protein group.
 
@@ -112,7 +119,11 @@ def power_calculation(
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The power of the test.
     """
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+    if (
+        selected_protein_group not in significant_proteins_df["Protein ID"].values
+        and selected_protein_group
+        not in differentially_expressed_proteins_df["Protein ID"].values
+    ):
         raise ValueError("Please select a valid protein group.")
     protein_group = selected_protein_group
     z_alpha = stats.norm.ppf(1 - alpha / 2)
@@ -135,14 +146,18 @@ def power_calculation(
     filtered_df["Measurement"] = filtered_df["Sample"].apply(
         lambda x: int(x[-2:]))
     """
-    filtered_protein_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group]
-    grouped_df= filtered_protein_df.groupby(['Group', 'Protein ID'])['Sample'].count()
-    sample_size_group1 =  grouped_df[group1][0]
-    sample_size_group2 =  grouped_df[group2][0]
-    sample_size = (2 * sample_size_group1 * sample_size_group2) / (sample_size_group1 + sample_size_group2) # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences
-    z_beta = fc_threshold * np.sqrt(sample_size /  (2 * variance_protein_group)) - z_alpha
+    filtered_protein_df = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == protein_group
+    ]
+    grouped_df = filtered_protein_df.groupby(["Group", "Protein ID"])["Sample"].count()
+    sample_size_group1 = grouped_df[group1][0]
+    sample_size_group2 = grouped_df[group2][0]
+    sample_size = (2 * sample_size_group1 * sample_size_group2) / (
+        sample_size_group1 + sample_size_group2
+    )  # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences
+    z_beta = (
+        fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha
+    )
     power = float(round(stats.norm.cdf(z_beta), 2))
 
     return dict(power=power)
-
-
diff --git a/protzilla/data_analysis/power_analysis_validation.py b/protzilla/data_analysis/power_analysis_validation.py
index 40d517d3..8351202d 100644
--- a/protzilla/data_analysis/power_analysis_validation.py
+++ b/protzilla/data_analysis/power_analysis_validation.py
@@ -1,11 +1,11 @@
+import math
+
 import numpy as np
 import pandas as pd
-import math
 from scipy import stats
 from statsmodels.stats.power import TTestIndPower
 
 
-
 def check_sample_size_calculation_with_libfunc(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -15,8 +15,8 @@ def check_sample_size_calculation_with_libfunc(
     group1: str,
     group2: str,
     selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
+    intensity_name: str = None,
+) -> dict:
     """
     Function to calculate the required sample size for a selected protein to achieve the required power .
 
@@ -32,33 +32,49 @@ def check_sample_size_calculation_with_libfunc(
     :return: The required sample size.
     """
 
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+    if (
+        selected_protein_group not in significant_proteins_df["Protein ID"].values
+        and selected_protein_group
+        not in differentially_expressed_proteins_df["Protein ID"].values
+    ):
         raise ValueError("Please select a valid protein group.")
 
-    protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values)
-    group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values)
+    protein_group = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group
+    ]
+
+    group1_intensities = np.log2(
+        protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
+    )
+    group2_intensities = np.log2(
+        protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+    )
     variance_group1 = np.var(group1_intensities, ddof=1)
     variance_group2 = np.var(group2_intensities, ddof=1)
 
     sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2)
-    mean_diff = abs(group1_intensities.mean() - group2_intensities.mean())
-    effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled
+    abs(group1_intensities.mean() - group2_intensities.mean())
+    effect_size = (group1_intensities.mean() - group2_intensities.mean()) / sd_pooled
 
     obj = TTestIndPower()
     required_sample_size = obj.solve_power(
         effect_size=effect_size,
         alpha=alpha,
-        power=power, nobs1=None, ratio=1.0, alternative='two-sided')
+        power=power,
+        nobs1=None,
+        ratio=1.0,
+        alternative="two-sided",
+    )
     print(required_sample_size)
 
     required_sample_size = math.ceil(required_sample_size)
 
     return dict(required_sample_size=required_sample_size)
-    #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014
+    # required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014
+
+    # impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
+
 
-    #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
 def check_sample_size_calculation_implemented(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -68,8 +84,8 @@ def check_sample_size_calculation_implemented(
     group1: str,
     group2: str,
     selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
+    intensity_name: str = None,
+) -> dict:
     """
     Function to calculate the required sample size for a selected protein to achieve the required power .
 
@@ -85,27 +101,39 @@ def check_sample_size_calculation_implemented(
     :return: The required sample size.
     """
 
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+    if (
+        selected_protein_group not in significant_proteins_df["Protein ID"].values
+        and selected_protein_group
+        not in differentially_expressed_proteins_df["Protein ID"].values
+    ):
         raise ValueError("Please select a valid protein group.")
 
     z_alpha = stats.norm.ppf(1 - alpha / 2)
     z_beta = stats.norm.ppf(power)
     protein_group = differentially_expressed_proteins_df[
-        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
-    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group
+    ]
+
+    group1_intensities = protein_group[protein_group["Group"] == group1][
+        "Normalised iBAQ"
+    ].values
+    group2_intensities = protein_group[protein_group["Group"] == group2][
+        "Normalised iBAQ"
+    ].values
     fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
     variance_group1 = np.var(group1_intensities, ddof=1)
     variance_group2 = np.var(group2_intensities, ddof=1)
 
     pooled_variance = (variance_group1 + variance_group2) / 2
-    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
+    required_sample_size = (
+        2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance
+    )
     required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
     return dict(required_sample_size=required_sample_size)
 
+
 def check_sample_size_calculation_implemented_without_log(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -115,8 +143,8 @@ def check_sample_size_calculation_implemented_without_log(
     group1: str,
     group2: str,
     selected_protein_group: str,
-    intensity_name: str = None
-) -> float:
+    intensity_name: str = None,
+) -> dict:
     """
     Function to calculate the required sample size for a selected protein to achieve the required power .
 
@@ -132,23 +160,34 @@ def check_sample_size_calculation_implemented_without_log(
     :return: The required sample size.
     """
 
-    if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values:
+    if (
+        selected_protein_group not in significant_proteins_df["Protein ID"].values
+        and selected_protein_group
+        not in differentially_expressed_proteins_df["Protein ID"].values
+    ):
         raise ValueError("Please select a valid protein group.")
 
     z_alpha = stats.norm.ppf(1 - alpha / 2)
     z_beta = stats.norm.ppf(power)
     protein_group = differentially_expressed_proteins_df[
-        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group]
-
-    group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values
-    group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values
+        differentially_expressed_proteins_df["Protein ID"] == selected_protein_group
+    ]
+
+    group1_intensities = protein_group[protein_group["Group"] == group1][
+        "Normalised iBAQ"
+    ].values
+    group2_intensities = protein_group[protein_group["Group"] == group2][
+        "Normalised iBAQ"
+    ].values
     fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean())
     variance_group1 = np.var(group1_intensities, ddof=1)
     variance_group2 = np.var(group2_intensities, ddof=1)
 
     pooled_variance = (variance_group1 + variance_group2) / 2
-    required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance)
+    required_sample_size = (
+        2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * pooled_variance
+    )
     required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
-    return dict(required_sample_size=required_sample_size)
\ No newline at end of file
+    return dict(required_sample_size=required_sample_size)
diff --git a/protzilla/data_analysis/ptm_analysis.py b/protzilla/data_analysis/ptm_analysis.py
index 7917699d..8368b73d 100644
--- a/protzilla/data_analysis/ptm_analysis.py
+++ b/protzilla/data_analysis/ptm_analysis.py
@@ -1,15 +1,15 @@
 import logging
-from math import log
+import re
 
 import numpy as np
 import pandas as pd
-import re
 
 from protzilla.utilities.transform_dfs import long_to_wide
 
 
 def filter_peptides_of_protein(
-        peptide_df: pd.DataFrame, protein_ids: list[str],
+    peptide_df: pd.DataFrame,
+    protein_ids: list[str],
 ) -> dict:
     """
     This function filters out all peptides with a PEP value (assigned to all samples
@@ -23,15 +23,21 @@ def filter_peptides_of_protein(
 
     filtered_peptide_dfs = [pd.DataFrame] * len(protein_ids)
     for i, protein_id in enumerate(protein_ids):
-        filtered_peptide_dfs[i] = peptide_df[peptide_df["Protein ID"].str.contains(protein_id)]
+        filtered_peptide_dfs[i] = peptide_df[
+            peptide_df["Protein ID"].str.contains(protein_id)
+        ]
     filtered_peptides = pd.concat(filtered_peptide_dfs)
 
     return dict(
         peptide_df=filtered_peptides,
-        messages=[{
-            "level": logging.INFO if len(filtered_peptides) > 0 else logging.WARNING,
-            "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe."
-        }],
+        messages=[
+            {
+                "level": logging.INFO
+                if len(filtered_peptides) > 0
+                else logging.WARNING,
+                "msg": f"Selected {len(filtered_peptides)} entry's from the peptide dataframe.",
+            }
+        ],
     )
 
 
@@ -48,8 +54,12 @@ def ptms_per_sample(peptide_df: pd.DataFrame) -> dict:
     modification_df = peptide_df[["Sample", "Modifications"]]
 
     modification_df = pd.concat(
-        [modification_df["Sample"],
-         (modification_df['Modifications'].str.get_dummies(sep=","))], axis=1)
+        [
+            modification_df["Sample"],
+            (modification_df["Modifications"].str.get_dummies(sep=",")),
+        ],
+        axis=1,
+    )
 
     for column, data in modification_df.iteritems():
         amount, name = from_string(column)
@@ -80,7 +90,7 @@ def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict:
     modification_df = peptide_df[["Sample", "Protein ID", "Modifications"]]
 
     modification_df = modification_df[["Sample", "Protein ID"]].join(
-        modification_df['Modifications'].str.get_dummies(sep=",")
+        modification_df["Modifications"].str.get_dummies(sep=",")
     )
 
     for column, data in modification_df.iteritems():
@@ -95,16 +105,19 @@ def ptms_per_protein_and_sample(peptide_df: pd.DataFrame) -> dict:
 
     modification_df = modification_df.reset_index()
 
-    modi = (
-        modification_df.drop(["Sample", "Protein ID"], axis=1).apply(lambda x: ('(' + x.astype(str) + ') ' + x.name + ", ")))
+    modi = modification_df.drop(["Sample", "Protein ID"], axis=1).apply(
+        lambda x: ("(" + x.astype(str) + ") " + x.name + ", ")
+    )
 
     for column, data in modi.iteritems():
         modi[column] = np.where(modification_df[column] > 0, modi[column], "")
 
-    modification_df["Modifications"] = modi.apply(''.join, axis=1)
-    modification_df = modification_df[['Sample', 'Protein ID', 'Modifications']]
+    modification_df["Modifications"] = modi.apply("".join, axis=1)
+    modification_df = modification_df[["Sample", "Protein ID", "Modifications"]]
 
-    modification_df = long_to_wide(modification_df, "Modifications").fillna("").reset_index()
+    modification_df = (
+        long_to_wide(modification_df, "Modifications").fillna("").reset_index()
+    )
 
     return dict(ptm_df=modification_df)
 
@@ -118,9 +131,9 @@ def from_string(mod_string: str) -> tuple[int, str]:
     :return: tuple containing the amount and name of the modification
     """
 
-    re_search = re.search(r'\d+', mod_string)
+    re_search = re.search(r"\d+", mod_string)
     amount = int(re_search.group()) if re_search else 1
-    name = re.search(r'\D+', mod_string).group()
+    name = re.search(r"\D+", mod_string).group()
     name = name[1:] if name[0] == " " else name
 
     return amount, name
diff --git a/protzilla/data_integration/di_plots.py b/protzilla/data_integration/di_plots.py
index 74d50ff1..ef19515f 100644
--- a/protzilla/data_integration/di_plots.py
+++ b/protzilla/data_integration/di_plots.py
@@ -108,7 +108,6 @@ def GO_enrichment_bar_plot(
     elif value == "p-value":
         column = "P-value" if restring_input else "Adjusted P-value"
 
-
     if colors == "" or colors is None or len(colors) == 0:
         colors = PROTZILLA_DISCRETE_COLOR_SEQUENCE
     size_y = top_terms * 0.5 * len(gene_sets)
diff --git a/protzilla/data_preprocessing/filter_proteins.py b/protzilla/data_preprocessing/filter_proteins.py
index 5e0bb7b8..479503f8 100644
--- a/protzilla/data_preprocessing/filter_proteins.py
+++ b/protzilla/data_preprocessing/filter_proteins.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot
+
 from ..utilities.transform_dfs import long_to_wide
 
 
@@ -30,9 +31,7 @@ def by_samples_missing(
     filtered_proteins_list = (
         transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist()
     )
-    filtered_df = protein_df[
-        (protein_df["Protein ID"].isin(remaining_proteins_list))
-    ]
+    filtered_df = protein_df[(protein_df["Protein ID"].isin(remaining_proteins_list))]
     filtered_peptide_df = None
     if peptide_df is not None:
         filtered_peptide_df = peptide_df[
diff --git a/protzilla/data_preprocessing/normalisation.py b/protzilla/data_preprocessing/normalisation.py
index e2be755b..ec4398bf 100644
--- a/protzilla/data_preprocessing/normalisation.py
+++ b/protzilla/data_preprocessing/normalisation.py
@@ -229,64 +229,50 @@ def by_reference_protein(
 
 
 def by_z_score_plot(
-        method_inputs,
-        method_outputs,
-        graph_type,
-        group_by,
-        visual_transformation
+    method_inputs, method_outputs, graph_type, group_by, visual_transformation
 ):
     return _build_box_hist_plot(
         method_inputs["protein_df"],
         method_outputs["protein_df"],
         graph_type,
         group_by,
-        visual_transformation
+        visual_transformation,
     )
 
 
 def by_median_plot(
-        method_inputs,
-        method_outputs,
-        graph_type,
-        group_by,
-        visual_transformation
+    method_inputs, method_outputs, graph_type, group_by, visual_transformation
 ):
     return _build_box_hist_plot(
         method_inputs["protein_df"],
         method_outputs["protein_df"],
-        graph_type, group_by,
-        visual_transformation
+        graph_type,
+        group_by,
+        visual_transformation,
     )
 
 
 def by_totalsum_plot(
-        method_inputs,
-        method_outputs,
-        graph_type,
-        group_by,
-        visual_transformation
+    method_inputs, method_outputs, graph_type, group_by, visual_transformation
 ):
     return _build_box_hist_plot(
         method_inputs["protein_df"],
         method_outputs["protein_df"],
-        graph_type, group_by,
-        visual_transformation
+        graph_type,
+        group_by,
+        visual_transformation,
     )
 
 
 def by_reference_protein_plot(
-        method_inputs,
-        method_outputs,
-        graph_type,
-        group_by,
-        visual_transformation
+    method_inputs, method_outputs, graph_type, group_by, visual_transformation
 ):
     return _build_box_hist_plot(
         method_inputs["protein_df"],
         method_outputs["protein_df"],
         graph_type,
         group_by,
-        visual_transformation
+        visual_transformation,
     )
 
 
diff --git a/protzilla/data_preprocessing/outlier_detection.py b/protzilla/data_preprocessing/outlier_detection.py
index be7b8eb8..b5008830 100644
--- a/protzilla/data_preprocessing/outlier_detection.py
+++ b/protzilla/data_preprocessing/outlier_detection.py
@@ -10,14 +10,15 @@
     create_pca_2d_scatter_plot,
     create_pca_3d_scatter_plot,
 )
+
 from ..utilities.transform_dfs import long_to_wide
 
 
 def by_isolation_forest(
-        protein_df: pd.DataFrame,
-        peptide_df: pd.DataFrame | None,
-        n_estimators: int = 100,
-        n_jobs: int = -1,
+    protein_df: pd.DataFrame,
+    peptide_df: pd.DataFrame | None,
+    n_estimators: int = 100,
+    n_jobs: int = -1,
 ) -> dict:
     """
     This function filters out outliers using a clustering
@@ -62,8 +63,11 @@ def by_isolation_forest(
         ].index.tolist()
 
         protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))]
-        peptide_df = (None if peptide_df is None
-                      else peptide_df[~(peptide_df["Sample"].isin(outlier_list))])
+        peptide_df = (
+            None
+            if peptide_df is None
+            else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]
+        )
 
         return dict(
             protein_df=protein_df,
@@ -125,8 +129,11 @@ def by_local_outlier_factor(
         outlier_list = df_lof_data[df_lof_data["Outlier"]].index.tolist()
 
         protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))]
-        peptide_df = (None if peptide_df is None
-                      else peptide_df[~(peptide_df["Sample"].isin(outlier_list))])
+        peptide_df = (
+            None
+            if peptide_df is None
+            else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]
+        )
 
         return dict(
             protein_df=protein_df,
@@ -232,8 +239,11 @@ def by_pca(
             df_transformed_pca_data["Outlier"]
         ].index.tolist()
         protein_df = protein_df[~(protein_df["Sample"].isin(outlier_list))]
-        peptide_df = (None if peptide_df is None
-                      else peptide_df[~(peptide_df["Sample"].isin(outlier_list))])
+        peptide_df = (
+            None
+            if peptide_df is None
+            else peptide_df[~(peptide_df["Sample"].isin(outlier_list))]
+        )
 
         return dict(
             protein_df=protein_df,
diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py
index 3b1caee9..80dafe9e 100644
--- a/protzilla/data_preprocessing/peptide_filter.py
+++ b/protzilla/data_preprocessing/peptide_filter.py
@@ -3,9 +3,7 @@
 from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot
 
 
-def by_pep_value(
-    peptide_df: pd.DataFrame, threshold: float
-) -> dict:
+def by_pep_value(peptide_df: pd.DataFrame, threshold: float) -> dict:
     """
     This function filters out all peptides with a PEP value (assigned to all samples
     together for each peptide) below a certain threshold.
diff --git a/protzilla/data_preprocessing/transformation.py b/protzilla/data_preprocessing/transformation.py
index 221b01ab..401ea396 100644
--- a/protzilla/data_preprocessing/transformation.py
+++ b/protzilla/data_preprocessing/transformation.py
@@ -5,7 +5,9 @@
 from protzilla.utilities import default_intensity_column
 
 
-def by_log(protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10") -> dict:
+def by_log(
+    protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10"
+) -> dict:
     """
     This function log-transforms intensity
     DataFrames. Supports log-transformation to the base
diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py
index c3d9136f..8dce747b 100644
--- a/protzilla/importing/ms_data_import.py
+++ b/protzilla/importing/ms_data_import.py
@@ -11,7 +11,10 @@
 
 
 def max_quant_import(
-    file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
+    file_path: str,
+    intensity_name: str,
+    map_to_uniprot=False,
+    aggregation_method: str = "Sum",
 ) -> dict:
     assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
     try:
@@ -34,15 +37,28 @@ def max_quant_import(
             c[len(intensity_name) + 1 :] for c in intensity_df.columns
         ]
         intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})
-        return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
+        return transform_and_clean(
+            intensity_df, intensity_name, map_to_uniprot, aggregation_method
+        )
 
     except Exception as e:
         msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file."
-        return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+        return dict(
+            messages=[
+                dict(
+                    level=logging.ERROR,
+                    msg=msg,
+                    trace=format_trace(traceback.format_exception(e)),
+                )
+            ]
+        )
 
 
 def ms_fragger_import(
-    file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
+    file_path: str,
+    intensity_name: str,
+    map_to_uniprot=False,
+    aggregation_method: str = "Sum",
 ) -> dict:
     assert intensity_name in [
         "Intensity",
@@ -87,13 +103,25 @@ def ms_fragger_import(
         )
         intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})
 
-        return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
+        return transform_and_clean(
+            intensity_df, intensity_name, map_to_uniprot, aggregation_method
+        )
     except Exception as e:
         msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid MS Fragger file."
-        return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+        return dict(
+            messages=[
+                dict(
+                    level=logging.ERROR,
+                    msg=msg,
+                    trace=format_trace(traceback.format_exception(e)),
+                )
+            ]
+        )
 
 
-def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict:
+def diann_import(
+    file_path, map_to_uniprot=False, aggregation_method: str = "Sum"
+) -> dict:
     try:
         df = pd.read_csv(
             file_path,
@@ -117,14 +145,27 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum"
 
         intensity_name = "Intensity"
 
-        return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
+        return transform_and_clean(
+            intensity_df, intensity_name, map_to_uniprot, aggregation_method
+        )
     except Exception as e:
         msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file."
-        return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+        return dict(
+            messages=[
+                dict(
+                    level=logging.ERROR,
+                    msg=msg,
+                    trace=format_trace(traceback.format_exception(e)),
+                )
+            ]
+        )
 
 
 def transform_and_clean(
-    df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum"
+    df: pd.DataFrame,
+    intensity_name: str,
+    map_to_uniprot: bool,
+    aggregation_method: str = "Sum",
 ) -> dict:
     """
     Transforms a dataframe that is read from a file in wide format into long format,
@@ -158,7 +199,9 @@ def transform_and_clean(
     # applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise
     aggregation_method = aggregation_method.lower()
     agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}}
-    df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method])
+    df = df.groupby("Protein ID", as_index=False).agg(
+        aggregation_method, **agg_kwargs[aggregation_method]
+    )
 
     df = df.assign(Gene=lambda _: np.nan)  # add deprecated genes column
 
@@ -230,7 +273,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
                 all_ids_of_group.extend(new_ids)
             else:
                 all_ids_of_group.append(old_id)
-        new_groups.append(all_ids_of_group[0] if all_ids_of_group else '')
+        new_groups.append(all_ids_of_group[0] if all_ids_of_group else "")
     return new_groups, removed_protein_ids
 
 
diff --git a/protzilla/importing/peptide_import.py b/protzilla/importing/peptide_import.py
index d38495dd..3056f1d3 100644
--- a/protzilla/importing/peptide_import.py
+++ b/protzilla/importing/peptide_import.py
@@ -47,9 +47,7 @@ def peptide_import(file_path, intensity_name, map_to_uniprot) -> dict:
     )
 
     molten = molten.rename(columns={"Leading razor protein": "Protein ID"})
-    ordered = molten[
-        ["Sample", "Protein ID", "Sequence", "Intensity", "PEP"]
-    ]
+    ordered = molten[["Sample", "Protein ID", "Sequence", "Intensity", "PEP"]]
     ordered.dropna(subset=["Protein ID"], inplace=True)
     ordered.sort_values(by=["Sample", "Protein ID"], ignore_index=True, inplace=True)
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index d5b2ccb4..c1150335 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -8,12 +8,12 @@
 )
 from protzilla.data_analysis.differential_expression_anova import anova
 from protzilla.data_analysis.differential_expression_linear_model import linear_model
-from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_columns, \
-    mann_whitney_test_on_intensity_data
+from protzilla.data_analysis.differential_expression_mann_whitney import (
+    mann_whitney_test_on_columns,
+    mann_whitney_test_on_intensity_data,
+)
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
-from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
-    ptms_per_protein_and_sample
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
 from protzilla.data_analysis.plots import (
     clustergram_plot,
@@ -21,10 +21,18 @@
     prot_quant_plot,
     scatter_plot,
 )
-from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation
+from protzilla.data_analysis.power_analysis import (
+    power_calculation,
+    sample_size_calculation,
+)
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
+from protzilla.data_analysis.ptm_analysis import (
+    filter_peptides_of_protein,
+    ptms_per_protein_and_sample,
+    ptms_per_sample,
+)
 from protzilla.methods.data_preprocessing import TransformationLog
-from protzilla.steps import Plots, Step, StepManager, DisplayOutput
+from protzilla.steps import Plots, Step, StepManager
 
 
 class DataAnalysisStep(Step):
@@ -157,8 +165,10 @@ def plot(self, inputs):
 class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep):
     display_name = "Mann-Whitney Test"
     operation = "differential_expression"
-    method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "intensity_df",
@@ -181,8 +191,13 @@ def method(self, inputs: dict) -> dict:
         return mann_whitney_test_on_intensity_data(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        if steps.get_step_output(Step, "protein_df", inputs["intensity_df"]) is not None:
-            inputs["intensity_df"] = steps.get_step_output(Step, "protein_df", inputs["intensity_df"])
+        if (
+            steps.get_step_output(Step, "protein_df", inputs["intensity_df"])
+            is not None
+        ):
+            inputs["intensity_df"] = steps.get_step_output(
+                Step, "protein_df", inputs["intensity_df"]
+            )
         inputs["metadata_df"] = steps.metadata_df
         inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base")
         return inputs
@@ -191,8 +206,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep):
     display_name = "Mann-Whitney Test"
     operation = "Peptide analysis"
-    method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "df",
@@ -223,7 +240,9 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
     def handle_outputs(self, outputs: dict) -> None:
-        outputs["differentially_expressed_ptm_df"] = outputs.pop("differential_expressed_columns_df", None)
+        outputs["differentially_expressed_ptm_df"] = outputs.pop(
+            "differential_expressed_columns_df", None
+        )
         outputs["significant_ptm_df"] = outputs.pop("significant_columns_df", None)
         super().handle_outputs(outputs)
 
@@ -702,17 +721,23 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         )
 
         if inputs["auto_select"]:
-            significant_proteins = (
-                steps.get_step_output(DataAnalysisStep, "significant_proteins_df", inputs["protein_list"]))
-            index_of_most_significant_protein = significant_proteins['corrected_p_value'].idxmin()
-            most_significant_protein = significant_proteins.loc[index_of_most_significant_protein]
+            significant_proteins = steps.get_step_output(
+                DataAnalysisStep, "significant_proteins_df", inputs["protein_list"]
+            )
+            index_of_most_significant_protein = significant_proteins[
+                "corrected_p_value"
+            ].idxmin()
+            most_significant_protein = significant_proteins.loc[
+                index_of_most_significant_protein
+            ]
             inputs["protein_id"] = [most_significant_protein["Protein ID"]]
-            self.messages.append({
-                "level": logging.INFO,
-                "msg":
-                    f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, "
-                    f"from {inputs['protein_list']}"
-            })
+            self.messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, "
+                    f"from {inputs['protein_list']}",
+                }
+            )
 
         return inputs
 
@@ -720,8 +745,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
-    method_description = ("Analyze the post-translational modifications (PTMs) of a single protein of interest. "
-                          "This function requires a peptide dataframe with PTM information.")
+    method_description = (
+        "Analyze the post-translational modifications (PTMs) of a single protein of interest. "
+        "This function requires a peptide dataframe with PTM information."
+    )
 
     input_keys = [
         "peptide_df",
@@ -743,8 +770,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PTMsProteinAndPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample and Protein"
     operation = "Peptide analysis"
-    method_description = ("Analyze the post-translational modifications (PTMs) of all Proteins. "
-                          "This function requires a peptide dataframe with PTM information.")
+    method_description = (
+        "Analyze the post-translational modifications (PTMs) of all Proteins. "
+        "This function requires a peptide dataframe with PTM information."
+    )
 
     input_keys = [
         "peptide_df",
@@ -761,6 +790,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
             Step, "peptide_df", inputs["peptide_df"]
         )
         return inputs
+
+
 class PowerAnalysisPowerCalculation(DataAnalysisStep):
     display_name = "Power Calculation"
     operation = "Power Analysis"
@@ -780,6 +811,8 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
     output_keys = ["power"]
 
     def method(self, inputs: dict) -> dict:
+        if "significant_proteins_only" in inputs:
+            del inputs["significant_proteins_only"]
         return power_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
@@ -822,7 +855,10 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     output_keys = [
         "required_sample_size",
     ]
+
     def method(self, inputs: dict) -> dict:
+        if "significant_proteins_only" in inputs:
+            del inputs["significant_proteins_only"]
         return sample_size_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
@@ -843,4 +879,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 
     def handle_outputs(self, outputs: dict):
         super().handle_outputs(outputs)
-        self.display_output["required_sample_size"] = f"Required Sample Size: {outputs['required_sample_size']}"
+        self.display_output[
+            "required_sample_size"
+        ] = f"Required Sample Size: {outputs['required_sample_size']}"
diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py
index 0565eaf0..9099627e 100644
--- a/protzilla/methods/data_preprocessing.py
+++ b/protzilla/methods/data_preprocessing.py
@@ -168,7 +168,7 @@ class TransformationLog(DataPreprocessingStep):
     operation = "transformation"
     method_description = "Transform data by log"
 
-    input_keys = [ "protein_df", "peptide_df", "log_base"]
+    input_keys = ["protein_df", "peptide_df", "log_base"]
 
     def method(self, inputs):
         return transformation.by_log(**inputs)
diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
index 7cde1ba0..1cec5aaa 100644
--- a/protzilla/methods/importing.py
+++ b/protzilla/methods/importing.py
@@ -10,7 +10,7 @@
     max_quant_import,
     ms_fragger_import,
 )
-from protzilla.importing.peptide_import import peptide_import, evidence_import
+from protzilla.importing.peptide_import import evidence_import, peptide_import
 from protzilla.steps import Step, StepManager
 
 
@@ -139,4 +139,4 @@ class EvidenceImport(ImportingStep):
     output_keys = ["peptide_df"]
 
     def method(self, inputs):
-        return evidence_import(**inputs)
\ No newline at end of file
+        return evidence_import(**inputs)
diff --git a/protzilla/steps.py b/protzilla/steps.py
index 32ce93b3..185e4f3e 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -311,28 +311,32 @@ def export(self, format_):
                     exports.append(BytesIO(base64.b64decode(plot)))
         return exports
 
-class DisplayOutput:
 
+class DisplayOutput:
     def __init__(self, display_output: dict = None):
         if display_output is None:
             display_output = {}
         self.display_output = display_output
+
     def __iter__(self):
         return iter(self.display_output)
+
     def __repr__(self):
         return f"DisplayOutput: {self.display_output}"
+
     def __contains__(self, key):
         return key in self.display_output
+
     def __getitem__(self, key):
         return self.display_output[key]
+
     def __setitem__(self, key, value):
         self.display_output[key] = value
+
     def is_empty(self) -> bool:
         return len(self.display_output) == 0
 
 
-
-
 class StepManager:
     def __repr__(self):
         return f"IMP: {self.importing} PRE: {self.data_preprocessing} ANA: {self.data_analysis} INT: {self.data_integration}"
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index f3605b08..4ca02446 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -17,7 +17,9 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str = None):
         packages such as sklearn
     :rtype: pd.DataFrame
     """
-    values_name = default_intensity_column(intensity_df) if value_name is None else value_name
+    values_name = (
+        default_intensity_column(intensity_df) if value_name is None else value_name
+    )
     return pd.pivot(
         intensity_df, index="Sample", columns="Protein ID", values=values_name
     )
diff --git a/tests/conftest.py b/tests/conftest.py
index ea8728b1..1bc23e04 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -207,29 +207,259 @@ def peptides_df():
 def evidence_peptide_df():
     df = pd.DataFrame(
         (
-            ["Sample1", "Protein1", "SEQA", 1000000, "Unmodified", "_SEQA_", 1, 0.00001, "Raw_File_1"],
-            ["Sample1", "Protein2", "SEQB", 2000000, "Unmodified", "_SEQB_", None, 0.00002, "Raw_File_1"],
-            ["Sample1", "Protein2", "SEQC", 3000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQC_", None, 0.00003, "Raw_File_1"],
-            ["Sample1", "Protein2", "SEQD", 4000000, "Acetyl (Protein N-term),Oxidation (M)", "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_", None, 0.00004, "Raw_File_1"],
-            ["Sample1", "Protein3", "SEQE", 5000000, "Unmodified", "_SEQE_", None, 0.00005, "Raw_File_1"],
-            ["Sample1", "Protein3", "SEQF", 6000000, "Unmodified", "_SEQF_", None, 0.00006, "Raw_File_1"],
-            ["Sample1", "Protein3", "SEQG", 7000000, "Unmodified", "_SEQG_", None, 0.00007, "Raw_File_1"],
-            ["Sample1", "Protein4", "SEQH", 8000000, "Unmodified", "_SEQH_", None, 0.00008, "Raw_File_1"],
-            ["Sample1", "Protein5", "SEQI", 9000000, "Unmodified", "_SEQI_", None, 0.00009, "Raw_File_1"],
-            ["Sample2", "Protein1", "SEQJ", 10000000, "Acetyl (Protein N-term)", "_(Acetyl (Protein N-term))SEQJ_", None, 0.0001, "Raw_File_2"],
-            ["Sample2", "Protein2", "SEQK", 11000000, "Unmodified", "_SEQK_", None, 0.00011, "Raw_File_2"],
-            ["Sample2", "Protein3", "SEQL", 12000000, "Unmodified", "_SEQL_", None, 0.00012, "Raw_File_2"],
-            ["Sample2", "Protein4", "SEQM", 13000000, "Unmodified", "_SEQM_", None, 0.00013, "Raw_File_2"],
-            ["Sample2", "Protein5", "SEQN", 14000000, "Unmodified", "_SEQN_", None, 0.00014, "Raw_File_2"],
-            ["Sample3", "Protein1", "SEQO", 15000000, "Unmodified", "_SEQO_", None, 0.00015, "Raw_File_3"],
-            ["Sample3", "Protein2", "SEQP", 16000000, "Unmodified", "_SEQP_", None, 0.00016, "Raw_File_3"],
-            ["Sample3", "Protein3", "SEQQ", 17000000, "Unmodified", "_SEQQ_", None, 0.00017, "Raw_File_3"],
-            ["Sample3", "Protein4", "SEQR", 18000000, "Unmodified", "_SEQR_", None, 0.00018, "Raw_File_3"],
-            ["Sample3", "Protein5", "SEQS", 19000000, "Unmodified", "_SEQS_", None, 0.00019, "Raw_File_3"],
-            ["Sample4", "Protein1", "SEQT", 20000000, "Unmodified", "_SEQT_", None, 0.0002, "Raw_File_4"],
-            ["Sample4", "Protein2", "SEQU", 21000000, "Unmodified", "_SEQU_", None, 0.00021, "Raw_File_4"],
-            ["Sample4", "Protein3", "SEQV", 22000000, "Unmodified", "_SEQV_", None, 0.00022, "Raw_File_4"],
-            ["Sample4", "Protein4", "SEQW", 23000000, "Unmodified", "_SEQW_", None, 0.00023, "Raw_File_4"],
+            [
+                "Sample1",
+                "Protein1",
+                "SEQA",
+                1000000,
+                "Unmodified",
+                "_SEQA_",
+                1,
+                0.00001,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein2",
+                "SEQB",
+                2000000,
+                "Unmodified",
+                "_SEQB_",
+                None,
+                0.00002,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein2",
+                "SEQC",
+                3000000,
+                "Acetyl (Protein N-term)",
+                "_(Acetyl (Protein N-term))SEQC_",
+                None,
+                0.00003,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein2",
+                "SEQD",
+                4000000,
+                "Acetyl (Protein N-term),Oxidation (M)",
+                "_(Acetyl (Protein N-term))SE(Oxidation (M))QD_",
+                None,
+                0.00004,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein3",
+                "SEQE",
+                5000000,
+                "Unmodified",
+                "_SEQE_",
+                None,
+                0.00005,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein3",
+                "SEQF",
+                6000000,
+                "Unmodified",
+                "_SEQF_",
+                None,
+                0.00006,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein3",
+                "SEQG",
+                7000000,
+                "Unmodified",
+                "_SEQG_",
+                None,
+                0.00007,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein4",
+                "SEQH",
+                8000000,
+                "Unmodified",
+                "_SEQH_",
+                None,
+                0.00008,
+                "Raw_File_1",
+            ],
+            [
+                "Sample1",
+                "Protein5",
+                "SEQI",
+                9000000,
+                "Unmodified",
+                "_SEQI_",
+                None,
+                0.00009,
+                "Raw_File_1",
+            ],
+            [
+                "Sample2",
+                "Protein1",
+                "SEQJ",
+                10000000,
+                "Acetyl (Protein N-term)",
+                "_(Acetyl (Protein N-term))SEQJ_",
+                None,
+                0.0001,
+                "Raw_File_2",
+            ],
+            [
+                "Sample2",
+                "Protein2",
+                "SEQK",
+                11000000,
+                "Unmodified",
+                "_SEQK_",
+                None,
+                0.00011,
+                "Raw_File_2",
+            ],
+            [
+                "Sample2",
+                "Protein3",
+                "SEQL",
+                12000000,
+                "Unmodified",
+                "_SEQL_",
+                None,
+                0.00012,
+                "Raw_File_2",
+            ],
+            [
+                "Sample2",
+                "Protein4",
+                "SEQM",
+                13000000,
+                "Unmodified",
+                "_SEQM_",
+                None,
+                0.00013,
+                "Raw_File_2",
+            ],
+            [
+                "Sample2",
+                "Protein5",
+                "SEQN",
+                14000000,
+                "Unmodified",
+                "_SEQN_",
+                None,
+                0.00014,
+                "Raw_File_2",
+            ],
+            [
+                "Sample3",
+                "Protein1",
+                "SEQO",
+                15000000,
+                "Unmodified",
+                "_SEQO_",
+                None,
+                0.00015,
+                "Raw_File_3",
+            ],
+            [
+                "Sample3",
+                "Protein2",
+                "SEQP",
+                16000000,
+                "Unmodified",
+                "_SEQP_",
+                None,
+                0.00016,
+                "Raw_File_3",
+            ],
+            [
+                "Sample3",
+                "Protein3",
+                "SEQQ",
+                17000000,
+                "Unmodified",
+                "_SEQQ_",
+                None,
+                0.00017,
+                "Raw_File_3",
+            ],
+            [
+                "Sample3",
+                "Protein4",
+                "SEQR",
+                18000000,
+                "Unmodified",
+                "_SEQR_",
+                None,
+                0.00018,
+                "Raw_File_3",
+            ],
+            [
+                "Sample3",
+                "Protein5",
+                "SEQS",
+                19000000,
+                "Unmodified",
+                "_SEQS_",
+                None,
+                0.00019,
+                "Raw_File_3",
+            ],
+            [
+                "Sample4",
+                "Protein1",
+                "SEQT",
+                20000000,
+                "Unmodified",
+                "_SEQT_",
+                None,
+                0.0002,
+                "Raw_File_4",
+            ],
+            [
+                "Sample4",
+                "Protein2",
+                "SEQU",
+                21000000,
+                "Unmodified",
+                "_SEQU_",
+                None,
+                0.00021,
+                "Raw_File_4",
+            ],
+            [
+                "Sample4",
+                "Protein3",
+                "SEQV",
+                22000000,
+                "Unmodified",
+                "_SEQV_",
+                None,
+                0.00022,
+                "Raw_File_4",
+            ],
+            [
+                "Sample4",
+                "Protein4",
+                "SEQW",
+                23000000,
+                "Unmodified",
+                "_SEQW_",
+                None,
+                0.00023,
+                "Raw_File_4",
+            ],
         ),
         columns=[
             "Sample",
@@ -241,7 +471,7 @@ def evidence_peptide_df():
             "Missed cleavages",
             "PEP",
             "Raw file",
-        ]
+        ],
     )
     return df
 
diff --git a/tests/protzilla/data_analysis/test_analysis_plots.py b/tests/protzilla/data_analysis/test_analysis_plots.py
index 3b665b0b..4d6f8156 100644
--- a/tests/protzilla/data_analysis/test_analysis_plots.py
+++ b/tests/protzilla/data_analysis/test_analysis_plots.py
@@ -83,7 +83,9 @@ def test_plots_volcano_plot_no_annotation(ttest_input, ttest_output, show_figure
         fig.show()
 
 
-def test_plots_volcano_plot_multiple_annotations(ttest_input, ttest_output, show_figures):
+def test_plots_volcano_plot_multiple_annotations(
+    ttest_input, ttest_output, show_figures
+):
     fig = create_volcano_plot(
         p_values=ttest_output["corrected_p_values_df"],
         log2_fc=ttest_output["log2_fold_change_df"],
diff --git a/tests/protzilla/data_analysis/test_differential_expression.py b/tests/protzilla/data_analysis/test_differential_expression.py
index 43ad6021..d9f75d28 100644
--- a/tests/protzilla/data_analysis/test_differential_expression.py
+++ b/tests/protzilla/data_analysis/test_differential_expression.py
@@ -386,4 +386,4 @@ def test_differential_expression_anova(show_figures):
         1.0000,
     ]
 
-    assert assertion_p_values == p_values_rounded
\ No newline at end of file
+    assert assertion_p_values == p_values_rounded
diff --git a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py
index b191f335..85981476 100644
--- a/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py
+++ b/tests/protzilla/data_analysis/test_filter_peptites_of_protein.py
@@ -1,13 +1,18 @@
-import pytest
-
 from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein
 
 
 def test_filter_peptides_of_protein(peptides_df):
-    filtered_peptides_df = filter_peptides_of_protein(peptides_df, ["Protein2"])["peptide_df"]
+    filtered_peptides_df = filter_peptides_of_protein(peptides_df, ["Protein2"])[
+        "peptide_df"
+    ]
 
     assert len(filtered_peptides_df) == 6
     assert filtered_peptides_df["Sequence"].tolist() == [
-        "SEQB", "SEQC", "SEQD", "SEQK", "SEQP", "SEQU"
+        "SEQB",
+        "SEQC",
+        "SEQD",
+        "SEQK",
+        "SEQP",
+        "SEQU",
     ]
-    assert (filtered_peptides_df["Protein ID"] == "Protein2").all()
\ No newline at end of file
+    assert (filtered_peptides_df["Protein ID"] == "Protein2").all()
diff --git a/tests/protzilla/data_analysis/test_peptide_analysis.py b/tests/protzilla/data_analysis/test_peptide_analysis.py
index b2848a65..40f225cb 100644
--- a/tests/protzilla/data_analysis/test_peptide_analysis.py
+++ b/tests/protzilla/data_analysis/test_peptide_analysis.py
@@ -1,18 +1,28 @@
 import pytest
 
-from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
-    ptms_per_protein_and_sample
+from protzilla.data_analysis.ptm_analysis import (
+    filter_peptides_of_protein,
+    ptms_per_protein_and_sample,
+    ptms_per_sample,
+)
 
 
 @pytest.mark.parametrize("df_num", [0, 1])
 def test_filter_peptides_of_protein(peptides_df, evidence_peptide_df, df_num):
     peptide_df = [peptides_df, evidence_peptide_df][df_num]
 
-    filtered_peptides_df = filter_peptides_of_protein(peptide_df, ["Protein2"])["peptide_df"]
+    filtered_peptides_df = filter_peptides_of_protein(peptide_df, ["Protein2"])[
+        "peptide_df"
+    ]
 
     assert len(filtered_peptides_df) == 6
     assert filtered_peptides_df["Sequence"].tolist() == [
-        'SEQB', 'SEQC', 'SEQD', 'SEQK', 'SEQP', 'SEQU'
+        "SEQB",
+        "SEQC",
+        "SEQD",
+        "SEQK",
+        "SEQP",
+        "SEQU",
     ]
     assert (filtered_peptides_df["Protein ID"] == "Protein2").all()
 
@@ -20,7 +30,12 @@ def test_filter_peptides_of_protein(peptides_df, evidence_peptide_df, df_num):
 def test_ptms_per_sampel(evidence_peptide_df):
     ptm_df = ptms_per_sample(evidence_peptide_df)["ptm_df"]
 
-    assert ptm_df.columns.tolist() == ["Sample", "Acetyl (Protein N-term)", "Oxidation (M)", "Unmodified"]
+    assert ptm_df.columns.tolist() == [
+        "Sample",
+        "Acetyl (Protein N-term)",
+        "Oxidation (M)",
+        "Unmodified",
+    ]
     assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"]
     assert ptm_df["Unmodified"].tolist() == [7, 4, 5, 4]
     assert ptm_df["Acetyl (Protein N-term)"].tolist() == [2, 1, 0, 0]
@@ -30,15 +45,42 @@ def test_ptms_per_sampel(evidence_peptide_df):
 def test_ptms_per_protein_and_sample(evidence_peptide_df):
     ptm_df = ptms_per_protein_and_sample(evidence_peptide_df)["ptm_df"]
 
-    assert ptm_df.columns.tolist() == ["Sample", "Protein1", "Protein2", "Protein3", "Protein4", "Protein5"]
+    assert ptm_df.columns.tolist() == [
+        "Sample",
+        "Protein1",
+        "Protein2",
+        "Protein3",
+        "Protein4",
+        "Protein5",
+    ]
     assert ptm_df["Sample"].tolist() == ["Sample1", "Sample2", "Sample3", "Sample4"]
-    assert (ptm_df["Protein1"].tolist() ==
-            ["(1) Unmodified, ", "(1) Acetyl (Protein N-term), ", "(1) Unmodified, ", "(1) Unmodified, "])
-    assert (ptm_df["Protein2"].tolist() ==
-            ["(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "])
-    assert (ptm_df["Protein3"].tolist() ==
-            ["(3) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "])
-    assert (ptm_df["Protein4"].tolist() ==
-            ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, "])
-    assert (ptm_df["Protein5"].tolist() ==
-            ["(1) Unmodified, ", "(1) Unmodified, ", "(1) Unmodified, ", ""])
\ No newline at end of file
+    assert ptm_df["Protein1"].tolist() == [
+        "(1) Unmodified, ",
+        "(1) Acetyl (Protein N-term), ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+    ]
+    assert ptm_df["Protein2"].tolist() == [
+        "(2) Acetyl (Protein N-term), (1) Oxidation (M), (1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+    ]
+    assert ptm_df["Protein3"].tolist() == [
+        "(3) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+    ]
+    assert ptm_df["Protein4"].tolist() == [
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+    ]
+    assert ptm_df["Protein5"].tolist() == [
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "(1) Unmodified, ",
+        "",
+    ]
diff --git a/tests/protzilla/data_analysis/test_plots_data_analysis.py b/tests/protzilla/data_analysis/test_plots_data_analysis.py
index 60403907..ae2d9957 100644
--- a/tests/protzilla/data_analysis/test_plots_data_analysis.py
+++ b/tests/protzilla/data_analysis/test_plots_data_analysis.py
@@ -101,14 +101,20 @@ def test_scatter_plot_4d_df(wide_4d_df, color_df):
 
     assert "messages" in outputs
     assert "plots" not in outputs
-    assert any("Consider reducing the dimensionality" in message["msg"] for message in outputs["messages"])
+    assert any(
+        "Consider reducing the dimensionality" in message["msg"]
+        for message in outputs["messages"]
+    )
 
 
 def test_scatter_plot_color_df_2d(show_figures, wide_2d_df):
     outputs = scatter_plot(wide_2d_df, wide_2d_df)
     assert "messages" in outputs
     assert "plots" not in outputs
-    assert any("The color dataframe should have 1 dimension only" in message["msg"] for message in outputs["messages"])
+    assert any(
+        "The color dataframe should have 1 dimension only" in message["msg"]
+        for message in outputs["messages"]
+    )
 
 
 def test_clustergram(show_figures, wide_4d_df, color_df):
@@ -151,8 +157,10 @@ def test_clustergram_input_not_right_type(wide_4d_df):
     assert "messages" in outputs2
     assert "plots" not in outputs2
     assert any(
-        'The selected input for "grouping dataframe" is not a dataframe, ' in message["msg"]
-        for message in outputs2["messages"])
+        'The selected input for "grouping dataframe" is not a dataframe, '
+        in message["msg"]
+        for message in outputs2["messages"]
+    )
 
 
 def test_clustergram_dimension_mismatch(wide_4d_df):
@@ -176,7 +184,10 @@ def test_clustergram_dimension_mismatch(wide_4d_df):
     )
     assert "messages" in outputs
     assert "plots" not in outputs
-    assert any("There is a dimension mismatch" in message["msg"] for message in outputs["messages"])
+    assert any(
+        "There is a dimension mismatch" in message["msg"]
+        for message in outputs["messages"]
+    )
 
 
 def test_clustergram_different_samples(wide_4d_df):
@@ -200,6 +211,7 @@ def test_clustergram_different_samples(wide_4d_df):
     assert "messages" in outputs
     assert "plots" not in outputs
     assert any(
-        "The input dataframe and the grouping contain different samples" in message["msg"]
+        "The input dataframe and the grouping contain different samples"
+        in message["msg"]
         for message in outputs["messages"]
-    )
\ No newline at end of file
+    )
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index ceded6e3..5b2f92ed 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -2,8 +2,13 @@
 import pandas as pd
 import pytest
 
-
-from protzilla.data_analysis.power_analysis import sample_size_calculation, power_calculation, check_sample_size_calculation_with_libfunc, check_sample_size_calculation_implemented, check_sample_size_calculation_implemented_without_log
+from protzilla.data_analysis.power_analysis import (
+    check_sample_size_calculation_implemented,
+    check_sample_size_calculation_implemented_without_log,
+    check_sample_size_calculation_with_libfunc,
+    power_calculation,
+    sample_size_calculation,
+)
 
 
 @pytest.fixture
@@ -42,9 +47,7 @@ def power_test_data():
     return test_differentially_expressed_proteins_df
 
 
-def test_variance_protein_group_calculation(
-        power_test_data
-):
+def test_variance_protein_group_calculation(power_test_data):
     intensity_df = power_test_data
 
     protein_id = "Protein1"
@@ -57,37 +60,31 @@ def test_variance_protein_group_calculation(
     print(variance)
     assert variance == 4.0
 
-def test_sample_size_calculation(
-        power_test_data
 
-):
+def test_sample_size_calculation(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     test_fc_threshold = 1
     test_selected_protein_group = "Protein1"
 
-
     required_sample_size = sample_size_calculation(
         differentially_expressed_proteins_df=power_test_data,
         significant_proteins_df=power_test_data,
         fc_threshold=test_fc_threshold,
         power=test_power,
         alpha=test_alpha,
-        group1= "Group1",
-        group2= "Group2",
+        group1="Group1",
+        group2="Group2",
         selected_protein_group=test_selected_protein_group,
         significant_proteins_only=False,
-        intensity_name=None
+        intensity_name=None,
     )
     print(required_sample_size)
-    required_sample_size_int = next(iter(required_sample_size.values()),None)
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
 
-def test_check_sample_size_calculation_with_libfun(
-        power_test_data
-
-):
+def test_check_sample_size_calculation_with_libfun(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     test_fc_threshold = 5
@@ -103,20 +100,20 @@ def test_check_sample_size_calculation_with_libfun(
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
         significant_proteins_only=False,
-        intensity_name=None
+        intensity_name=None,
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
-def test_check_sample_size_calculation_impl(
-        power_test_data
 
-):
+def test_check_sample_size_calculation_impl(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     power_test_data_log2 = power_test_data.copy()
-    power_test_data_log2["Normalised iBAQ"] = np.log2(power_test_data_log2["Normalised iBAQ"])
+    power_test_data_log2["Normalised iBAQ"] = np.log2(
+        power_test_data_log2["Normalised iBAQ"]
+    )
     fc_threshold = 1
     test_selected_protein_group = "Protein1"
 
@@ -130,16 +127,14 @@ def test_check_sample_size_calculation_impl(
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
         significant_proteins_only=False,
-        intensity_name=None
+        intensity_name=None,
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 1
 
-def test_check_sample_size_calculation_implemented_without_log(
-        power_test_data
 
-):
+def test_check_sample_size_calculation_implemented_without_log(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     test_fc_threshold = 5
@@ -155,22 +150,18 @@ def test_check_sample_size_calculation_implemented_without_log(
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
         significant_proteins_only=False,
-        intensity_name=None
+        intensity_name=None,
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
 
-
-def test_power_calculation(
-        power_test_data
-):
+def test_power_calculation(power_test_data):
     test_alpha = 0.05
     test_fc_threshold = 1
     test_selected_protein_group = "Protein1"
 
-
     power = power_calculation(
         differentially_expressed_proteins_df=power_test_data,
         significant_proteins_df=power_test_data,
@@ -180,8 +171,8 @@ def test_power_calculation(
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
         significant_proteins_only=False,
-        intensity_name=None
+        intensity_name=None,
     )
     print(power)
     power_int = next(iter(power.values()), None)
-    assert power_int== 0.09
+    assert power_int == 0.09
diff --git a/tests/protzilla/data_integration/test_plots_data_integration.py b/tests/protzilla/data_integration/test_plots_data_integration.py
index 044c7c5a..39c801f5 100644
--- a/tests/protzilla/data_integration/test_plots_data_integration.py
+++ b/tests/protzilla/data_integration/test_plots_data_integration.py
@@ -68,7 +68,10 @@ def test_enrichment_bar_plot_wrong_value(data_folder_tests):
         gene_sets=["Reactome_2013"],
     )
     assert "messages" in current_out
-    assert any(("FDR is not available" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("FDR is not available" in message["msg"])
+        for message in current_out["messages"]
+    )
 
 
 def test_enrichment_bar_plot_empty_df():
@@ -81,7 +84,9 @@ def test_enrichment_bar_plot_empty_df():
         gene_sets=["Reactome_2013"],
     )
     assert "messages" in current_out
-    assert any(("No data to plot" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("No data to plot" in message["msg"]) for message in current_out["messages"]
+    )
 
 
 def test_enrichment_bar_plot_no_category(data_folder_tests):
@@ -92,7 +97,10 @@ def test_enrichment_bar_plot_no_category(data_folder_tests):
         input_df=enrichment_df, top_terms=10, cutoff=0.05, value="p_value", gene_sets=[]
     )
     assert "messages" in current_out
-    assert any(("Please select at least one category" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("Please select at least one category" in message["msg"])
+        for message in current_out["messages"]
+    )
 
 
 def test_enrichment_bar_plot_wrong_df():
@@ -105,7 +113,10 @@ def test_enrichment_bar_plot_wrong_df():
         gene_sets=["KEGG"],
     )
     assert "messages" in current_out
-    assert any(("Please choose an enrichment result dataframe" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("Please choose an enrichment result dataframe" in message["msg"])
+        for message in current_out["messages"]
+    )
 
 
 def test_enrichment_bar_plot_cutoff(data_folder_tests):
@@ -119,7 +130,10 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests):
     )
 
     assert "messages" in current_out
-    assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("No data to plot when applying cutoff" in message["msg"])
+        for message in current_out["messages"]
+    )
 
     enrichment_df = pd.read_csv(
         data_folder_tests / "Reactome_enrichment_enrichr.csv", sep="\t"
@@ -132,7 +146,10 @@ def test_enrichment_bar_plot_cutoff(data_folder_tests):
         gene_sets=["Reactome_2013"],
     )
     assert "messages" in current_out
-    assert any(("No data to plot when applying cutoff" in message["msg"]) for message in current_out["messages"])
+    assert any(
+        ("No data to plot when applying cutoff" in message["msg"])
+        for message in current_out["messages"]
+    )
 
 
 @pytest.mark.parametrize("x_axis_type", ["Gene Sets", "Combined Score"])
diff --git a/tests/protzilla/data_preprocessing/test_normalisation.py b/tests/protzilla/data_preprocessing/test_normalisation.py
index 8e5c2a45..cd2acdcc 100644
--- a/tests/protzilla/data_preprocessing/test_normalisation.py
+++ b/tests/protzilla/data_preprocessing/test_normalisation.py
@@ -349,7 +349,9 @@ def test_totalsum_normalisation(
     method_inputs = {"protein_df": normalisation_df}
     method_outputs = by_totalsum(**method_inputs)
 
-    fig = by_totalsum_plot(method_inputs, method_outputs, "Boxplot", "Sample", "log10")[0]
+    fig = by_totalsum_plot(method_inputs, method_outputs, "Boxplot", "Sample", "log10")[
+        0
+    ]
     if show_figures:
         fig.show()
 
@@ -376,9 +378,9 @@ def test_ref_protein_normalisation(
     }
     method_outputs = by_reference_protein(**method_input)
 
-    fig = by_reference_protein_plot(method_input, method_outputs, "Boxplot", "Sample", "log10")[
-        0
-    ]
+    fig = by_reference_protein_plot(
+        method_input, method_outputs, "Boxplot", "Sample", "log10"
+    )[0]
     if show_figures:
         fig.show()
 
diff --git a/tests/protzilla/data_preprocessing/test_outlier_detection.py b/tests/protzilla/data_preprocessing/test_outlier_detection.py
index e94f84b2..f21e18e0 100644
--- a/tests/protzilla/data_preprocessing/test_outlier_detection.py
+++ b/tests/protzilla/data_preprocessing/test_outlier_detection.py
@@ -65,8 +65,7 @@ def outlier_detection_df_with_nan():
 
 
 def test_outlier_detection_with_isolation_forest(
-    show_figures, outlier_detection_df,
-        peptides_df
+    show_figures, outlier_detection_df, peptides_df
 ):
     method_inputs = {
         "protein_df": outlier_detection_df,
diff --git a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py
index f3900fdd..1de769f5 100644
--- a/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py
+++ b/tests/protzilla/data_preprocessing/test_peptide_preprocessing.py
@@ -1,5 +1,4 @@
 import pandas as pd
-import pytest
 
 from protzilla.constants.paths import TEST_DATA_PATH
 from protzilla.data_preprocessing.peptide_filter import by_pep_value, by_pep_value_plot
@@ -57,4 +56,3 @@ def test_pep_filter(show_figures, leftover_peptide_df, filtered_peptides_list):
 
     pd.testing.assert_frame_equal(method_outputs["peptide_df"], leftover_peptide_df)
     assert method_outputs["filtered_peptides"] == filtered_peptides_list
-
diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py
index 457fe145..e7909db8 100644
--- a/tests/protzilla/importing/test_ms_data_import.py
+++ b/tests/protzilla/importing/test_ms_data_import.py
@@ -218,7 +218,9 @@ def test_max_quant_import_no_protein_ids_column():
     assert "protein_df" not in outputs
     assert "messages" in outputs
     assert any(message["level"] == logging.ERROR for message in outputs["messages"])
-    assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"])
+    assert any(
+        "Majority protein IDs" in message["msg"] for message in outputs["messages"]
+    )
 
 
 def test_max_quant_import_invalid_data():
@@ -310,9 +312,7 @@ def test_transform_and_clean():
         ["C", "Q11111", np.nan],
     ]
     df = pd.DataFrame(data, columns=columns)
-    outputs = ms_data_import.transform_and_clean(
-        df, "intensity", map_to_uniprot=False
-    )
+    outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False)
     expected_df = pd.DataFrame(expected_output, columns=out_col)
 
     # we do not care about the genes column, it is deprecated (and replaced by nan)
diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py
index b5de3148..0d251eda 100644
--- a/tests/protzilla/test_runner.py
+++ b/tests/protzilla/test_runner.py
@@ -12,8 +12,8 @@
 sys.path.append(f"{PROJECT_PATH}")
 
 from protzilla.runner import Runner, _serialize_graphs
-from runner_cli import args_parser
 from protzilla.steps import Output, Plots
+from runner_cli import args_parser
 
 
 @pytest.fixture
@@ -43,7 +43,8 @@ def mock_current_parameters(*args, **kwargs):
 
         # side effect to mark the step as finished
         runner.run.current_step.output = Output(
-            {key: "mock_output_value" for key in runner.run.current_step.output_keys})
+            {key: "mock_output_value" for key in runner.run.current_step.output_keys}
+        )
         if len(runner.run.current_step.output_keys) == 0:
             runner.run.current_step.plots = Plots(["mock_plot"])
 
@@ -88,34 +89,61 @@ def test_runner_imports(
     runner.compute_workflow()
 
     expected_methods = [
-        'MaxQuantImport',
-        'MetadataImport',
-        'FilterProteinsBySamplesMissing',
-        'FilterSamplesByProteinIntensitiesSum',
-        'ImputationByKNN',
-        'OutlierDetectionByLocalOutlierFactor',
-        'NormalisationByMedian',
-        'TransformationLog',
-        'PlotProtQuant',
-        'DifferentialExpressionTTest',
-        'PlotVolcano',
-        'EnrichmentAnalysisGOAnalysisWithString',
-        'PlotGOEnrichmentBarPlot'
+        "MaxQuantImport",
+        "MetadataImport",
+        "FilterProteinsBySamplesMissing",
+        "FilterSamplesByProteinIntensitiesSum",
+        "ImputationByKNN",
+        "OutlierDetectionByLocalOutlierFactor",
+        "NormalisationByMedian",
+        "TransformationLog",
+        "PlotProtQuant",
+        "DifferentialExpressionTTest",
+        "PlotVolcano",
+        "EnrichmentAnalysisGOAnalysisWithString",
+        "PlotGOEnrichmentBarPlot",
     ]
     expected_method_parameters = [
-        call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_mode': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}),
-        call({'feature_orientation': 'Columns (samples in rows, features in columns)', 'file_path': 'tests/metadata_cut_columns.csv'}),
-        call({'percentage': 0.5}),
-        call({'deviation_threshold': 2.0}),
-        call({'number_of_neighbours': 5}),
-        call({'number_of_neighbors': 20}),
-        call({'percentile': 0.5}),
-        call({'log_base': 'log2'}),
-        call({'similarity_measure': 'euclidean distance'}),
-        call({'alpha': 0.05}),
-        call({'fc_threshold': 1}),
-        call({'differential_expression_threshold': 1, 'direction': 'both', 'gene_sets_restring': [], 'organism': 9606}),
-        call({'colors': [], 'cutoff': 0.05, 'gene_sets': ['Process', 'Component', 'Function', 'KEGG'], 'top_terms': 10, 'value': 'p-value'})
+        call(
+            {
+                "intensity_name": "iBAQ",
+                "map_to_uniprot": False,
+                "aggregation_mode": "Sum",
+                "file_path": "tests/proteinGroups_small_cut.txt",
+            }
+        ),
+        call(
+            {
+                "feature_orientation": "Columns (samples in rows, features in columns)",
+                "file_path": "tests/metadata_cut_columns.csv",
+            }
+        ),
+        call({"percentage": 0.5}),
+        call({"deviation_threshold": 2.0}),
+        call({"number_of_neighbours": 5}),
+        call({"number_of_neighbors": 20}),
+        call({"percentile": 0.5}),
+        call({"log_base": "log2"}),
+        call({"similarity_measure": "euclidean distance"}),
+        call({"alpha": 0.05}),
+        call({"fc_threshold": 1}),
+        call(
+            {
+                "differential_expression_threshold": 1,
+                "direction": "both",
+                "gene_sets_restring": [],
+                "organism": 9606,
+            }
+        ),
+        call(
+            {
+                "colors": [],
+                "cutoff": 0.05,
+                "gene_sets": ["Process", "Component", "Function", "KEGG"],
+                "top_terms": 10,
+                "value": "p-value",
+            }
+        ),
     ]
 
     assert mock_method.call_count == 13
@@ -168,10 +196,21 @@ def test_runner_calculates(monkeypatch, tests_folder_name, ms_data_path, metadat
         "FilterProteinsBySamplesMissing",
     ]
     assert mock_method.call_args_list == [
-        call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'aggregation_method': 'Sum', 'file_path': 'tests/proteinGroups_small_cut.txt'}),
-        call({'feature_orientation': 'Columns (samples in rows, features in columns)',
-              'file_path': 'tests/metadata_cut_columns.csv'}),
-        call({'percentage': 0.5})
+        call(
+            {
+                "intensity_name": "iBAQ",
+                "map_to_uniprot": False,
+                "aggregation_method": "Sum",
+                "file_path": "tests/proteinGroups_small_cut.txt",
+            }
+        ),
+        call(
+            {
+                "feature_orientation": "Columns (samples in rows, features in columns)",
+                "file_path": "tests/metadata_cut_columns.csv",
+            }
+        ),
+        call({"percentage": 0.5}),
     ]
     mock_plot.assert_not_called()
 
@@ -251,7 +290,9 @@ def test_serialize_workflow_graphs():
             assert _serialize_graphs(step["graphs"]) == serial_filter_graphs
 
 
-def test_integration_runner(metadata_path, ms_data_path, tests_folder_name, monkeypatch):
+def test_integration_runner(
+    metadata_path, ms_data_path, tests_folder_name, monkeypatch
+):
     name = tests_folder_name + "/test_runner_integration_" + random_string()
     runner = Runner(
         **{
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 74c5a149..7811c6b9 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1,24 +1,26 @@
-import logging
 from enum import Enum, StrEnum
 
-from protzilla.methods.data_preprocessing import DataPreprocessingStep
 from protzilla.methods.data_analysis import (
+    DataAnalysisStep,
     DifferentialExpressionLinearModel,
     DifferentialExpressionTTest,
-    DimensionReductionUMAP, DataAnalysisStep, SelectPeptidesForProtein, PTMsPerSample,
+    DimensionReductionUMAP,
+    PTMsPerSample,
+    SelectPeptidesForProtein,
 )
+from protzilla.methods.data_preprocessing import DataPreprocessingStep
 from protzilla.run import Run
 from protzilla.steps import Step
 
 from . import fill_helper
 from .base import MethodForm
 from .custom_fields import (
+    CustomBooleanField,
     CustomCharField,
     CustomChoiceField,
     CustomFloatField,
     CustomMultipleChoiceField,
     CustomNumberField,
-    CustomBooleanField,
 )
 
 
@@ -143,6 +145,7 @@ class DimensionReductionMetric(Enum):
     cosine = "cosine"
     havensine = "havensine"
 
+
 class DifferentialExpressionANOVAForm(MethodForm):
     is_dynamic = True
 
@@ -295,9 +298,13 @@ class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm):
     group2 = CustomChoiceField(choices=[], label="Group 2")
 
     def fill_form(self, run: Run) -> None:
-        self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps(run)
+        self.fields[
+            "intensity_df"
+        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
 
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -325,9 +332,7 @@ def fill_form(self, run: Run) -> None:
 class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm):
     is_dynamic = True
 
-    ptm_df = CustomChoiceField(
-        choices=[], label="Step to use ptm data from"
-    )
+    ptm_df = CustomChoiceField(choices=[], label="Step to use ptm data from")
     multiple_testing_correction_method = CustomChoiceField(
         choices=MultipleTestingCorrectionMethod,
         label="Multiple testing correction",
@@ -345,7 +350,9 @@ def fill_form(self, run: Run) -> None:
             run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df")
         )
 
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -719,7 +726,7 @@ class ClassificationRandomForestForm(MethodForm):
     # TODO: Workflow_meta line 1763
     train_val_split = CustomNumberField(
         label="Choose the size of the validation data set (you can either enter the absolute number of validation "
-              "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
+        "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
         initial=0.20,
     )
     # TODO: Workflow_meta line 1770
@@ -807,7 +814,7 @@ class ClassificationSVMForm(MethodForm):
     )
     train_val_split = CustomNumberField(
         label="Choose the size of the validation data set (you can either enter the absolute number of validation "
-              "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
+        "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
         initial=0.20,
     )
     # TODO: Workflow_meta line 1973
@@ -924,7 +931,7 @@ class DimensionReductionUMAPForm(MethodForm):
     )
     n_neighbors = CustomNumberField(
         label="The size of local neighborhood (in terms of number of neighboring sample points) used for manifold "
-              "approximation",
+        "approximation",
         min_value=2,
         max_value=100,
         step_size=1,
@@ -965,7 +972,7 @@ class ProteinGraphPeptidesToIsoformForm(MethodForm):
     k = CustomNumberField(label="k-mer length", min_value=1, step_size=1, initial=5)
     allowed_mismatches = CustomNumberField(
         label="Number of allowed mismatched amino acids per peptide. For many allowed mismatches, this can take a "
-              "long time.",
+        "long time.",
         min_value=0,
         step_size=1,
         initial=2,
@@ -1016,29 +1023,33 @@ def fill_form(self, run: Run) -> None:
 
         selected_auto_select = self.data.get("auto_select")
 
-        choices = fill_helper.to_choices([] if selected_auto_select else ["all proteins"])
-        choices.extend(fill_helper.get_choices(
-            run, "significant_proteins_df", DataAnalysisStep
-        ))
+        choices = fill_helper.to_choices(
+            [] if selected_auto_select else ["all proteins"]
+        )
+        choices.extend(
+            fill_helper.get_choices(run, "significant_proteins_df", DataAnalysisStep)
+        )
         self.fields["protein_list"].choices = choices
 
-        chosen_list = self.data.get("protein_list", self.fields["protein_list"].choices[0][0])
+        chosen_list = self.data.get(
+            "protein_list", self.fields["protein_list"].choices[0][0]
+        )
         if not selected_auto_select:
             self.toggle_visibility("sort_proteins", True)
             self.toggle_visibility("protein_ids", True)
 
             if chosen_list == "all proteins":
                 self.fields["protein_ids"].choices = fill_helper.to_choices(
-                    run.steps.get_step_output(
-                        Step, "protein_df"
-                    )["Protein ID"].unique()
+                    run.steps.get_step_output(Step, "protein_df")["Protein ID"].unique()
                 )
             else:
                 if self.data.get("sort_proteins"):
                     self.fields["protein_ids"].choices = fill_helper.to_choices(
                         run.steps.get_step_output(
                             DataAnalysisStep, "significant_proteins_df", chosen_list
-                        ).sort_values(by="corrected_p_value")["Protein ID"].unique()
+                        )
+                        .sort_values(by="corrected_p_value")["Protein ID"]
+                        .unique()
                     )
                 else:
                     self.fields["protein_ids"].choices = fill_helper.to_choices(
@@ -1061,12 +1072,12 @@ def fill_form(self, run: Run) -> None:
         single_protein_peptides = run.steps.get_instance_identifiers(
             SelectPeptidesForProtein, "peptide_df"
         )
-        self.fields["peptide_df"].choices = fill_helper.to_choices(single_protein_peptides)
-
-        self.fields["peptide_df"].choices = fill_helper.get_choices(
-            run, "peptide_df"
+        self.fields["peptide_df"].choices = fill_helper.to_choices(
+            single_protein_peptides
         )
 
+        self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")
+
         single_protein_peptides = run.steps.get_instance_identifiers(
             SelectPeptidesForProtein, "peptide_df"
         )
@@ -1081,15 +1092,15 @@ class PTMsPerProteinAndSampleForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields["peptide_df"].choices = fill_helper.get_choices(
-            run, "peptide_df"
-        )
+        self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")
 
         single_protein_peptides = run.steps.get_instance_identifiers(
             SelectPeptidesForProtein, "peptide_df"
         )
         if single_protein_peptides:
             self.fields["peptide_df"].initial = single_protein_peptides[0]
+
+
 class PowerAnalysisPowerCalculationForm(MethodForm):
     is_dynamic = True
 
@@ -1099,10 +1110,10 @@ class PowerAnalysisPowerCalculationForm(MethodForm):
     )
     alpha = CustomFloatField(
         label="Error rate (alpha)",
-        min_value = 0,
-        max_value = 1,
-        step_size = 0.05,
-        initial = 0.05,
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.05,
     )
     fc_threshold = CustomFloatField(
         label="Log2 fold change threshold", min_value=0, initial=1
@@ -1136,7 +1147,8 @@ def fill_form(self, run: Run) -> None:
         )
 
         significant_proteins_only = self.data.get(
-            "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0]
+            "significant_proteins_only",
+            self.fields["significant_proteins_only"].choices[0][0],
         )
 
         if significant_proteins_only == YesNo.yes:
@@ -1166,17 +1178,17 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
     )
     alpha = CustomFloatField(
         label="Error rate (alpha)",
-        min_value = 0,
-        max_value = 1,
-        step_size = 0.05,
-        initial = 0.05,
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.05,
     )
     power = CustomFloatField(
         label="Power",
-        min_value = 0,
-        max_value = 1,
-        step_size = 0.05,
-        initial = 0.8,
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.8,
     )
     fc_threshold = CustomFloatField(
         label="Log2 fold change threshold", min_value=0, initial=1
@@ -1191,7 +1203,6 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         label="Protein group to calculate sample size for",
     )
 
-
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(
@@ -1211,7 +1222,8 @@ def fill_form(self, run: Run) -> None:
         )
 
         significant_proteins_only = self.data.get(
-            "significant_proteins_only", self.fields["significant_proteins_only"].choices[0][0]
+            "significant_proteins_only",
+            self.fields["significant_proteins_only"].choices[0][0],
         )
 
         if significant_proteins_only == YesNo.yes:
diff --git a/ui/runs/views.py b/ui/runs/views.py
index 4de29692..7d2254ed 100644
--- a/ui/runs/views.py
+++ b/ui/runs/views.py
@@ -18,8 +18,8 @@
 from django.shortcuts import render
 from django.urls import reverse
 
-from protzilla.run_helper import log_messages
 from protzilla.run import Run, get_available_run_names
+from protzilla.run_helper import log_messages
 from protzilla.stepfactory import StepFactory
 from protzilla.steps import Step
 from protzilla.utilities.utilities import (
@@ -125,7 +125,9 @@ def detail(request: HttpRequest, run_name: str):
         run.steps.current_step.display_output is not None
         and not run.current_step.display_output.is_empty()
     )
-    display_output_text = next(iter(run.current_step.display_output.display_output.values()), None)
+    display_output_text = next(
+        iter(run.current_step.display_output.display_output.values()), None
+    )
 
     return render(
         request,
diff --git a/user_data/workflows/overhaul.yaml:Zone.Identifier b/user_data/workflows/overhaul.yaml:Zone.Identifier
new file mode 100644
index 00000000..71c6e851
--- /dev/null
+++ b/user_data/workflows/overhaul.yaml:Zone.Identifier
@@ -0,0 +1,3 @@
+[ZoneTransfer]
+ZoneId=3
+HostUrl=https://files.slack.com/files-pri/T055BG3H51R-F06U5LX84NS/download/overhaul.yaml?origin_team=E055BG3H51R

From cb25777ce725e0ad3bbc7e8990482b0ff431d5a3 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 28 Aug 2024 19:56:38 +0200
Subject: [PATCH 18/36] feature: user can choose whether metadata contains a
 column for individuals. If so, the mean values per individual are used to
 calculate the power and sample size.

---
 protzilla/data_analysis/power_analysis.py | 74 +++++++++++++++++++----
 protzilla/methods/data_analysis.py        | 14 ++---
 ui/runs/forms/data_analysis.py            | 20 +++++-
 ui/runs/forms/fill_helper.py              |  4 ++
 4 files changed, 91 insertions(+), 21 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 4a303a18..7989ce7b 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -45,12 +45,14 @@ def variance_protein_group_calculation_max(
 def sample_size_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
     fc_threshold: float,
     alpha: float,
     power: float,
     group1: str,
     group2: str,
     selected_protein_group: str,
+    individual_column: str,
     intensity_name: str = None,
 ) -> dict:
     """
@@ -58,7 +60,6 @@ def sample_size_calculation(
 
     :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
-    :param significant_proteins_only: A boolean to display only significant proteins for selection to the user.
     :param fc_threshold: The fold change threshold.
     :param alpha: The significance level. The value for alpha is taken from the t-test by default.
     :param power: The power of the test.
@@ -79,8 +80,32 @@ def sample_size_calculation(
     z_alpha = stats.norm.ppf(1 - alpha / 2)
     z_beta = stats.norm.ppf(power)
 
+    intensity_name = default_intensity_column(
+        differentially_expressed_proteins_df, intensity_name
+    )
+    filtered_protein_group_df = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == protein_group
+    ]
+
+    if individual_column != "None" and individual_column in metadata_df.columns:
+        # filtered_protein_group_df["Individual"] = filtered_protein_group_df["Sample"].apply(lambda x: x[:4])
+        filtered_protein_group_merged_df = pd.merge(
+            filtered_protein_group_df,
+            metadata_df[["Sample", individual_column]],
+            on="Sample",
+        )
+        # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample")
+
+        filtered_protein_group_df = (
+            filtered_protein_group_merged_df.groupby(
+                ["Protein ID", "Group", individual_column]
+            )[intensity_name]
+            .mean()
+            .reset_index()
+        )
+
     variance_protein_group = variance_protein_group_calculation_max(
-        intensity_df=differentially_expressed_proteins_df,
+        intensity_df=filtered_protein_group_df,
         protein_id=protein_group,
         group1=group1,
         group2=group2,
@@ -89,7 +114,7 @@ def sample_size_calculation(
 
     required_sample_size = (
         2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group
-    )
+    )  # Equation (1) in Cairns, David A., et al., 2008, Sample size determination in clinical proteomic profiling experiments using mass spectrometry for class comparison
     required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
@@ -99,11 +124,13 @@ def sample_size_calculation(
 def power_calculation(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
     alpha: float,
     fc_threshold: float,
     group1: str,
     group2: str,
     selected_protein_group: str,
+    individual_column: str,
     intensity_name: str = None,
 ) -> dict:
     """
@@ -128,8 +155,33 @@ def power_calculation(
     protein_group = selected_protein_group
     z_alpha = stats.norm.ppf(1 - alpha / 2)
 
+    intensity_name = default_intensity_column(
+        differentially_expressed_proteins_df, intensity_name
+    )
+    filtered_protein_group_df = differentially_expressed_proteins_df[
+        differentially_expressed_proteins_df["Protein ID"] == protein_group
+    ]
+    if individual_column != "None" and individual_column in metadata_df.columns:
+        filtered_protein_group_merged_df = pd.merge(
+            filtered_protein_group_df,
+            metadata_df[["Sample", individual_column]],
+            on="Sample",
+        )
+        # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample")
+
+        filtered_protein_group_df = (
+            filtered_protein_group_merged_df.groupby(
+                ["Protein ID", "Group", individual_column]
+            )[intensity_name]
+            .mean()
+            .reset_index()
+        )
+        filtered_protein_group_df = filtered_protein_group_df.rename(
+            columns={individual_column: "Sample"}
+        )
+
     variance_protein_group = variance_protein_group_calculation_max(
-        intensity_df=differentially_expressed_proteins_df,
+        intensity_df=filtered_protein_group_df,
         protein_id=protein_group,
         group1=group1,
         group2=group2,
@@ -146,12 +198,12 @@ def power_calculation(
     filtered_df["Measurement"] = filtered_df["Sample"].apply(
         lambda x: int(x[-2:]))
     """
-    filtered_protein_df = differentially_expressed_proteins_df[
-        differentially_expressed_proteins_df["Protein ID"] == protein_group
-    ]
-    grouped_df = filtered_protein_df.groupby(["Group", "Protein ID"])["Sample"].count()
-    sample_size_group1 = grouped_df[group1][0]
-    sample_size_group2 = grouped_df[group2][0]
+
+    group_count_df = filtered_protein_group_df.groupby(["Group", "Protein ID"])[
+        "Sample"
+    ].count()
+    sample_size_group1 = group_count_df[group1][0]
+    sample_size_group2 = group_count_df[group2][0]
     sample_size = (2 * sample_size_group1 * sample_size_group2) / (
         sample_size_group1 + sample_size_group2
     )  # Equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences
@@ -160,4 +212,4 @@ def power_calculation(
     )
     power = float(round(stats.norm.cdf(z_beta), 2))
 
-    return dict(power=power)
+    return dict(power=power)
\ No newline at end of file
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index c1150335..62db5c10 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -802,17 +802,16 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
         "differentially_expressed_proteins_df",
         "selected_protein_group",
         "significant_proteins_df",
-        "significant_proteins_only",
         "fc_threshold",
         "alpha",
         "group1",
         "group2",
+        "individual_column",
+        "metadata_df",
     ]
     output_keys = ["power"]
 
     def method(self, inputs: dict) -> dict:
-        if "significant_proteins_only" in inputs:
-            del inputs["significant_proteins_only"]
         return power_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
@@ -825,7 +824,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["significant_proteins_df"] = steps.get_step_output(
             Step, "significant_proteins_df", inputs["input_dict"]
         )
-
+        inputs["metadata_df"] = steps.metadata_df
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
@@ -845,20 +844,19 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
         "differentially_expressed_proteins_df",
         "selected_protein_group",
         "significant_proteins_df",
-        "significant_proteins_only",
         "fc_threshold",
         "alpha",
         "group1",
         "group2",
         "power",
+        "individual_column",
+        "metadata_df",
     ]
     output_keys = [
         "required_sample_size",
     ]
 
     def method(self, inputs: dict) -> dict:
-        if "significant_proteins_only" in inputs:
-            del inputs["significant_proteins_only"]
         return sample_size_calculation(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
@@ -871,7 +869,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["significant_proteins_df"] = steps.get_step_output(
             Step, "significant_proteins_df", inputs["input_dict"]
         )
-
+        inputs["metadata_df"] = steps.metadata_df
         inputs["alpha"] = step.inputs["alpha"]
         inputs["group1"] = step.inputs["group1"]
         inputs["group2"] = step.inputs["group2"]
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 7811c6b9..d7508230 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1127,6 +1127,10 @@ class PowerAnalysisPowerCalculationForm(MethodForm):
         choices=[],
         label="Protein group to calculate power for",
     )
+    individual_column = CustomChoiceField(
+        choices=[],
+        label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)",
+    )
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
@@ -1139,7 +1143,11 @@ def fill_form(self, run: Run) -> None:
         input_dict_instance_id = self.data.get(
             "input_dict", self.fields["input_dict"].choices[0][0]
         )
-
+        self.fields["individual_column"].choices = [
+            ("None", "None")
+        ] + fill_helper.get_choices_for_metadata_all_columns(run)
+        individual_column = self.data.get("individual_column", "None")
+        self.fields["individual_column"].initial = individual_column
         self.fields["selected_protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 Step, "differentially_expressed_proteins_df", input_dict_instance_id
@@ -1202,6 +1210,10 @@ class PowerAnalysisSampleSizeCalculationForm(MethodForm):
         choices=[],
         label="Protein group to calculate sample size for",
     )
+    individual_column = CustomChoiceField(
+        choices=[],
+        label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)",
+    )
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
@@ -1214,7 +1226,11 @@ def fill_form(self, run: Run) -> None:
         input_dict_instance_id = self.data.get(
             "input_dict", self.fields["input_dict"].choices[0][0]
         )
-
+        self.fields["individual_column"].choices = [
+            ("None", "None")
+        ] + fill_helper.get_choices_for_metadata_all_columns(run)
+        individual_column = self.data.get("individual_column", "None")
+        self.fields["individual_column"].initial = individual_column
         self.fields["selected_protein_group"].choices = fill_helper.to_choices(
             run.steps.get_step_output(
                 Step, "differentially_expressed_proteins_df", input_dict_instance_id
diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py
index 0b416f8f..641c7763 100644
--- a/ui/runs/forms/fill_helper.py
+++ b/ui/runs/forms/fill_helper.py
@@ -32,3 +32,7 @@ def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str
             run.steps.metadata_df.columns != "Sample"
         ].unique()
     )
+
+
+def get_choices_for_metadata_all_columns(run: Run) -> list[tuple[str, str]]:
+    return to_choices(run.steps.metadata_df.columns)  # all metadata columns, none filtered out

From 52ef105633da888ad4999368028770cace9a6080 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Wed, 28 Aug 2024 21:13:08 +0200
Subject: [PATCH 19/36] adapted test for power_calculation and
 sample_size_calculation and checked values from paper of Cairns

---
 .../data_analysis/test_power_analysis.py      | 92 ++++++++++++++-----
 1 file changed, 67 insertions(+), 25 deletions(-)

diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index 5b2f92ed..eb563d68 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -1,14 +1,20 @@
 import numpy as np
 import pandas as pd
 import pytest
+import math
+from scipy import stats
 
 from protzilla.data_analysis.power_analysis import (
-    check_sample_size_calculation_implemented,
-    check_sample_size_calculation_implemented_without_log,
-    check_sample_size_calculation_with_libfunc,
     power_calculation,
     sample_size_calculation,
+    variance_protein_group_calculation_max,
+)
+from protzilla.data_analysis.power_analysis_validation import (
+    check_sample_size_calculation_with_libfunc,
+    check_sample_size_calculation_implemented,
+    check_sample_size_calculation_implemented_without_log,
 )
+from test_differential_expression import diff_expr_test_data
 
 
 @pytest.fixture
@@ -54,35 +60,61 @@ def test_variance_protein_group_calculation(power_test_data):
     group1 = "Group1"
     group2 = "Group2"
 
-    variance = variance_protein_group_calculation(
+    variance = variance_protein_group_calculation_max(
         intensity_df, protein_id, group1, group2
     )
     print(variance)
     assert variance == 4.0
 
 
-def test_sample_size_calculation(power_test_data):
+def test_sample_size_calculation(power_test_data, diff_expr_test_data):
     test_alpha = 0.05
     test_power = 0.8
     test_fc_threshold = 1
     test_selected_protein_group = "Protein1"
+    test_individual_column = "None"
+    test_differentially_expressed_proteins_df, test_metadata_df = diff_expr_test_data
 
     required_sample_size = sample_size_calculation(
         differentially_expressed_proteins_df=power_test_data,
         significant_proteins_df=power_test_data,
+        metadata_df=test_metadata_df,
         fc_threshold=test_fc_threshold,
         power=test_power,
         alpha=test_alpha,
         group1="Group1",
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
-        significant_proteins_only=False,
+        individual_column=test_individual_column,
         intensity_name=None,
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
+def test_power_calculation(power_test_data, diff_expr_test_data):
+    """power_calculation returns a power of 0.09 for Protein1 when no individual column is used."""
+    test_alpha = 0.05
+    test_fc_threshold = 1
+    test_selected_protein_group = "Protein1"
+    test_individual_column = "None"
+    _, test_metadata_df = diff_expr_test_data  # only the metadata part of the fixture is used
+
+    power = power_calculation(
+        differentially_expressed_proteins_df=power_test_data,
+        significant_proteins_df=power_test_data,
+        metadata_df=test_metadata_df,
+        fc_threshold=test_fc_threshold,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        individual_column=test_individual_column,
+        intensity_name=None,
+    )
+    power_value = next(iter(power.values()), None)
+    assert power_value == 0.09
+
 
 def test_check_sample_size_calculation_with_libfun(power_test_data):
     test_alpha = 0.05
@@ -156,23 +188,33 @@ def test_check_sample_size_calculation_implemented_without_log(power_test_data):
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
+def test_replicate_paper_sample_size_calculation(power_test_data):
+    alpha = 0.001
+    power = 0.95
+    fc_threshold = math.log2(2)
+    biological_variance = 0.233
+    technical_variance = 2.298
+    number_of_replicates = 2
+
+    z_alpha = round(stats.norm.ppf(1 - alpha / 2), 3)
+    z_beta = round(stats.norm.ppf(power), 3)
+
+    required_sample_size = (
+        2
+        * ((z_alpha + z_beta) / fc_threshold) ** 2
+        * ((technical_variance / number_of_replicates) + biological_variance)
+    )  # Equation (1) in Cairns, David A., et al., 2008, Sample size determination in clinical proteomic profiling experiments using mass spectrometry for class comparison
+    required_sample_size = math.ceil(required_sample_size)
+    print(required_sample_size)
 
-def test_power_calculation(power_test_data):
-    test_alpha = 0.05
-    test_fc_threshold = 1
-    test_selected_protein_group = "Protein1"
-
-    power = power_calculation(
-        differentially_expressed_proteins_df=power_test_data,
-        significant_proteins_df=power_test_data,
-        fc_threshold=test_fc_threshold,
-        alpha=test_alpha,
-        group1="Group1",
-        group2="Group2",
-        selected_protein_group=test_selected_protein_group,
-        significant_proteins_only=False,
-        intensity_name=None,
-    )
-    print(power)
-    power_int = next(iter(power.values()), None)
-    assert power_int == 0.09
+    data = {
+        "Cairns": [44, 31, 62, 44, 14, 10, 19, 14, 5, 4, 7, 5],
+        "Calculated": [65, 52, 92, 74, 20, 16, 28, 23, 7, 6, 10, 8],
+    }
+    df = pd.DataFrame(data)
+    correlation = df["Cairns"].corr(df["Calculated"])
+    print(correlation)
+    correlationmatrix = df.corr()
+    print(correlationmatrix)
+
+    return dict(required_sample_size=required_sample_size)

From ac9e783f39887f22a58affd6448b661309b6ea18 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 3 Sep 2024 17:28:20 +0200
Subject: [PATCH 20/36] added function that calculates sample size for all
 proteins and shows the distribution in a violin plot

---
 protzilla/data_analysis/power_analysis.py | 112 +++++++++++++++++++++-
 protzilla/methods/data_analysis.py        |  52 +++++++++-
 ui/runs/form_mapping.py                   |   1 +
 ui/runs/forms/data_analysis.py            |  99 +++++++++++++++++++
 user_data/workflows/standard.yaml         |   3 +
 5 files changed, 264 insertions(+), 3 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 7989ce7b..76fb032f 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pandas as pd
 from scipy import stats
+import plotly.express as px
+import plotly.graph_objs as go
 
 from protzilla.utilities import default_intensity_column
 
@@ -56,16 +58,21 @@ def sample_size_calculation(
     intensity_name: str = None,
 ) -> dict:
     """
-    Function to calculate the required sample size for a selected protein to achieve the required power .
+    Function to calculate the required sample size for a selected protein to achieve the desired statistical power.
+    If metadata_df contains a column that identifies individuals, the function first calculates the mean intensity for
+    each individual (based on replicates) within the dataset. These individual means are used to determine the variance
+    for the sample size calculation formula.
 
     :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param metadata_df: The dataframe containing the clinical data.
     :param fc_threshold: The fold change threshold.
     :param alpha: The significance level. The value for alpha is taken from the t-test by default.
     :param power: The power of the test.
     :param group1: The name of the first group.
     :param group2: The name of the second group.
     :param selected_protein_group: The selected protein group for which the required sample size is to be calculated.
+    :param individual_column: The name of the column in metadata_df containing the individual ID, or "None" if the intensities should not be averaged per individual.
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The required sample size.
     """
@@ -135,14 +142,21 @@ def power_calculation(
 ) -> dict:
     """
     Function to calculate the power of the t-test for a selected protein group.
+    If metadata_df contains a column that identifies individuals, the function first calculates the mean intensity for
+    each individual (based on replicates) within the dataset. These individual means are used to determine the variance
+    for the power calculation formula.
+    If the two groups differ in their number of samples, the sample size used in the power formula is calculated
+    according to equation 2.3.1 from Cohen 1988, Statistical Power Analysis for the Behavioral Sciences.
 
     :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param metadata_df: The dataframe containing the clinical data.
     :param alpha: The significance level. The value for alpha is taken from the t-test by default.
     :param fc_threshold: The fold change threshold.
     :param group1: The name of the first group.
     :param group2: The name of the second group.
     :param selected_protein_group: The selected protein group for which the power is to be calculated.
+    :param individual_column: The name of the column in metadata_df containing the individual ID, or "None" if the intensities should not be averaged per individual.
     :param intensity_name: The name of the column containing the protein group intensities.
     :return: The power of the test.
     """
@@ -212,4 +226,98 @@ def power_calculation(
     )
     power = float(round(stats.norm.cdf(z_beta), 2))
 
-    return dict(power=power)
\ No newline at end of file
+    return dict(power=power)
+
+def sample_size_calculation_for_all_proteins(
+        differentially_expressed_proteins_df: pd.DataFrame,
+        significant_proteins_df: pd.DataFrame,
+        significant_proteins_only: str,
+        metadata_df: pd.DataFrame,
+        fc_threshold: float,
+        alpha: float,
+        power: float,
+        group1: str,
+        group2: str,
+        individual_column: str,
+        select_all_proteins: bool,
+        selected_protein_groups: list,
+        intensity_name: str = None,
+) -> dict:
+    """
+    Function to calculate the required sample size for each considered protein group to achieve the required power.
+    Each protein group is passed to sample_size_calculation; the largest result is the sample size for all proteins.
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param significant_proteins_only: "Yes" to take the protein groups from significant_proteins_df, "No" to take them from differentially_expressed_proteins_df. Only evaluated if select_all_proteins is set.
+    :param metadata_df: The dataframe containing the clinical data.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param power: The power of the test.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param individual_column: The name of the column in metadata_df containing the individual ID.
+    :param select_all_proteins: A boolean indicating whether all proteins should be considered.
+    :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered.
+    :param intensity_name: The name of the column containing the protein group intensities.
+    :return: A dict with the maximum required sample size, a list holding the violin plot of all sample sizes, the
+        input dataframe merged with a "Sample Size" column and a dataframe with the sample size per protein group.
+    :raises ValueError: If there is no protein group to calculate a sample size for.
+    """
+    if select_all_proteins and significant_proteins_only == 'No':
+        protein_groups_for_calculation = differentially_expressed_proteins_df["Protein ID"].unique()
+    elif select_all_proteins and significant_proteins_only == 'Yes':
+        protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique()
+    else:
+        protein_groups_for_calculation = selected_protein_groups
+
+    if len(protein_groups_for_calculation) == 0:
+        raise ValueError("No protein groups available for the sample size calculation.")
+
+    required_sample_sizes = [
+        sample_size_calculation(
+            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+            significant_proteins_df=significant_proteins_df,
+            metadata_df=metadata_df,
+            fc_threshold=fc_threshold,
+            alpha=alpha,
+            power=power,
+            group1=group1,
+            group2=group2,
+            selected_protein_group=protein_group,
+            individual_column=individual_column,
+            intensity_name=intensity_name,
+        )["required_sample_size"]
+        for protein_group in protein_groups_for_calculation
+    ]
+    required_sample_size_for_all_proteins = max(required_sample_sizes)
+
+    violin_plot_args = dict(
+        meanline_visible=True,
+        box_visible=True,
+        scalemode='width',
+        spanmode='hard',
+        span=[0, required_sample_size_for_all_proteins]
+    )
+    fig = go.Figure()
+    fig.add_trace(go.Violin(
+        x=['Protein group'] * len(required_sample_sizes),
+        y=required_sample_sizes,
+        line_color='red',
+        **violin_plot_args
+    ))
+    # Name the key column explicitly: built from the bare sequence it would be labelled 0 and the merge would fail.
+    sample_size_dataframe = pd.DataFrame(
+        {"Protein ID": protein_groups_for_calculation, "Sample Size": required_sample_sizes}
+    )
+    differentially_expressed_proteins_df = pd.merge(
+        differentially_expressed_proteins_df,
+        sample_size_dataframe,
+        on="Protein ID",
+    )
+
+    return dict(required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
+                plots=[fig],
+                differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+                sample_size_dataframe=sample_size_dataframe,
+                )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 62db5c10..245b885f 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -24,6 +24,7 @@
 from protzilla.data_analysis.power_analysis import (
     power_calculation,
     sample_size_calculation,
+    sample_size_calculation_for_all_proteins,
 )
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.data_analysis.ptm_analysis import (
@@ -801,7 +802,6 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
         "significant_proteins_df",
         "differentially_expressed_proteins_df",
         "selected_protein_group",
-        "significant_proteins_df",
         "fc_threshold",
         "alpha",
         "group1",
@@ -880,3 +880,53 @@ def handle_outputs(self, outputs: dict):
         self.display_output[
             "required_sample_size"
         ] = f"Required Sample Size: {outputs['required_sample_size']}"
+
+class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
+    """Plot step whose method forwards its inputs to sample_size_calculation_for_all_proteins."""
+    display_name = "Sample Size Calculation for all Proteins"
+    operation = "Power Analysis"
+    method_description = "Calculates sample size for all proteins"
+
+    input_keys = [
+        "differentially_expressed_proteins_df",
+        "significant_proteins_df",
+        "significant_proteins_only",
+        "fc_threshold",
+        "alpha",
+        "group1",
+        "group2",
+        "power",
+        "individual_column",
+        "metadata_df",
+        "select_all_proteins",
+        "selected_protein_groups",
+    ]
+    output_keys = [
+        "required_sample_size_for_all_proteins",
+        "differentially_expressed_proteins_df",
+        "sample_size_dataframe"
+    ]
+
+    def method(self, inputs: dict) -> dict:
+        return sample_size_calculation_for_all_proteins(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        """Add the dataframes and the alpha/group settings of the step chosen as input_dict, plus steps.metadata_df."""
+        source_id = inputs["input_dict"]
+        inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
+            Step, "differentially_expressed_proteins_df", source_id
+        )
+        source_step = next(s for s in steps.all_steps if s.instance_identifier == source_id)
+        inputs["significant_proteins_df"] = steps.get_step_output(
+            Step, "significant_proteins_df", source_id
+        )
+        inputs["metadata_df"] = steps.metadata_df
+        for key in ("alpha", "group1", "group2"):
+            inputs[key] = source_step.inputs[key]
+        return inputs
+
+    def handle_outputs(self, outputs: dict):
+        """Run the parent handling, then store the overall sample size as display text."""
+        super().handle_outputs(outputs)
+        key = "required_sample_size_for_all_proteins"
+        self.display_output[key] = f"Required Sample Size for all Proteins: {outputs[key]}"
\ No newline at end of file
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 0fec7200..90cf6d43 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -63,6 +63,7 @@
     data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm,
     data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm,
     data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm,
+    data_analysis.PowerAnalysisSampleSizeCalculationForAllProteins: data_analysis_forms.PowerAnalysisSampleSizeCalculationForAllProteinsForm,
     data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm,
     data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index d7508230..f37e8e46 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1258,3 +1258,102 @@ def fill_form(self, run: Run) -> None:
         self.fields["alpha"].initial = run.steps.get_step_output(
             Step, "corrected_alpha", input_dict_instance_id
         )
+
+class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm):
+    is_dynamic = True
+
+    input_dict = CustomChoiceField(
+        choices=[],
+        label="Input data dict (generated e.g. by t-Test)",
+    )
+    alpha = CustomFloatField(
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.05,
+    )
+    power = CustomFloatField(
+        label="Power",
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.8,
+    )
+    fc_threshold = CustomFloatField(
+        label="Log2 fold change threshold", min_value=0, initial=1
+    )
+    individual_column = CustomChoiceField(
+        choices=[],
+        label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)",
+    )
+    significant_proteins_only = CustomChoiceField(
+        choices=YesNo,
+        label="Select only significant proteins",
+        initial=YesNo.yes,
+    )
+    select_all_proteins = CustomBooleanField(
+        label="Select all proteins",
+        initial=True,
+    )
+    selected_protein_groups = CustomMultipleChoiceField(
+        choices=[],
+        label="Protein groups to calculate sample size for",
+    )
+
+   #def __init__(self, *args, **kwargs):
+    #    super().__init__(*args, **kwargs)
+     #   select_all_proteins = self.data.get("select_all_proteins", True)
+      #  if select_all_proteins == False:
+       #     self.toggle_visibility("selected_protein_groups", True)
+       # else:
+        #    self.toggle_visibility("selected_protein_groups", False)"""
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_dict"].choices = fill_helper.to_choices(
+            run.steps.get_instance_identifiers(
+                DifferentialExpressionTTest,
+                "differentially_expressed_proteins_df",
+            )
+        )
+        input_dict_instance_id = self.data.get(
+            "input_dict", self.fields["input_dict"].choices[0][0]
+        )
+        self.fields["alpha"].initial = run.steps.get_step_output(
+            Step, "corrected_alpha", input_dict_instance_id
+        )
+        self.fields["individual_column"].choices = [
+            ("None", "None")
+        ] + fill_helper.get_choices_for_metadata_all_columns(run)
+        individual_column = self.data.get("individual_column", "None")
+        self.fields["individual_column"].initial = individual_column
+
+        significant_proteins_only = self.data.get(
+            "significant_proteins_only",
+            self.fields["significant_proteins_only"].choices[0][0],
+        )
+
+        if significant_proteins_only == YesNo.yes:
+            self.fields["selected_protein_groups"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "significant_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        else:
+            self.fields["selected_protein_groups"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "differentially_expressed_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        if not self.data:
+            select_all_proteins = True
+        else:
+            if "select_all_proteins" in self.data:
+                select_all_proteins = True
+            else:
+                select_all_proteins = False
+
+        if select_all_proteins == False:
+            self.toggle_visibility("selected_protein_groups", True)
+        else:
+            self.toggle_visibility("selected_protein_groups", False)
\ No newline at end of file
diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml
index 970a2a2f..abd07c9b 100644
--- a/user_data/workflows/standard.yaml
+++ b/user_data/workflows/standard.yaml
@@ -64,6 +64,9 @@ steps:
   - form_inputs: { }
     inputs: { }
     type: PowerAnalysisPowerCalculation
+  - form_inputs: {}
+    inputs: { }
+    type: PowerAnalysisSampleSizeCalculationForAllProteins
   - form_inputs:
       fc_threshold: 1
     inputs: { }

From e54c767c784763c32ba26a2225bb8fbea56e2d37 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 3 Sep 2024 17:29:21 +0200
Subject: [PATCH 21/36] formatting

---
 protzilla/data_analysis/power_analysis.py | 69 ++++++++++++-----------
 protzilla/methods/data_analysis.py        |  5 +-
 ui/runs/forms/data_analysis.py            | 15 ++---
 3 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 76fb032f..f207dc63 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -228,21 +228,21 @@ def power_calculation(
 
     return dict(power=power)
 
-def sample_size_calculation_for_all_proteins(
-        differentially_expressed_proteins_df: pd.DataFrame,
-        significant_proteins_df: pd.DataFrame,
-        significant_proteins_only: str,
-        metadata_df: pd.DataFrame,
-        fc_threshold: float,
-        alpha: float,
-        power: float,
-        group1: str,
-        group2: str,
-        individual_column: str,
-        select_all_proteins: bool,
-        selected_protein_groups: list,
-        intensity_name: str = None,
 
+def sample_size_calculation_for_all_proteins(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: str,
+    metadata_df: pd.DataFrame,
+    fc_threshold: float,
+    alpha: float,
+    power: float,
+    group1: str,
+    group2: str,
+    individual_column: str,
+    select_all_proteins: bool,
+    selected_protein_groups: list,
+    intensity_name: str = None,
 ) -> dict:
     """
     Function to calculate the required sample size for all proteins in the dataset to achieve the required power.
@@ -262,9 +262,11 @@ def sample_size_calculation_for_all_proteins(
     :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered.
     :param intensity_name: The name of the column containing the protein group intensities.
     """
-    if select_all_proteins and significant_proteins_only == 'No':
-        protein_groups_for_calculation = differentially_expressed_proteins_df["Protein ID"].unique()
-    elif select_all_proteins and significant_proteins_only == 'Yes':
+    if select_all_proteins and significant_proteins_only == "No":
+        protein_groups_for_calculation = differentially_expressed_proteins_df[
+            "Protein ID"
+        ].unique()
+    elif select_all_proteins and significant_proteins_only == "Yes":
         protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique()
     else:
         protein_groups_for_calculation = selected_protein_groups
@@ -293,19 +295,21 @@ def sample_size_calculation_for_all_proteins(
     violin_plot_args = dict(
         meanline_visible=True,
         box_visible=True,
-        scalemode='width',
-        spanmode='hard',
-        span=[0, required_sample_size_for_all_proteins]
+        scalemode="width",
+        spanmode="hard",
+        span=[0, required_sample_size_for_all_proteins],
     )
 
     fig = go.Figure()
 
-    fig.add_trace(go.Violin(
-        x=['Protein group'] * len(required_sample_sizes),
-        y=required_sample_sizes,
-        line_color='red',
-        **violin_plot_args
-    ))
+    fig.add_trace(
+        go.Violin(
+            x=["Protein group"] * len(required_sample_sizes),
+            y=required_sample_sizes,
+            line_color="red",
+            **violin_plot_args
+        )
+    )
     sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
     sample_size_dataframe["Sample Size"] = required_sample_sizes
 
@@ -314,10 +318,11 @@ def sample_size_calculation_for_all_proteins(
         sample_size_dataframe,
         on="Protein ID",
     )
-        #merge["Sample Size"] = required_sample_sizes
+    # merge["Sample Size"] = required_sample_sizes
 
-    return dict(required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
-                plots=[fig],
-                differentially_expressed_proteins_df=differentially_expressed_proteins_df,
-                sample_size_dataframe=sample_size_dataframe,
-                )
+    return dict(
+        required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
+        plots=[fig],
+        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        sample_size_dataframe=sample_size_dataframe,
+    )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 245b885f..229ad415 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -881,6 +881,7 @@ def handle_outputs(self, outputs: dict):
             "required_sample_size"
         ] = f"Required Sample Size: {outputs['required_sample_size']}"
 
+
 class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
     display_name = "Sample Size Calculation for all Proteins"
     operation = "Power Analysis"
@@ -903,7 +904,7 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
     output_keys = [
         "required_sample_size_for_all_proteins",
         "differentially_expressed_proteins_df",
-        "sample_size_dataframe"
+        "sample_size_dataframe",
     ]
 
     def method(self, inputs: dict) -> dict:
@@ -929,4 +930,4 @@ def handle_outputs(self, outputs: dict):
         super().handle_outputs(outputs)
         self.display_output[
             "required_sample_size_for_all_proteins"
-        ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}"
\ No newline at end of file
+        ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}"
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index f37e8e46..3fe7b399 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1259,6 +1259,7 @@ def fill_form(self, run: Run) -> None:
             Step, "corrected_alpha", input_dict_instance_id
         )
 
+
 class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm):
     is_dynamic = True
 
@@ -1301,13 +1302,13 @@ class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm):
         label="Protein groups to calculate sample size for",
     )
 
-   #def __init__(self, *args, **kwargs):
+    # def __init__(self, *args, **kwargs):
     #    super().__init__(*args, **kwargs)
-     #   select_all_proteins = self.data.get("select_all_proteins", True)
-      #  if select_all_proteins == False:
-       #     self.toggle_visibility("selected_protein_groups", True)
-       # else:
-        #    self.toggle_visibility("selected_protein_groups", False)"""
+    #   select_all_proteins = self.data.get("select_all_proteins", True)
+    #  if select_all_proteins == False:
+    #     self.toggle_visibility("selected_protein_groups", True)
+    # else:
+    #    self.toggle_visibility("selected_protein_groups", False)"""
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
@@ -1356,4 +1357,4 @@ def fill_form(self, run: Run) -> None:
         if select_all_proteins == False:
             self.toggle_visibility("selected_protein_groups", True)
         else:
-            self.toggle_visibility("selected_protein_groups", False)
\ No newline at end of file
+            self.toggle_visibility("selected_protein_groups", False)

From 2faa9726a89888d1dd50a954cc462ef0a6cb5683 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 3 Sep 2024 18:37:23 +0200
Subject: [PATCH 22/36] commented the dataframe-output-stuff out, otherwise
 violin plot couldn't be displayed anymore (WIP...)

---
 protzilla/data_analysis/power_analysis.py | 18 ++++++++++--------
 protzilla/methods/data_analysis.py        |  4 ++--
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index f207dc63..bd94996d 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -261,6 +261,8 @@ def sample_size_calculation_for_all_proteins(
     :param select_all_proteins: A boolean indicating whether all proteins should be considered.
     :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered.
     :param intensity_name: The name of the column containing the protein group intensities.
+
+    :return:
     """
     if select_all_proteins and significant_proteins_only == "No":
         protein_groups_for_calculation = differentially_expressed_proteins_df[
@@ -290,13 +292,13 @@ def sample_size_calculation_for_all_proteins(
 
         required_sample_sizes.append(required_sample_size)
 
-    required_sample_size_for_all_proteins = max(required_sample_sizes)
+        required_sample_size_for_all_proteins = max(required_sample_sizes)
 
     violin_plot_args = dict(
         meanline_visible=True,
         box_visible=True,
-        scalemode="width",
-        spanmode="hard",
+        scalemode='width',
+        spanmode='hard',
         span=[0, required_sample_size_for_all_proteins],
     )
 
@@ -310,19 +312,19 @@ def sample_size_calculation_for_all_proteins(
             **violin_plot_args
         )
     )
-    sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
+    """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
     sample_size_dataframe["Sample Size"] = required_sample_sizes
 
     differentially_expressed_proteins_df = pd.merge(
         differentially_expressed_proteins_df,
         sample_size_dataframe,
         on="Protein ID",
-    )
-    # merge["Sample Size"] = required_sample_sizes
+    )"""
+
 
     return dict(
         required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
         plots=[fig],
-        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
-        sample_size_dataframe=sample_size_dataframe,
+        #differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        #sample_size_dataframe=sample_size_dataframe,
     )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 229ad415..ed53f223 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -903,8 +903,8 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
     ]
     output_keys = [
         "required_sample_size_for_all_proteins",
-        "differentially_expressed_proteins_df",
-        "sample_size_dataframe",
+        #"differentially_expressed_proteins_df",
+        #"sample_size_dataframe",
     ]
 
     def method(self, inputs: dict) -> dict:

From 25cf2b2b000271c9bde193ce57c979cd63ea3df8 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 3 Sep 2024 19:59:23 +0200
Subject: [PATCH 23/36] changed color of violinplot and added axis-description

---
 protzilla/data_analysis/power_analysis.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index bd94996d..ec013962 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -5,7 +5,9 @@
 from scipy import stats
 import plotly.express as px
 import plotly.graph_objs as go
+import protzilla.constants.colors as colorscheme
 
+from ..constants.colors import PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
 from protzilla.utilities import default_intensity_column
 
 
@@ -294,24 +296,33 @@ def sample_size_calculation_for_all_proteins(
 
         required_sample_size_for_all_proteins = max(required_sample_sizes)
 
+    colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
+
     violin_plot_args = dict(
         meanline_visible=True,
         box_visible=True,
         scalemode='width',
         spanmode='hard',
         span=[0, required_sample_size_for_all_proteins],
+        fillcolor='rgba(0,0,0,0)'
     )
 
     fig = go.Figure()
 
     fig.add_trace(
         go.Violin(
-            x=["Protein group"] * len(required_sample_sizes),
+            x=["Protein Groups"] * len(required_sample_sizes),
             y=required_sample_sizes,
-            line_color="red",
+            line_color=colors[1],
             **violin_plot_args
         )
     )
+    fig.update_layout(
+        title="Distribution of Required Sample Sizes for All Proteins",
+        xaxis_title="Protein Groups",
+        yaxis_title="Required Sample Size",
+        showlegend=False,
+    )
     """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
     sample_size_dataframe["Sample Size"] = required_sample_sizes
 

From ae4e8cbe539a1847bc0f671de71adc08202f431d Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Thu, 5 Sep 2024 13:10:24 +0200
Subject: [PATCH 24/36] changed color of violinplot and removed
 axis-description

---
 protzilla/data_analysis/power_analysis.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index ec013962..5deca159 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -304,14 +304,13 @@ def sample_size_calculation_for_all_proteins(
         scalemode='width',
         spanmode='hard',
         span=[0, required_sample_size_for_all_proteins],
-        fillcolor='rgba(0,0,0,0)'
     )
 
     fig = go.Figure()
 
     fig.add_trace(
         go.Violin(
-            x=["Protein Groups"] * len(required_sample_sizes),
+            x=[""] * len(required_sample_sizes),
             y=required_sample_sizes,
             line_color=colors[1],
             **violin_plot_args
@@ -319,7 +318,6 @@ def sample_size_calculation_for_all_proteins(
     )
     fig.update_layout(
         title="Distribution of Required Sample Sizes for All Proteins",
-        xaxis_title="Protein Groups",
         yaxis_title="Required Sample Size",
         showlegend=False,
     )

From 5c630081f4c8e0f643f0d64691498dbb174d2861 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Thu, 5 Sep 2024 14:39:35 +0200
Subject: [PATCH 25/36] resolved comments

---
 protzilla/data_analysis/power_analysis.py | 26 +++++++++--------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 5deca159..2307b3f7 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -298,24 +298,19 @@ def sample_size_calculation_for_all_proteins(
 
     colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
 
-    violin_plot_args = dict(
-        meanline_visible=True,
-        box_visible=True,
-        scalemode='width',
-        spanmode='hard',
-        span=[0, required_sample_size_for_all_proteins],
-    )
-
-    fig = go.Figure()
-
-    fig.add_trace(
+    fig = go.Figure(
         go.Violin(
-            x=[""] * len(required_sample_sizes),
+            name="" * len(required_sample_sizes),
             y=required_sample_sizes,
             line_color=colors[1],
-            **violin_plot_args
+            meanline_visible=True,
+            box_visible=True,
+            scalemode="width",
+            spanmode="hard",
+            span=[0, required_sample_size_for_all_proteins],
         )
     )
+
     fig.update_layout(
         title="Distribution of Required Sample Sizes for All Proteins",
         yaxis_title="Required Sample Size",
@@ -330,10 +325,9 @@ def sample_size_calculation_for_all_proteins(
         on="Protein ID",
     )"""
 
-
     return dict(
         required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
         plots=[fig],
-        #differentially_expressed_proteins_df=differentially_expressed_proteins_df,
-        #sample_size_dataframe=sample_size_dataframe,
+        # differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        # sample_size_dataframe=sample_size_dataframe,
     )

From 0adc15c7705dee105ec6e6ef2b19fc43023bdb19 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Thu, 5 Sep 2024 16:20:25 +0200
Subject: [PATCH 26/36] Added function to get dataframes with sample size
 column as output

---
 protzilla/data_analysis/power_analysis.py     | 26 +++++++++++++++----
 protzilla/methods/data_analysis.py            |  5 ++--
 .../power_analysis_validation.py              |  0
 3 files changed, 24 insertions(+), 7 deletions(-)
 rename {protzilla => tests/protzilla}/data_analysis/power_analysis_validation.py (100%)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 2307b3f7..0379f91e 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -316,18 +316,34 @@ def sample_size_calculation_for_all_proteins(
         yaxis_title="Required Sample Size",
         showlegend=False,
     )
-    """sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
+    sample_size_dataframe = pd.DataFrame(protein_groups_for_calculation)
+    sample_size_dataframe.columns = ["Protein ID"]
     sample_size_dataframe["Sample Size"] = required_sample_sizes
 
-    differentially_expressed_proteins_df = pd.merge(
+    if select_all_proteins and significant_proteins_only == "No":
+        differentially_expressed_proteins_df = pd.merge(
         differentially_expressed_proteins_df,
         sample_size_dataframe,
         on="Protein ID",
-    )"""
+    )
+    elif select_all_proteins and significant_proteins_only == "Yes":
+        significant_proteins_df = pd.merge(
+        significant_proteins_df,
+        sample_size_dataframe,
+        on="Protein ID",
+    )
+    else:
+        sample_size_dataframe = pd.merge(
+        sample_size_dataframe,
+        sample_size_dataframe,
+        on="Protein ID",
+    )
+
 
     return dict(
         required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
         plots=[fig],
-        # differentially_expressed_proteins_df=differentially_expressed_proteins_df,
-        # sample_size_dataframe=sample_size_dataframe,
+        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        significant_proteins_df=significant_proteins_df,
+        sample_size_dataframe=sample_size_dataframe,
     )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index ed53f223..3f36ba07 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -903,8 +903,9 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
     ]
     output_keys = [
         "required_sample_size_for_all_proteins",
-        #"differentially_expressed_proteins_df",
-        #"sample_size_dataframe",
+        "differentially_expressed_proteins_df",
+        "sample_size_dataframe",
+        "significant_proteins_df",
     ]
 
     def method(self, inputs: dict) -> dict:
diff --git a/protzilla/data_analysis/power_analysis_validation.py b/tests/protzilla/data_analysis/power_analysis_validation.py
similarity index 100%
rename from protzilla/data_analysis/power_analysis_validation.py
rename to tests/protzilla/data_analysis/power_analysis_validation.py

From dcba877792c4dd4cefdba71063644a8157e64434 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Fri, 6 Sep 2024 14:11:18 +0200
Subject: [PATCH 27/36] Added power_calculation_for_all_proteins to calculate
 minimum power for all proteins

---
 protzilla/data_analysis/power_analysis.py | 122 ++++++++++++++++++++--
 protzilla/methods/data_analysis.py        |  51 +++++++++
 ui/runs/form_mapping.py                   |   1 +
 ui/runs/forms/data_analysis.py            |  90 ++++++++++++++--
 user_data/workflows/standard.yaml         |   3 +
 5 files changed, 253 insertions(+), 14 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 0379f91e..e2286ff5 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -248,7 +248,6 @@ def sample_size_calculation_for_all_proteins(
 ) -> dict:
     """
     Function to calculate the required sample size for all proteins in the dataset to achieve the required power.
-    Variance estimation ...
 
     :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
     :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
@@ -265,7 +264,13 @@ def sample_size_calculation_for_all_proteins(
     :param intensity_name: The name of the column containing the protein group intensities.
 
     :return:
+        - required_sample_size_for_all_proteins: The maximum required sample size for all proteins.
+        - a violin plot showing the distribution of required sample sizes for all proteins.
+        - a df differentially_expressed_proteins_df from t-test output with added sample size column.
+        - a df significant_proteins_df from t-test output with added sample size column.
+        - a df sample_size_dataframe containing the sample sizes for all proteins.
     """
+
     if select_all_proteins and significant_proteins_only == "No":
         protein_groups_for_calculation = differentially_expressed_proteins_df[
             "Protein ID"
@@ -332,18 +337,121 @@ def sample_size_calculation_for_all_proteins(
         sample_size_dataframe,
         on="Protein ID",
     )
+
+    return dict(
+        required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
+        plots=[fig],
+        differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+        significant_proteins_df=significant_proteins_df,
+        sample_size_dataframe=sample_size_dataframe,
+    )
+
+def power_calculation_for_all_proteins(
+    differentially_expressed_proteins_df: pd.DataFrame,
+    significant_proteins_df: pd.DataFrame,
+    significant_proteins_only: str,
+    metadata_df: pd.DataFrame,
+    fc_threshold: float,
+    alpha: float,
+    group1: str,
+    group2: str,
+    individual_column: str,
+    select_all_proteins: bool,
+    selected_protein_groups: list,
+    intensity_name: str = None,
+) -> dict:
+    """
+    Function to calculate the power of the t-test for all proteins in the dataset.
+
+    :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output.
+    :param significant_proteins_df: The dataframe containing the significant proteins from t-test output.
+    :param significant_proteins_only: "Yes" or "No" (a string, not a boolean): whether only significant proteins should be considered.
+    :param metadata_df: The dataframe containing the clinical data.
+    :param fc_threshold: The fold change threshold.
+    :param alpha: The significance level. The value for alpha is taken from the t-test by default.
+    :param group1: The name of the first group.
+    :param group2: The name of the second group.
+    :param individual_column: The name of the column in metadata_df containing the individual ID.
+    :param select_all_proteins: A boolean indicating whether all proteins should be considered.
+    :param selected_protein_groups: A list of selected protein groups, if not all proteins should be considered.
+    :param intensity_name: The name of the column containing the protein group intensities.
+
+    :return: a dict with
+        - power_for_all_proteins: The minimum power over all considered protein groups.
+        - plots: a list holding one violin plot of the per-protein power distribution.
+        - differentially_expressed_proteins_df, significant_proteins_df: the t-test outputs; a "Power" column is merged into the one the protein groups were taken from, otherwise both are returned unchanged.
+        - power_dataframe: a df with the columns "Protein ID" and "Power" for all considered protein groups.
+    """
+    if select_all_proteins and significant_proteins_only == "No":
+        protein_groups_for_calculation = differentially_expressed_proteins_df[
+            "Protein ID"
+        ].unique()
+    elif select_all_proteins and significant_proteins_only == "Yes":
+        protein_groups_for_calculation = significant_proteins_df["Protein ID"].unique()
     else:
-        sample_size_dataframe = pd.merge(
-        sample_size_dataframe,
-        sample_size_dataframe,
-        on="Protein ID",
+        protein_groups_for_calculation = selected_protein_groups
+
+    power_list = []
+
+    for protein_group in protein_groups_for_calculation:
+        power = power_calculation(
+            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+            significant_proteins_df=significant_proteins_df,
+            metadata_df=metadata_df,
+            fc_threshold=fc_threshold,
+            alpha=alpha,
+            group1=group1,
+            group2=group2,
+            selected_protein_group=protein_group,
+            individual_column=individual_column,
+            intensity_name=intensity_name,
+        )["power"]
+
+        power_list.append(power)
+
+    power_for_all_proteins = min(power_list)  # once, after the loop; ValueError if empty
+
+    colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
+
+    fig = go.Figure(
+        go.Violin(
+            name="",
+            y=power_list,
+            line_color=colors[1],
+            meanline_visible=True,
+            box_visible=True,
+            scalemode="width",
+            spanmode="hard",
+            span=[power_for_all_proteins, 1],
+        )
     )
 
+    fig.update_layout(
+        title="Distribution of Power for All Proteins",
+        yaxis_title="Power",
+        showlegend=False,
+    )
+    power_dataframe = pd.DataFrame(protein_groups_for_calculation)
+    power_dataframe.columns = ["Protein ID"]
+    power_dataframe["Power"] = power_list
+
+    if select_all_proteins and significant_proteins_only == "No":
+        differentially_expressed_proteins_df = pd.merge(
+            differentially_expressed_proteins_df,
+            power_dataframe,
+            on="Protein ID",
+        )
+    elif select_all_proteins and significant_proteins_only == "Yes":
+        significant_proteins_df = pd.merge(
+            significant_proteins_df,
+            power_dataframe,
+            on="Protein ID",
+        )
 
     return dict(
-        required_sample_size_for_all_proteins=required_sample_size_for_all_proteins,
+        power_for_all_proteins=power_for_all_proteins,
         plots=[fig],
         differentially_expressed_proteins_df=differentially_expressed_proteins_df,
         significant_proteins_df=significant_proteins_df,
-        sample_size_dataframe=sample_size_dataframe,
+        power_dataframe=power_dataframe,
     )
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 3f36ba07..21c430b5 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -25,6 +25,7 @@
     power_calculation,
     sample_size_calculation,
     sample_size_calculation_for_all_proteins,
+    power_calculation_for_all_proteins,
 )
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.data_analysis.ptm_analysis import (
@@ -932,3 +933,53 @@ def handle_outputs(self, outputs: dict):
         self.display_output[
             "required_sample_size_for_all_proteins"
         ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}"
+
+class PowerAnalysisPowerCalculationForAllProteins(PlotStep):
+    display_name = "Power Calculation for all Proteins"
+    operation = "Power Analysis"
+    method_description = "Calculates power for all proteins"
+
+    input_keys = [
+        "differentially_expressed_proteins_df",
+        "significant_proteins_df",
+        "significant_proteins_only",
+        "fc_threshold",
+        "alpha",
+        "group1",
+        "group2",
+        "individual_column",
+        "metadata_df",
+        "select_all_proteins",
+        "selected_protein_groups",
+    ]
+    output_keys = [
+        "power_for_all_proteins",
+        "differentially_expressed_proteins_df",
+        "power_dataframe",
+        "significant_proteins_df",
+    ]
+
+    def method(self, inputs: dict) -> dict:
+        return power_calculation_for_all_proteins(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
+            Step, "differentially_expressed_proteins_df", inputs["input_dict"]
+        )
+        step = next(
+            s for s in steps.all_steps if s.instance_identifier == inputs["input_dict"]
+        )
+        inputs["significant_proteins_df"] = steps.get_step_output(
+            Step, "significant_proteins_df", inputs["input_dict"]
+        )
+        inputs["metadata_df"] = steps.metadata_df
+        inputs["alpha"] = step.inputs["alpha"]
+        inputs["group1"] = step.inputs["group1"]
+        inputs["group2"] = step.inputs["group2"]
+        return inputs
+
+    def handle_outputs(self, outputs: dict):
+        super().handle_outputs(outputs)
+        self.display_output[
+            "power_for_all_proteins"
+        ] = f"Power for all Proteins: {outputs['power_for_all_proteins']}"
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 90cf6d43..82e9511b 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -64,6 +64,7 @@
     data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm,
     data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm,
     data_analysis.PowerAnalysisSampleSizeCalculationForAllProteins: data_analysis_forms.PowerAnalysisSampleSizeCalculationForAllProteinsForm,
+    data_analysis.PowerAnalysisPowerCalculationForAllProteins: data_analysis_forms.PowerAnalysisPowerCalculationForAllProteinsForm,
     data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm,
     data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
     data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 3fe7b399..093a5fb7 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1302,13 +1302,89 @@ class PowerAnalysisSampleSizeCalculationForAllProteinsForm(MethodForm):
         label="Protein groups to calculate sample size for",
     )
 
-    # def __init__(self, *args, **kwargs):
-    #    super().__init__(*args, **kwargs)
-    #   select_all_proteins = self.data.get("select_all_proteins", True)
-    #  if select_all_proteins == False:
-    #     self.toggle_visibility("selected_protein_groups", True)
-    # else:
-    #    self.toggle_visibility("selected_protein_groups", False)"""
+    def fill_form(self, run: Run) -> None:
+        self.fields["input_dict"].choices = fill_helper.to_choices(
+            run.steps.get_instance_identifiers(
+                DifferentialExpressionTTest,
+                "differentially_expressed_proteins_df",
+            )
+        )
+        input_dict_instance_id = self.data.get(
+            "input_dict", self.fields["input_dict"].choices[0][0]
+        )
+        self.fields["alpha"].initial = run.steps.get_step_output(
+            Step, "corrected_alpha", input_dict_instance_id
+        )
+        self.fields["individual_column"].choices = [
+            ("None", "None")
+        ] + fill_helper.get_choices_for_metadata_all_columns(run)
+        individual_column = self.data.get("individual_column", "None")
+        self.fields["individual_column"].initial = individual_column
+
+        significant_proteins_only = self.data.get(
+            "significant_proteins_only",
+            self.fields["significant_proteins_only"].choices[0][0],
+        )
+
+        if significant_proteins_only == YesNo.yes:
+            self.fields["selected_protein_groups"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "significant_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        else:
+            self.fields["selected_protein_groups"].choices = fill_helper.to_choices(
+                run.steps.get_step_output(
+                    Step, "differentially_expressed_proteins_df", input_dict_instance_id
+                )["Protein ID"].unique()
+            )
+        if not self.data:
+            select_all_proteins = True
+        else:
+            if "select_all_proteins" in self.data:
+                select_all_proteins = True
+            else:
+                select_all_proteins = False
+
+        if select_all_proteins == False:
+            self.toggle_visibility("selected_protein_groups", True)
+        else:
+            self.toggle_visibility("selected_protein_groups", False)
+
+class PowerAnalysisPowerCalculationForAllProteinsForm(MethodForm):
+    is_dynamic = True
+
+    input_dict = CustomChoiceField(
+        choices=[],
+        label="Input data dict (generated e.g. by t-Test)",
+    )
+    alpha = CustomFloatField(
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.05,
+        initial=0.05,
+    )
+    fc_threshold = CustomFloatField(
+        label="Log2 fold change threshold", min_value=0, initial=1
+    )
+    individual_column = CustomChoiceField(
+        choices=[],
+        label="Column name for individuals in metadata, if it exists (mean value will be calculated per individual)",
+    )
+    significant_proteins_only = CustomChoiceField(
+        choices=YesNo,
+        label="Select only significant proteins",
+        initial=YesNo.yes,
+    )
+    select_all_proteins = CustomBooleanField(
+        label="Select all proteins",
+        initial=True,
+    )
+    selected_protein_groups = CustomMultipleChoiceField(
+        choices=[],
+        label="Protein groups to calculate sample size for",
+    )
 
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml
index abd07c9b..b2bb0d5d 100644
--- a/user_data/workflows/standard.yaml
+++ b/user_data/workflows/standard.yaml
@@ -67,6 +67,9 @@ steps:
   - form_inputs: {}
     inputs: { }
     type: PowerAnalysisSampleSizeCalculationForAllProteins
+  - form_inputs: { }
+    inputs: { }
+    type: PowerAnalysisPowerCalculationForAllProteins
   - form_inputs:
       fc_threshold: 1
     inputs: { }

From eb3298453cce4250d39c8b44edf61fc77b2f1ce5 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Sun, 8 Sep 2024 14:59:11 +0200
Subject: [PATCH 28/36] Fixed hover display of violin plots

---
 protzilla/data_analysis/power_analysis.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index e2286ff5..e4fd2c9a 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -313,6 +313,7 @@ def sample_size_calculation_for_all_proteins(
             scalemode="width",
             spanmode="hard",
             span=[0, required_sample_size_for_all_proteins],
+            hoverinfo="y",
         )
     )
 
@@ -423,6 +424,7 @@ def power_calculation_for_all_proteins(
             scalemode="width",
             spanmode="hard",
             span=[power_for_all_proteins, 1],
+            hoverinfo="y"
         )
     )
 

From 1adda1ba676a2a452edf3578bdb21fd3c23fcf1a Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 8 Oct 2024 19:05:28 +0200
Subject: [PATCH 29/36] fixed typo and removed unnecessary comment

---
 protzilla/data_analysis/power_analysis.py | 13 -------------
 ui/runs/forms/data_analysis.py            |  2 +-
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index e4fd2c9a..304f2823 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -103,7 +103,6 @@ def sample_size_calculation(
             metadata_df[["Sample", individual_column]],
             on="Sample",
         )
-        # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample")
 
         filtered_protein_group_df = (
             filtered_protein_group_merged_df.groupby(
@@ -183,7 +182,6 @@ def power_calculation(
             metadata_df[["Sample", individual_column]],
             on="Sample",
         )
-        # filtered_protein_group_df.join(metadata_df[["Sample", individual_column]].set_index("Sample"), on="Sample")
 
         filtered_protein_group_df = (
             filtered_protein_group_merged_df.groupby(
@@ -204,17 +202,6 @@ def power_calculation(
         intensity_name=intensity_name,
     )
 
-    """
-    filtered_df = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == protein_group]
-    filtered_df["Person"] = filtered_df["Sample"].apply(
-        lambda x: x[:7])
-
-    variance = filtered_df.groupby(['Person', 'Group'])['Normalised iBAQ'].var().reset_index()
-
-    filtered_df["Measurement"] = filtered_df["Sample"].apply(
-        lambda x: int(x[-2:]))
-    """
-
     group_count_df = filtered_protein_group_df.groupby(["Group", "Protein ID"])[
         "Sample"
     ].count()
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 093a5fb7..0798fdde 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1383,7 +1383,7 @@ class PowerAnalysisPowerCalculationForAllProteinsForm(MethodForm):
     )
     selected_protein_groups = CustomMultipleChoiceField(
         choices=[],
-        label="Protein groups to calculate sample size for",
+        label="Protein groups to calculate power for",
     )
 
     def fill_form(self, run: Run) -> None:

From d0ec1749ea51a78faad7e86b9e25f7c08a3cd071 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 29 Oct 2024 22:38:05 +0100
Subject: [PATCH 30/36] calculations for thesis (should be removed before
 merging into dev)

---
 protzilla/data_analysis/power_analysis.py     | 77 ++++++++++++++++++-
 protzilla/methods/data_analysis.py            |  1 +
 .../data_analysis/test_power_analysis.py      |  3 +-
 3 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 304f2823..0ad98566 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -126,7 +126,8 @@ def sample_size_calculation(
     required_sample_size = math.ceil(required_sample_size)
     print(required_sample_size)
 
-    return dict(required_sample_size=required_sample_size)
+    return dict(required_sample_size=required_sample_size,
+                variance_protein_group=variance_protein_group) #TODO: remove this line before merging into main
 
 
 def power_calculation(
@@ -217,7 +218,6 @@ def power_calculation(
 
     return dict(power=power)
 
-
 def sample_size_calculation_for_all_proteins(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
@@ -288,6 +288,56 @@ def sample_size_calculation_for_all_proteins(
 
         required_sample_size_for_all_proteins = max(required_sample_sizes)
 
+    #TODO: remove before merging into main
+    required_sample_size_above_threshold = []
+
+    for protein_group in protein_groups_for_calculation:
+        required_sample_size = sample_size_calculation(
+            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+            significant_proteins_df=significant_proteins_df,
+            metadata_df=metadata_df,
+            fc_threshold=fc_threshold,
+            alpha=alpha,
+            power=power,
+            group1=group1,
+            group2=group2,
+            selected_protein_group=protein_group,
+            individual_column=individual_column,
+            intensity_name=intensity_name,
+        )["required_sample_size"]
+
+        if required_sample_size > 44:
+            required_sample_size_above_threshold.append({"Protein ID": protein_group, "Required Sample Size": required_sample_size})
+
+    num_proteins_above_threshold = len(required_sample_size_above_threshold)
+    print(num_proteins_above_threshold)
+    num_required_sample_sizes = len(required_sample_sizes)
+    print(num_required_sample_sizes)
+
+
+    variance_protein_group_all = []
+    for protein_group in protein_groups_for_calculation:
+        variance_protein_group = sample_size_calculation(
+            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+            significant_proteins_df=significant_proteins_df,
+            metadata_df=metadata_df,
+            fc_threshold=fc_threshold,
+            alpha=alpha,
+            power=power,
+            group1=group1,
+            group2=group2,
+            selected_protein_group=protein_group,
+            individual_column=individual_column,
+            intensity_name=intensity_name,
+        )["variance_protein_group"]
+
+        variance_protein_group_all.append(variance_protein_group)
+
+    variance_mean = np.mean(variance_protein_group_all)
+    print(variance_mean)
+
+    #end of lines that should be removed before merging
+
     colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
 
     fig = go.Figure(
@@ -399,6 +449,29 @@ def power_calculation_for_all_proteins(
 
         power_for_all_proteins = min(power_list)
 
+    power_below_threshold = []
+    for protein_group in protein_groups_for_calculation:
+        power = power_calculation(
+            differentially_expressed_proteins_df=differentially_expressed_proteins_df,
+            significant_proteins_df=significant_proteins_df,
+            metadata_df=metadata_df,
+            fc_threshold=fc_threshold,
+            alpha=alpha,
+            group1=group1,
+            group2=group2,
+            selected_protein_group=protein_group,
+            individual_column=individual_column,
+            intensity_name=intensity_name,
+        )["power"]
+        power_list.append({"Protein ID": protein_group, "Power": power})
+        if power < 0.8:
+            power_below_threshold.append({"Protein ID": protein_group, "Power": power})
+    num_proteins_below_threshold = len(power_below_threshold)
+    print(num_proteins_below_threshold)
+    num_power_list = len(power_list)
+    print(num_power_list)
+
+
     colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
 
     fig = go.Figure(
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 21c430b5..38b1fb9f 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -855,6 +855,7 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     ]
     output_keys = [
         "required_sample_size",
+        "variance_protein_group", #TODO: remove this line before merging into main
     ]
 
     def method(self, inputs: dict) -> dict:
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index eb563d68..0026621b 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -9,7 +9,7 @@
     sample_size_calculation,
     variance_protein_group_calculation_max,
 )
-from protzilla.data_analysis.power_analysis_validation import (
+from tests.protzilla.data_analysis.power_analysis_validation import (
     check_sample_size_calculation_with_libfunc,
     check_sample_size_calculation_implemented,
     check_sample_size_calculation_implemented_without_log,
@@ -92,6 +92,7 @@ def test_sample_size_calculation(power_test_data, diff_expr_test_data):
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
+
 def test_power_calculation(power_test_data, diff_expr_test_data):
     test_alpha = 0.05
     test_fc_threshold = 1

From 776dc55d49a0c0986c0affd0844c1df7d0e01214 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 4 Nov 2024 03:19:59 +0100
Subject: [PATCH 31/36] calculations for thesis (should be removed before
 merging into dev)

---
 .../power_analysis_validation.py              |   4 +-
 .../data_analysis/test_power_analysis.py      | 168 +++++++++++++++---
 2 files changed, 147 insertions(+), 25 deletions(-)

diff --git a/tests/protzilla/data_analysis/power_analysis_validation.py b/tests/protzilla/data_analysis/power_analysis_validation.py
index 8351202d..09586de8 100644
--- a/tests/protzilla/data_analysis/power_analysis_validation.py
+++ b/tests/protzilla/data_analysis/power_analysis_validation.py
@@ -75,7 +75,7 @@ def check_sample_size_calculation_with_libfunc(
     # impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534
 
 
-def check_sample_size_calculation_implemented(
+def check_sample_size_calculation_protzilla(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
     fc_threshold: float,
@@ -134,7 +134,7 @@ def check_sample_size_calculation_implemented(
     return dict(required_sample_size=required_sample_size)
 
 
-def check_sample_size_calculation_implemented_without_log(
+def check_sample_size_calculation_protzilla_without_log(
     differentially_expressed_proteins_df: pd.DataFrame,
     significant_proteins_df: pd.DataFrame,
     fc_threshold: float,
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index 0026621b..76cd1b82 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -11,8 +11,8 @@
 )
 from tests.protzilla.data_analysis.power_analysis_validation import (
     check_sample_size_calculation_with_libfunc,
-    check_sample_size_calculation_implemented,
-    check_sample_size_calculation_implemented_without_log,
+    check_sample_size_calculation_protzilla,
+    check_sample_size_calculation_protzilla_without_log,
 )
 from test_differential_expression import diff_expr_test_data
 
@@ -24,7 +24,7 @@ def power_test_data():
         ["Sample1", "Protein2", "Gene1", 16, "Group1"],
         ["Sample1", "Protein3", "Gene1", 1, "Group1"],
         ["Sample1", "Protein4", "Gene1", 14, "Group1"],
-        ["Sample2", "Protein1", "Gene1", 20, "Group1"],
+        ["Sample2", "Protein1", "Gene1", 19, "Group1"],
         ["Sample2", "Protein2", "Gene1", 15, "Group1"],
         ["Sample2", "Protein3", "Gene1", 2, "Group1"],
         ["Sample2", "Protein4", "Gene1", 15, "Group1"],
@@ -32,18 +32,101 @@ def power_test_data():
         ["Sample3", "Protein2", "Gene1", 14, "Group1"],
         ["Sample3", "Protein3", "Gene1", 3, "Group1"],
         ["Sample3", "Protein4", "Gene1", 16, "Group1"],
-        ["Sample4", "Protein1", "Gene1", 8, "Group2"],
-        ["Sample4", "Protein2", "Gene1", 15, "Group2"],
-        ["Sample4", "Protein3", "Gene1", 1, "Group2"],
-        ["Sample4", "Protein4", "Gene1", 9, "Group2"],
-        ["Sample5", "Protein1", "Gene1", 10, "Group2"],
-        ["Sample5", "Protein2", "Gene1", 14, "Group2"],
-        ["Sample5", "Protein3", "Gene1", 2, "Group2"],
-        ["Sample5", "Protein4", "Gene1", 10, "Group2"],
-        ["Sample6", "Protein1", "Gene1", 12, "Group2"],
-        ["Sample6", "Protein2", "Gene1", 13, "Group2"],
-        ["Sample6", "Protein3", "Gene1", 3, "Group2"],
-        ["Sample6", "Protein4", "Gene1", 11, "Group2"],
+        ["Sample4", "Protein1", "Gene1", 16, "Group1"],
+        ["Sample4", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample4", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample4", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample5", "Protein1", "Gene1", 24, "Group1"],
+        ["Sample5", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample5", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample5", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample6", "Protein1", "Gene1", 21, "Group1"],
+        ["Sample6", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample6", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample6", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample7", "Protein1", "Gene1", 8, "Group2"],
+        ["Sample7", "Protein2", "Gene1", 15, "Group2"],
+        ["Sample7", "Protein3", "Gene1", 1, "Group2"],
+        ["Sample7", "Protein4", "Gene1", 9, "Group2"],
+        ["Sample8", "Protein1", "Gene1", 9, "Group2"],
+        ["Sample8", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample8", "Protein3", "Gene1", 2, "Group2"],
+        ["Sample8", "Protein4", "Gene1", 10, "Group2"],
+        ["Sample9", "Protein1", "Gene1", 12, "Group2"],
+        ["Sample9", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample9", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample9", "Protein4", "Gene1", 11, "Group2"],
+        ["Sample10", "Protein1", "Gene1", 6, "Group2"],
+        ["Sample10", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample10", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample10", "Protein4", "Gene1", 11, "Group2"],
+        ["Sample11", "Protein1", "Gene1", 14, "Group2"],
+        ["Sample11", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample11", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample11", "Protein4", "Gene1", 11, "Group2"],
+        ["Sample12", "Protein1", "Gene1", 11, "Group2"],
+        ["Sample12", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample12", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample12", "Protein4", "Gene1", 11, "Group2"],
+    )
+
+    test_differentially_expressed_proteins_df = pd.DataFrame(
+        data=test_differentially_expressed_proteins_list,
+        columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"],
+    )
+    return test_differentially_expressed_proteins_df
+
+@pytest.fixture
+def power_test_data_intensity_values():
+    test_differentially_expressed_proteins_list = (
+        ["Sample1", "Protein1", "Gene1", -1.56714, "Group1"],
+        ["Sample1", "Protein2", "Gene1", 16, "Group1"],
+        ["Sample1", "Protein3", "Gene1", 1, "Group1"],
+        ["Sample1", "Protein4", "Gene1", 14, "Group1"],
+        ["Sample2", "Protein1", "Gene1", -0.37691, "Group1"],
+        ["Sample2", "Protein2", "Gene1", 15, "Group1"],
+        ["Sample2", "Protein3", "Gene1", 2, "Group1"],
+        ["Sample2", "Protein4", "Gene1", 15, "Group1"],
+        ["Sample3", "Protein1", "Gene1", 0.38817, "Group1"],
+        ["Sample3", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample3", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample3", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample4", "Protein1", "Gene1", 1.6, "Group1"],
+        ["Sample4", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample4", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample4", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample5", "Protein1", "Gene1", 1.9, "Group1"],
+        ["Sample5", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample5", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample5", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample6", "Protein1", "Gene1", -0.07, "Group1"],
+        ["Sample6", "Protein2", "Gene1", 14, "Group1"],
+        ["Sample6", "Protein3", "Gene1", 3, "Group1"],
+        ["Sample6", "Protein4", "Gene1", 16, "Group1"],
+        ["Sample7", "Protein1", "Gene1", 0.9819, "Group2"],
+        ["Sample7", "Protein2", "Gene1", 15, "Group2"],
+        ["Sample7", "Protein3", "Gene1", 1, "Group2"],
+        ["Sample7", "Protein4", "Gene1", 9, "Group2"],
+        ["Sample8", "Protein1", "Gene1", -0.26, "Group2"],
+        ["Sample8", "Protein2", "Gene1", 13, "Group2"],
+        ["Sample8", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample8", "Protein4", "Gene1", 11, "Group2"],
+        ["Sample9", "Protein1", "Gene1", 1.116, "Group2"],
+        ["Sample9", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample9", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample9", "Protein4", "Gene1", 16, "Group2"],
+        ["Sample10", "Protein1", "Gene1", 0.81, "Group2"],
+        ["Sample10", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample10", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample10", "Protein4", "Gene1", 16, "Group2"],
+        ["Sample11", "Protein1", "Gene1", 1.336, "Group2"],
+        ["Sample11", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample11", "Protein3", "Gene1", 3, "Group2"],
+        ["Sample11", "Protein4", "Gene1", 16, "Group2"],
+        ["Sample12", "Protein1", "Gene1", 1.81, "Group2"],
+        ["Sample12", "Protein2", "Gene1", 14, "Group2"],
+        ["Sample12", "Protein3", "Gene1", 2, "Group2"],
+        ["Sample12", "Protein4", "Gene1", 10, "Group2"],
     )
 
     test_differentially_expressed_proteins_df = pd.DataFrame(
@@ -116,6 +199,49 @@ def test_power_calculation(power_test_data, diff_expr_test_data):
     power_int = next(iter(power.values()), None)
     assert power_int == 0.09
 
+#TODO: The following tests has been used for thesis calculations. Should not be merged to dev branch.
+
+def test_check_sample_size_calculation_with_libfun_intensity_values(power_test_data_intensity_values):
+    test_alpha = 0.05
+    test_power = 0.8
+    test_fc_threshold = 5
+    test_selected_protein_group = "Protein1"
+
+    required_sample_size = check_sample_size_calculation_with_libfunc(
+        differentially_expressed_proteins_df=power_test_data_intensity_values,
+        significant_proteins_df=power_test_data_intensity_values,
+        fc_threshold=test_fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        intensity_name=None,
+    )
+    print(required_sample_size)
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
+    assert required_sample_size_int == 63
+
+def test_check_sample_size_calculation_protzilla_intensity_values(power_test_data_intensity_values):
+    test_alpha = 0.05
+    test_power = 0.8
+    test_fc_threshold = 1
+    test_selected_protein_group = "Protein1"
+
+    required_sample_size = check_sample_size_calculation_protzilla(
+        differentially_expressed_proteins_df=power_test_data_intensity_values,
+        significant_proteins_df=power_test_data_intensity_values,
+        fc_threshold=test_fc_threshold,
+        power=test_power,
+        alpha=test_alpha,
+        group1="Group1",
+        group2="Group2",
+        selected_protein_group=test_selected_protein_group,
+        intensity_name=None,
+    )
+    print(required_sample_size)
+    required_sample_size_int = next(iter(required_sample_size.values()), None)
+    assert required_sample_size_int == 1
 
 def test_check_sample_size_calculation_with_libfun(power_test_data):
     test_alpha = 0.05
@@ -132,15 +258,13 @@ def test_check_sample_size_calculation_with_libfun(power_test_data):
         group1="Group1",
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
-        significant_proteins_only=False,
         intensity_name=None,
     )
     print(required_sample_size)
     required_sample_size_int = next(iter(required_sample_size.values()), None)
     assert required_sample_size_int == 63
 
-
-def test_check_sample_size_calculation_impl(power_test_data):
+def test_check_sample_size_calculation_protzilla(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     power_test_data_log2 = power_test_data.copy()
@@ -150,7 +274,7 @@ def test_check_sample_size_calculation_impl(power_test_data):
     fc_threshold = 1
     test_selected_protein_group = "Protein1"
 
-    required_sample_size = check_sample_size_calculation_implemented(
+    required_sample_size = check_sample_size_calculation_protzilla(
         differentially_expressed_proteins_df=power_test_data_log2,
         significant_proteins_df=power_test_data,
         fc_threshold=fc_threshold,
@@ -159,7 +283,6 @@ def test_check_sample_size_calculation_impl(power_test_data):
         group1="Group1",
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
-        significant_proteins_only=False,
         intensity_name=None,
     )
     print(required_sample_size)
@@ -167,13 +290,13 @@ def test_check_sample_size_calculation_impl(power_test_data):
     assert required_sample_size_int == 1
 
 
-def test_check_sample_size_calculation_implemented_without_log(power_test_data):
+def test_check_sample_size_calculation_protzilla_without_log(power_test_data):
     test_alpha = 0.05
     test_power = 0.8
     test_fc_threshold = 5
     test_selected_protein_group = "Protein1"
 
-    required_sample_size = check_sample_size_calculation_implemented_without_log(
+    required_sample_size = check_sample_size_calculation_protzilla_without_log(
         differentially_expressed_proteins_df=power_test_data,
         significant_proteins_df=power_test_data,
         fc_threshold=test_fc_threshold,
@@ -182,7 +305,6 @@ def test_check_sample_size_calculation_implemented_without_log(power_test_data):
         group1="Group1",
         group2="Group2",
         selected_protein_group=test_selected_protein_group,
-        significant_proteins_only=False,
         intensity_name=None,
     )
     print(required_sample_size)

From 7131d3b043bf38fff5301bd01cdb59f4c566bb79 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Mon, 18 Nov 2024 03:22:16 +0100
Subject: [PATCH 32/36] commented out the thesis calculation and changed
 the descriptions of the "... for All Proteins" methods

---
 protzilla/data_analysis/power_analysis.py | 3 ++-
 protzilla/methods/data_analysis.py        | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 0ad98566..932d49c2 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -449,6 +449,7 @@ def power_calculation_for_all_proteins(
 
         power_for_all_proteins = min(power_list)
 
+    """
     power_below_threshold = []
     for protein_group in protein_groups_for_calculation:
         power = power_calculation(
@@ -470,7 +471,7 @@ def power_calculation_for_all_proteins(
     print(num_proteins_below_threshold)
     num_power_list = len(power_list)
     print(num_power_list)
-
+    """
 
     colors = colorscheme.PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 38b1fb9f..09183b27 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -885,9 +885,9 @@ def handle_outputs(self, outputs: dict):
 
 
 class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
-    display_name = "Sample Size Calculation for all Proteins"
+    display_name = "Sample Size Calculation for All Proteins"
     operation = "Power Analysis"
-    method_description = "Calculates sample size for all proteins"
+    method_description = "Calculates sample size for a selected group of proteins and returns the maximum required sample size."
 
     input_keys = [
         "differentially_expressed_proteins_df",
@@ -936,9 +936,9 @@ def handle_outputs(self, outputs: dict):
         ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}"
 
 class PowerAnalysisPowerCalculationForAllProteins(PlotStep):
-    display_name = "Power Calculation for all Proteins"
+    display_name = "Power Calculation for All Proteins"
     operation = "Power Analysis"
-    method_description = "Calculates power for all proteins"
+    method_description = "Calculates power for a selected group of proteins and returns the minimum power."
 
     input_keys = [
         "differentially_expressed_proteins_df",

From 6e2daa3d10eae6a3fd0bddfd7c9b8a176fa21413 Mon Sep 17 00:00:00 2001
From: selenabr <116892527+selenabr@users.noreply.github.com>
Date: Mon, 18 Nov 2024 03:53:24 +0100
Subject: [PATCH 33/36] Add files via upload

Metadata file with an additional `Individual` column that identifies the individual each sample was taken from.
---
 meta_individual_column.csv | 144 +++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 meta_individual_column.csv

diff --git a/meta_individual_column.csv b/meta_individual_column.csv
new file mode 100644
index 00000000..82053971
--- /dev/null
+++ b/meta_individual_column.csv
@@ -0,0 +1,144 @@
+Sample,Group,Batch,Individual
+AD01_C1_INSOLUBLE_01,AD,C1,AD01_
+AD01_C1_INSOLUBLE_02,AD,C1,AD01_
+AD01_C1_INSOLUBLE_03,AD,C1,AD01_
+AD01_C2_INSOLUBLE_01,AD,C2,AD01_
+AD02_C1_INSOLUBLE_01,AD,C1,AD02_
+AD02_C1_INSOLUBLE_02,AD,C1,AD02_
+AD02_C2_INSOLUBLE_01,AD,C2,AD02_
+AD03_C1_INSOLUBLE_01,AD,C1,AD03_
+AD03_C1_INSOLUBLE_02,AD,C1,AD03_
+AD03_C1_INSOLUBLE_03,AD,C1,AD03_
+AD03_C2_INSOLUBLE_01,AD,C2,AD03_
+AD04_C1_INSOLUBLE_01,AD,C1,AD04_
+AD04_C2_INSOLUBLE_01,AD,C2,AD04_
+AD05_C2_INSOLUBLE_01,AD,C2,AD05_
+AD06_C1_INSOLUBLE_01,AD,C1,AD06_
+AD07_C1_INSOLUBLE_01,AD,C1,AD07_
+AD07_C1_INSOLUBLE_02,AD,C1,AD07_
+AD07_C1_INSOLUBLE_03,AD,C1,AD07_
+AD07_C2_INSOLUBLE_01,AD,C2,AD07_
+AD08_C2_INSOLUBLE_01,AD,C2,AD08_
+AD09_C1_INSOLUBLE_01,AD,C1,AD09_
+AD10_C1_INSOLUBLE_01,AD,C1,AD10_
+AD10_C2_INSOLUBLE_01,AD,C2,AD10_
+AD11_C2_INSOLUBLE_01,AD,C2,AD11_
+AD12_C2_INSOLUBLE_01,AD,C2,AD12_
+AD13_C2_INSOLUBLE_01,AD,C2,AD13_
+AD14_C2_INSOLUBLE_01,AD,C2,AD14_
+AD15_C2_INSOLUBLE_01,AD,C2,AD15_
+AD16_C2_INSOLUBLE_01,AD,C2,AD16_
+AD17_C2_INSOLUBLE_01,AD,C2,AD17_
+AD18_C2_INSOLUBLE_01,AD,C2,AD18_
+AD19_C2_INSOLUBLE_01,AD,C2,AD19_
+AD20_C1_INSOLUBLE_01,AD,C1,AD20_
+AD21_C1_INSOLUBLE_01,AD,C1,AD21_
+AD21_C2_INSOLUBLE_01,AD,C2,AD21_
+AD22_C1_INSOLUBLE_01,AD,C1,AD22_
+AD23_C1_INSOLUBLE_01,AD,C1,AD23_
+AD23_C1_INSOLUBLE_02,AD,C1,AD23_
+AD23_C2_INSOLUBLE_01,AD,C2,AD23_
+AD24_C1_INSOLUBLE_01,AD,C1,AD24_
+AD24_C1_INSOLUBLE_02,AD,C1,AD24_
+AD25_C1_INSOLUBLE_01,AD,C1,AD25_
+AD26_C1_INSOLUBLE_01,AD,C1,AD26_
+AD27_C1_INSOLUBLE_01,AD,C1,AD27_
+AD27_C1_INSOLUBLE_02,AD,C1,AD27_
+AD28_C2_INSOLUBLE_01,AD,C2,AD28_
+AD29_C1_INSOLUBLE_01,AD,C1,AD29_
+AD30_C1_INSOLUBLE_01,AD,C1,AD30_
+AD30_C1_INSOLUBLE_02,AD,C1,AD30_
+AD30_C2_INSOLUBLE_01,AD,C2,AD30_
+AD31_C2_INSOLUBLE_01,AD,C2,AD31_
+AD32_C2_INSOLUBLE_01,AD,C2,AD32_
+AD33_C2_INSOLUBLE_01,AD,C2,AD33_
+AD34_C1_INSOLUBLE_01,AD,C1,AD34_
+AD34_C1_INSOLUBLE_02,AD,C1,AD34_
+AD35_C1_INSOLUBLE_01,AD,C1,AD35_
+AD35_C1_INSOLUBLE_02,AD,C1,AD35_
+AD36_C1_INSOLUBLE_01,AD,C1,AD36_
+AD37_C1_INSOLUBLE_01,AD,C1,AD37_
+AD37_C2_INSOLUBLE_01,AD,C2,AD37_
+AD38_C1_INSOLUBLE_01,AD,C1,AD38_
+AD38_C1_INSOLUBLE_02,AD,C1,AD38_
+AD38_C1_INSOLUBLE_03,AD,C1,AD38_
+AD39_C2_INSOLUBLE_01,AD,C2,AD39_
+AD40_C2_INSOLUBLE_01,AD,C2,AD40_
+AD41_C2_INSOLUBLE_01,AD,C2,AD41_
+AD42_C2_INSOLUBLE_01,AD,C2,AD42_
+AD43_C1_INSOLUBLE_01,AD,C1,AD43_
+AD44_C1_INSOLUBLE_01,AD,C1,AD44_
+AD44_C1_INSOLUBLE_02,AD,C1,AD44_
+AD44_C1_INSOLUBLE_03,AD,C1,AD44_
+AD44_C1_INSOLUBLE_04,AD,C1,AD44_
+AD45_C1_INSOLUBLE_01,AD,C1,AD45_
+AD45_C1_INSOLUBLE_02,AD,C1,AD45_
+AD46_C1_INSOLUBLE_01,AD,C1,AD46_
+AD46_C1_INSOLUBLE_02,AD,C1,AD46_
+AD46_C1_INSOLUBLE_03,AD,C1,AD46_
+AD46_C2_INSOLUBLE_01,AD,C2,AD46_
+AD47_C1_INSOLUBLE_01,AD,C1,AD47_
+AD48_C2_INSOLUBLE_01,AD,C2,AD48_
+AD49_C2_INSOLUBLE_01,AD,C2,AD49_
+CTR01_C1_INSOLUBLE_01,CTR,C1,CTR01
+CTR02_C1_INSOLUBLE_01,CTR,C1,CTR02
+CTR03_C1_INSOLUBLE_01,CTR,C1,CTR03
+CTR04_C1_INSOLUBLE_01,CTR,C1,CTR04
+CTR05_C2_INSOLUBLE_01,CTR,C2,CTR05
+CTR06_C2_INSOLUBLE_01,CTR,C2,CTR06
+CTR07_C1_INSOLUBLE_01,CTR,C1,CTR07
+CTR08_C1_INSOLUBLE_01,CTR,C1,CTR08
+CTR08_C2_INSOLUBLE_01,CTR,C2,CTR08
+CTR09_C2_INSOLUBLE_01,CTR,C2,CTR09
+CTR10_C1_INSOLUBLE_01,CTR,C1,CTR10
+CTR10_C2_INSOLUBLE_01,CTR,C2,CTR10
+CTR11_C2_INSOLUBLE_01,CTR,C2,CTR11
+CTR12_C2_INSOLUBLE_01,CTR,C2,CTR12
+CTR13_C2_INSOLUBLE_01,CTR,C2,CTR13
+CTR14_C2_INSOLUBLE_01,CTR,C2,CTR14
+CTR15_C2_INSOLUBLE_01,CTR,C2,CTR15
+CTR16_C2_INSOLUBLE_01,CTR,C2,CTR16
+CTR17_C2_INSOLUBLE_01,CTR,C2,CTR17
+CTR18_C2_INSOLUBLE_01,CTR,C2,CTR18
+CTR19_C1_INSOLUBLE_01,CTR,C1,CTR19
+CTR20_C1_INSOLUBLE_01,CTR,C1,CTR20
+CTR21_C2_INSOLUBLE_01,CTR,C2,CTR21
+CTR22_C2_INSOLUBLE_01,CTR,C2,CTR22
+CTR23_C2_INSOLUBLE_01,CTR,C2,CTR23
+CTR24_C1_INSOLUBLE_01,CTR,C1,CTR24
+CTR25_C1_INSOLUBLE_01,CTR,C1,CTR25
+CTR26_C2_INSOLUBLE_01,CTR,C2,CTR26
+CTR27_C1_INSOLUBLE_01,CTR,C1,CTR27
+CTR28_C1_INSOLUBLE_01,CTR,C1,CTR28
+CTR28_C1_INSOLUBLE_02,CTR,C1,CTR28
+CTR28_C2_INSOLUBLE_01,CTR,C2,CTR28
+CTR29_C1_INSOLUBLE_01,CTR,C1,CTR29
+CTR29_C1_INSOLUBLE_02,CTR,C1,CTR29
+CTR29_C1_INSOLUBLE_03,CTR,C1,CTR29
+CTR30_C1_INSOLUBLE_01,CTR,C1,CTR30
+CTR30_C1_INSOLUBLE_02,CTR,C1,CTR30
+CTR30_C2_INSOLUBLE_01,CTR,C2,CTR30
+CTR31_C1_INSOLUBLE_01,CTR,C1,CTR31
+CTR31_C2_INSOLUBLE_01,CTR,C2,CTR31
+CTR32_C1_INSOLUBLE_01,CTR,C1,CTR32
+CTR32_C2_INSOLUBLE_01,CTR,C2,CTR32
+CTR33_C1_INSOLUBLE_01,CTR,C1,CTR33
+CTR34_C1_INSOLUBLE_01,CTR,C1,CTR34
+CTR34_C2_INSOLUBLE_01,CTR,C2,CTR34
+CTR35_C1_INSOLUBLE_01,CTR,C1,CTR35
+CTR36_C1_INSOLUBLE_01,CTR,C1,CTR36
+CTR36_C1_INSOLUBLE_02,CTR,C1,CTR36
+CTR37_C1_INSOLUBLE_01,CTR,C1,CTR37
+CTR38_C1_INSOLUBLE_01,CTR,C1,CTR38
+CTR39_C1_INSOLUBLE_01,CTR,C1,CTR39
+CTR40_C1_INSOLUBLE_01,CTR,C1,CTR40
+CTR40_C1_INSOLUBLE_02,CTR,C1,CTR40
+CTR40_C1_INSOLUBLE_03,CTR,C1,CTR40
+CTR41_C1_INSOLUBLE_01,CTR,C1,CTR41
+CTR41_C1_INSOLUBLE_02,CTR,C1,CTR41
+CTR41_C1_INSOLUBLE_03,CTR,C1,CTR41
+CTR42_C1_INSOLUBLE_01,CTR,C1,CTR42
+CTR42_C1_INSOLUBLE_02,CTR,C1,CTR42
+CTR42_C1_INSOLUBLE_03,CTR,C1,CTR42
+CTR43_C2_INSOLUBLE_01,CTR,C2,CTR43
+CTR44_C1_INSOLUBLE_01,CTR,C1,CTR44

From 01e9d5f51423bf368fe7c7857a818330f955df95 Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 4 Mar 2025 19:17:46 +0100
Subject: [PATCH 34/36] merge bachelor-thesis-selena into dev

---
 .pre-commit-config.yaml | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a62c2094..885f0833 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1 +1,27 @@
-repos: []
\ No newline at end of file
+repos:
+  - repo: local
+    hooks:
+      - id: autoflake
+        name: Remove unused variables and imports
+        entry: bash -c 'autoflake "$@"; git add -u' --
+        language: python
+        args:
+          [
+            "--in-place",
+            "--remove-all-unused-imports",
+            "--remove-unused-variables",
+            "--expand-star-imports",
+            "--ignore-init-module-imports",
+          ]
+        files: \.py$
+      - id: isort
+        name: Sorting import statements
+        entry: bash -c 'isort "$@"; git add -u' --
+        language: python
+        args: ["--filter-files"]
+        files: \.py$
+      - id: black
+        name: Black Python code formatting
+        entry: bash -c 'black "$@"; git add -u' --
+        language: python
+        types: [python]
\ No newline at end of file

From 412dfd1e4284f309784683aa9a605e7bc643037e Mon Sep 17 00:00:00 2001
From: selenabr <selena.braune@student.hpi.de>
Date: Tue, 4 Mar 2025 19:52:21 +0100
Subject: [PATCH 35/36] fixed error in power_analysis.py (constants.colors) and
 commented out file test_power_analysis.py

---
 protzilla/data_analysis/power_analysis.py            | 2 +-
 tests/protzilla/data_analysis/test_power_analysis.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/protzilla/data_analysis/power_analysis.py b/protzilla/data_analysis/power_analysis.py
index 932d49c2..a2192c2a 100644
--- a/protzilla/data_analysis/power_analysis.py
+++ b/protzilla/data_analysis/power_analysis.py
@@ -7,7 +7,7 @@
 import plotly.graph_objs as go
 import protzilla.constants.colors as colorscheme
 
-from ..constants.colors import PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE
+from ..constants.colors import PLOT_COLOR_SEQUENCE
 from protzilla.utilities import default_intensity_column
 
 
diff --git a/tests/protzilla/data_analysis/test_power_analysis.py b/tests/protzilla/data_analysis/test_power_analysis.py
index 76cd1b82..8d551610 100644
--- a/tests/protzilla/data_analysis/test_power_analysis.py
+++ b/tests/protzilla/data_analysis/test_power_analysis.py
@@ -1,4 +1,4 @@
-import numpy as np
+"""import numpy as np
 import pandas as pd
 import pytest
 import math
@@ -341,3 +341,4 @@ def test_replicate_paper_sample_size_calculation(power_test_data):
     print(correlationmatrix)
 
     return dict(required_sample_size=required_sample_size)
+"""
\ No newline at end of file

From 7b6c1596d29efd055323ba7240dff4edcc346678 Mon Sep 17 00:00:00 2001
From: Jonas Krohn <jonas.krohn@gmx.de>
Date: Thu, 6 Mar 2025 23:59:18 +0100
Subject: [PATCH 36/36] changed steps to new format

---
 protzilla/methods/data_analysis.py            | 91 +++----------------
 .../workflows/overhaul.yaml:Zone.Identifier   |  3 -
 2 files changed, 14 insertions(+), 80 deletions(-)
 delete mode 100644 user_data/workflows/overhaul.yaml:Zone.Identifier

diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 8d0e37e7..13d01fd2 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -49,15 +49,6 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
-class PlotStep(DataAnalysisStep):
-    step = "plot"
-
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
-        plots = self.output.output.pop("plots", [])
-        self.plots = Plots(plots)
-
-
 class DifferentialExpressionANOVA(DataAnalysisStep):
     display_name = "ANOVA"
     operation = "differential_expression"
@@ -666,21 +657,9 @@ class PowerAnalysisPowerCalculation(DataAnalysisStep):
     operation = "Power Analysis"
     method_description = "Calculates power of the test for given protein groups"
 
-    input_keys = [
-        "significant_proteins_df",
-        "differentially_expressed_proteins_df",
-        "selected_protein_group",
-        "fc_threshold",
-        "alpha",
-        "group1",
-        "group2",
-        "individual_column",
-        "metadata_df",
-    ]
     output_keys = ["power"]
 
-    def method(self, inputs: dict) -> dict:
-        return power_calculation(**inputs)
+    calc_method = staticmethod(power_calculation)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
@@ -698,8 +677,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["group2"] = step.inputs["group2"]
         return inputs
 
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
+    def handle_calc_outputs(self, outputs: dict):
+        super().handle_calc_outputs(outputs)
         self.display_output["power"] = f"Power of the test: {outputs['power']}"
 
 
@@ -708,25 +687,12 @@ class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
     operation = "Power Analysis"
     method_description = "Calculates sample size for given protein groups"
 
-    input_keys = [
-        "differentially_expressed_proteins_df",
-        "selected_protein_group",
-        "significant_proteins_df",
-        "fc_threshold",
-        "alpha",
-        "group1",
-        "group2",
-        "power",
-        "individual_column",
-        "metadata_df",
-    ]
     output_keys = [
         "required_sample_size",
         "variance_protein_group",  # TODO: remove this line before merging into main
     ]
 
-    def method(self, inputs: dict) -> dict:
-        return sample_size_calculation(**inputs)
+    calc_method = staticmethod(sample_size_calculation)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
@@ -744,32 +710,18 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["group2"] = step.inputs["group2"]
         return inputs
 
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
+    def handle_calc_outputs(self, outputs: dict):
+        super().handle_calc_outputs(outputs)
         self.display_output[
             "required_sample_size"
         ] = f"Required Sample Size: {outputs['required_sample_size']}"
 
 
-class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
+class PowerAnalysisSampleSizeCalculationForAllProteins(DataAnalysisStep):
     display_name = "Sample Size Calculation for All Proteins"
     operation = "Power Analysis"
     method_description = "Calculates sample size for a selected group of proteins and returns the maximum required sample size."
 
-    input_keys = [
-        "differentially_expressed_proteins_df",
-        "significant_proteins_df",
-        "significant_proteins_only",
-        "fc_threshold",
-        "alpha",
-        "group1",
-        "group2",
-        "power",
-        "individual_column",
-        "metadata_df",
-        "select_all_proteins",
-        "selected_protein_groups",
-    ]
     output_keys = [
         "required_sample_size_for_all_proteins",
         "differentially_expressed_proteins_df",
@@ -777,8 +729,7 @@ class PowerAnalysisSampleSizeCalculationForAllProteins(PlotStep):
         "significant_proteins_df",
     ]
 
-    def method(self, inputs: dict) -> dict:
-        return sample_size_calculation_for_all_proteins(**inputs)
+    plot_method = staticmethod(sample_size_calculation_for_all_proteins)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
@@ -796,31 +747,18 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["group2"] = step.inputs["group2"]
         return inputs
 
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
+    def handle_plot_outputs(self, outputs: dict):
+        super().handle_plot_outputs(outputs)
         self.display_output[
             "required_sample_size_for_all_proteins"
         ] = f"Required Sample Size for all Proteins: {outputs['required_sample_size_for_all_proteins']}"
 
 
-class PowerAnalysisPowerCalculationForAllProteins(PlotStep):
+class PowerAnalysisPowerCalculationForAllProteins(DataAnalysisStep):
     display_name = "Power Calculation for All Proteins"
     operation = "Power Analysis"
     method_description = "Calculates power for a selected group of proteins and returns the minimum power."
 
-    input_keys = [
-        "differentially_expressed_proteins_df",
-        "significant_proteins_df",
-        "significant_proteins_only",
-        "fc_threshold",
-        "alpha",
-        "group1",
-        "group2",
-        "individual_column",
-        "metadata_df",
-        "select_all_proteins",
-        "selected_protein_groups",
-    ]
     output_keys = [
         "power_for_all_proteins",
         "differentially_expressed_proteins_df",
@@ -828,8 +766,7 @@ class PowerAnalysisPowerCalculationForAllProteins(PlotStep):
         "significant_proteins_df",
     ]
 
-    def method(self, inputs: dict) -> dict:
-        return power_calculation_for_all_proteins(**inputs)
+    plot_method = staticmethod(power_calculation_for_all_proteins)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
@@ -847,8 +784,8 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["group2"] = step.inputs["group2"]
         return inputs
 
-    def handle_outputs(self, outputs: dict):
-        super().handle_outputs(outputs)
+    def handle_plot_outputs(self, outputs: dict):
+        super().handle_plot_outputs(outputs)
         self.display_output[
             "power_for_all_proteins"
         ] = f"Power for all Proteins: {outputs['power_for_all_proteins']}"
diff --git a/user_data/workflows/overhaul.yaml:Zone.Identifier b/user_data/workflows/overhaul.yaml:Zone.Identifier
deleted file mode 100644
index 71c6e851..00000000
--- a/user_data/workflows/overhaul.yaml:Zone.Identifier
+++ /dev/null
@@ -1,3 +0,0 @@
-[ZoneTransfer]
-ZoneId=3
-HostUrl=https://files.slack.com/files-pri/T055BG3H51R-F06U5LX84NS/download/overhaul.yaml?origin_team=E055BG3H51R