Skip to content
Draft
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
73e13a2
added sample size calculation in methods\data_analysis.py and forms\d…
selenabr Jun 5, 2024
f133c87
enabled possibility to choose one protein for calculation dependent o…
selenabr Jun 17, 2024
49c7f0e
fixed errors with missing inputs
selenabr Jun 18, 2024
6d8c9a8
added variance calculation and testing function and edited sample siz…
selenabr Jun 18, 2024
0b95cf0
fixed some errors
selenabr Jun 19, 2024
22c293d
output field for result
selenabr Jun 20, 2024
fd756df
Merge branch 'dev' into bachelor-thesis-selena
selenabr Jun 20, 2024
b22b6e7
further implementation of output field for result
selenabr Jun 21, 2024
c6a2f3b
display display_output in output field
selenabr Jun 23, 2024
032286c
display_output field displayed in the same size and position as the o…
selenabr Jun 25, 2024
e90fab3
test function for sample_size_calculation
selenabr Jun 25, 2024
01eba42
Merge branch 'dev' into bachelor-thesis-selena
selenabr Jun 25, 2024
d3cf9d8
edited description of function
selenabr Jun 26, 2024
3ce4ae1
check if implemented function of Paper (Cairns et al., 2009) and libr…
selenabr Jul 8, 2024
f78b0b9
power calculation and test of library-function and implemented paper-…
selenabr Jul 8, 2024
e3dd1c3
added test for power_calculation method
selenabr Aug 20, 2024
2e3de5a
fixed constructor error
selenabr Aug 21, 2024
a46a074
sample size calculation for different group sizes (Cohen 1988) and mo…
selenabr Aug 23, 2024
3446be3
code formatting, resolved comments (output not a float, significant_p…
selenabr Aug 26, 2024
cb25777
feature: user can choose whether metadata contains a column for indiv…
selenabr Aug 28, 2024
52ef105
adapted test for power_calculation and sample_size_calculation and ch…
selenabr Aug 28, 2024
ac9e783
added function that calculates sample size for all proteins and shows…
selenabr Sep 3, 2024
e54c767
formatting
selenabr Sep 3, 2024
2faa972
commented the dataframe-output-stuff out, otherwise violin plot could…
selenabr Sep 3, 2024
25cf2b2
changed color of violinplot and added axis-description
selenabr Sep 3, 2024
ae4e8cb
changed color of violinplot and removed axis-description
selenabr Sep 5, 2024
5c63008
resolved comments
selenabr Sep 5, 2024
0adc15c
Added function to get dataframes with sample size column as output
selenabr Sep 5, 2024
dcba877
Added power_calculation_for_all_proteins to calculate minimum power f…
selenabr Sep 6, 2024
eb32984
Fixed hover display of violin plots
selenabr Sep 8, 2024
1adda1b
fixed typo and removed unnecessary comment
selenabr Oct 8, 2024
d0ec174
calculations for thesis (should be removed before merging into dev)
selenabr Oct 29, 2024
776dc55
calculations for thesis (should be removed before merging into dev)
selenabr Nov 4, 2024
7131d3b
put calculation for thesis into comment and changed description of me…
selenabr Nov 18, 2024
6e2daa3
Add files via upload
selenabr Nov 18, 2024
6778796
Merge branch 'dev' into bachelor-thesis-selena
selenabr Mar 4, 2025
01e9d5f
merge bachelor-thesis-selena into dev
selenabr Mar 4, 2025
412dfd1
fixed error in power_analysis.py (constants.color) and commented file…
selenabr Mar 4, 2025
7b6c159
changed steps to new format
Jonas0000 Mar 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions protzilla/data_analysis/power_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import logging

import numpy as np
import pandas as pd
import math
from scipy import stats
from statsmodels.stats.power import TTestIndPower


def variance_protein_group_calculation(
    intensity_df: pd.DataFrame,
    protein_id: str,
    group1: str,
    group2: str,
    intensity_name: str = None,
) -> float:
    """
    Calculate the sample variance of one protein group within each of two groups
    and return the larger of the two.

    :param intensity_df: The dataframe containing the protein group intensities.
        The columns "Protein ID", "Group" and the intensity column are read.
    :param protein_id: The protein ID.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param intensity_name: The name of the column containing the protein group
        intensities. Falls back to "Normalised iBAQ" when None.
    :return: The larger of the two per-group sample variances (ddof=1).
    :raises KeyError: If the intensity column does not exist in intensity_df.
    :raises ValueError: If one of the groups holds fewer than two intensities
        for the protein, so that no sample variance can be estimated.
    """
    # NOTE(review): reviewers pointed out that the "Normalised iBAQ" fallback
    # silently assumes the data was normalised before this step (otherwise the
    # column does not exist), and asked whether the fallback could simply be the
    # default argument instead of None. None is kept here because callers pass
    # None through explicitly; the missing column is reported explicitly below.
    if intensity_name is None:
        intensity_name = "Normalised iBAQ"

    if intensity_name not in intensity_df.columns:
        raise KeyError(
            f"Intensity column '{intensity_name}' not found in the dataframe; "
            f"available columns: {list(intensity_df.columns)}"
        )

    protein_group = intensity_df[intensity_df["Protein ID"] == protein_id]

    variances = []
    for group in (group1, group2):
        intensities = protein_group.loc[
            protein_group["Group"] == group, intensity_name
        ].to_numpy()
        # With ddof=1 fewer than two values would yield NaN (division by zero).
        if intensities.size < 2:
            raise ValueError(
                f"Protein group '{protein_id}' needs at least two intensities in "
                f"group '{group}' to estimate a variance, found {intensities.size}."
            )
        variances.append(np.var(intensities, ddof=1))

    # np.maximum propagates NaN regardless of argument order, whereas the builtin
    # max() returns an order-dependent result when one operand is NaN.
    return np.maximum(variances[0], variances[1])

def sample_size_calculation(
    differentially_expressed_proteins_df: pd.DataFrame,
    significant_proteins_df: pd.DataFrame,
    significant_proteins_only: bool,
    fc_threshold: float,
    alpha: float,
    power: float,
    group1: str,
    group2: str,
    selected_protein_group: str,
    intensity_name: str = None,
) -> dict:
    """
    Calculate the required sample size for a selected protein group to achieve the
    requested power.

    The result is ceil(2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance), where
    z_alpha is the standard normal quantile at 1 - alpha / 2, z_beta the quantile at
    power, and variance the larger per-group variance of the selected protein group.

    :param differentially_expressed_proteins_df: The dataframe containing the differentially
        expressed proteins from t-test output. The variance is computed from this dataframe.
    :param significant_proteins_df: The dataframe containing the significant proteins from
        t-test output.
    :param significant_proteins_only: A boolean to display only significant proteins for
        selection to the user. It is not read by the calculation itself.
    :param fc_threshold: The fold change threshold. Must not be 0.
    :param alpha: The significance level, strictly between 0 and 1. The value for alpha is
        taken from the t-test by default.
    :param power: The power of the test, strictly between 0 and 1.
    :param group1: The name of the first group.
    :param group2: The name of the second group.
    :param selected_protein_group: The selected protein group for which the required sample
        size is to be calculated.
    :param intensity_name: The name of the column containing the protein group intensities.
    :return: A dict with the single key "required_sample_size" holding the required sample
        size as an integer.
    :raises ValueError: If the protein group is in neither dataframe, or if alpha, power or
        fc_threshold lie outside their valid ranges.
    """
    is_known_protein_group = (
        selected_protein_group in significant_proteins_df["Protein ID"].values
        or selected_protein_group
        in differentially_expressed_proteins_df["Protein ID"].values
    )
    if not is_known_protein_group:
        raise ValueError("Please select a valid protein group.")

    # Reject parameters that would otherwise turn into inf/NaN quantiles and only
    # surface later as an obscure math.ceil error.
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha}.")
    if not 0 < power < 1:
        raise ValueError(f"power must lie strictly between 0 and 1, got {power}.")
    if fc_threshold == 0:
        raise ValueError("fc_threshold must not be 0.")

    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    variance_protein_group = variance_protein_group_calculation(
        intensity_df=differentially_expressed_proteins_df,
        protein_id=selected_protein_group,
        group1=group1,
        group2=group2,
        intensity_name=intensity_name,
    )

    required_sample_size = math.ceil(
        2 * ((z_alpha + z_beta) / fc_threshold) ** 2 * variance_protein_group
    )

    return dict(required_sample_size=required_sample_size)



55 changes: 53 additions & 2 deletions protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
prot_quant_plot,
scatter_plot,
)
from protzilla.data_analysis.power_analysis import sample_size_calculation
from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
from protzilla.methods.data_preprocessing import TransformationLog
from protzilla.steps import Plots, Step, StepManager
from protzilla.steps import Plots, Step, StepManager, DisplayOutput


class DataAnalysisStep(Step):
Expand Down Expand Up @@ -759,4 +760,54 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
inputs["peptide_df"] = steps.get_step_output(
Step, "peptide_df", inputs["peptide_df"]
)
return inputs
return inputs
class PowerAnalysisPowerCalculation(DataAnalysisStep):
    """
    Declaration of the "Power Calculation" step of the "Power Analysis" operation.

    NOTE(review): this class only declares metadata and input keys; it defines no
    method, insert_dataframes or output_keys of its own, so anything else comes
    from DataAnalysisStep. Confirm that this is intended.
    """

    display_name = "Power Calculation"
    operation = "Power Analysis"
    method_description = "post-hoc Power Calculation"

    # The only input key this step declares.
    input_keys = ["significant_proteins_df"]

class PowerAnalysisSampleSizeCalculation(DataAnalysisStep):
    """
    Step that runs sample_size_calculation and exposes the result as display output.

    The dataframes as well as alpha, group1 and group2 are taken from the step that is
    referenced by the "input_dict" entry of the inputs.
    """

    display_name = "Sample Size Calculation"
    operation = "Power Analysis"
    method_description = "Calculates sample size for protein groups"

    input_keys = [
        "differentially_expressed_proteins_df",
        "selected_protein_group",
        "significant_proteins_df",
        "significant_proteins_only",
        "fc_threshold",
        "alpha",
        "group1",
        "group2",
        "power",
    ]
    output_keys = ["required_sample_size"]

    def method(self, inputs: dict) -> dict:
        """Forward all inputs as keyword arguments to sample_size_calculation."""
        return sample_size_calculation(**inputs)

    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
        """
        Fill the inputs with the outputs and parameters of the referenced step.

        :param steps: The step manager used to look up step outputs and steps.
        :param inputs: The inputs of this step; must contain "input_dict", the
            instance identifier of the step to read from.
        :return: The same inputs dict, extended in place.
        """
        source_identifier = inputs["input_dict"]

        inputs["differentially_expressed_proteins_df"] = steps.get_step_output(
            Step, "differentially_expressed_proteins_df", source_identifier
        )
        # next() without a default: raises StopIteration when no step matches.
        source_step = next(
            candidate
            for candidate in steps.all_steps
            if candidate.instance_identifier == source_identifier
        )
        inputs["significant_proteins_df"] = steps.get_step_output(
            Step, "significant_proteins_df", source_identifier
        )

        # Reuse the parameters the referenced step was run with.
        for key in ("alpha", "group1", "group2"):
            inputs[key] = source_step.inputs[key]
        return inputs

    def handle_outputs(self, outputs: dict):
        """Run the inherited output handling, then store a readable result text."""
        super().handle_outputs(outputs)
        sample_size = outputs["required_sample_size"]
        self.display_output["required_sample_size"] = (
            f"Required Sample Size: {sample_size}"
        )
22 changes: 22 additions & 0 deletions protzilla/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None):
self.messages: Messages = Messages([])
self.output: Output = Output()
self.plots: Plots = Plots()
self.display_output: DisplayOutput = DisplayOutput()
self.instance_identifier = instance_identifier

if self.instance_identifier is None:
Expand Down Expand Up @@ -310,6 +311,27 @@ def export(self, format_):
exports.append(BytesIO(base64.b64decode(plot)))
return exports

class DisplayOutput:
    """
    Small dict-backed container for values a step wants to display.

    Iteration, membership tests and item access are delegated to the wrapped dict,
    which is stored in the display_output attribute.
    """

    def __init__(self, display_output: dict = None):
        # A fresh dict per instance when nothing is passed; a passed dict is kept
        # as-is (not copied).
        self.display_output = {} if display_output is None else display_output

    def __iter__(self):
        """Iterate over the keys of the wrapped dict."""
        return iter(self.display_output)

    def __repr__(self):
        return f"DisplayOutput: {self.display_output}"

    def __contains__(self, key):
        """Return whether key is present in the wrapped dict."""
        return key in self.display_output

    def __getitem__(self, key):
        """Return the value stored under key."""
        return self.display_output[key]

    def __setitem__(self, key, value):
        """Store value under key in the wrapped dict."""
        self.display_output[key] = value

    def is_empty(self) -> bool:
        """Return True when nothing has been stored."""
        return not self.display_output




class StepManager:
def __repr__(self):
Expand Down
90 changes: 90 additions & 0 deletions tests/protzilla/data_analysis/test_power_analysis.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these tests commented out?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, the first tests up to line 202 shouldn't be commented out. They tested the new methods on the old branch, and they worked. I think I commented them out because the methods didn't work on the dev branch due to the new changes...

Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import numpy as np
import pandas as pd
import pytest


from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation


@pytest.fixture
def power_test_data():
    """
    Build a long-format intensity dataframe with six samples and four proteins.

    Samples 1-3 belong to "Group1", samples 4-6 to "Group2"; every row carries the
    gene "Gene1". Columns: "Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group".
    """
    # sample -> (group, intensities of Protein1..Protein4)
    intensities_by_sample = {
        "Sample1": ("Group1", (20, 16, 1, 14)),
        "Sample2": ("Group1", (20, 15, 2, 15)),
        "Sample3": ("Group1", (22, 14, 3, 16)),
        "Sample4": ("Group2", (8, 15, 1, 9)),
        "Sample5": ("Group2", (10, 14, 2, 10)),
        "Sample6": ("Group2", (12, 13, 3, 11)),
    }

    rows = [
        [sample, f"Protein{protein_number}", "Gene1", intensity, group]
        for sample, (group, intensities) in intensities_by_sample.items()
        for protein_number, intensity in enumerate(intensities, start=1)
    ]

    return pd.DataFrame(
        data=rows,
        columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"],
    )


def test_variance_protein_group_calculation(power_test_data):
    """The larger of the two per-group sample variances of Protein1 is returned."""
    variance = variance_protein_group_calculation(
        intensity_df=power_test_data,
        protein_id="Protein1",
        group1="Group1",
        group2="Group2",
    )

    # Group1 (20, 20, 22) has sample variance 4/3, Group2 (8, 10, 12) has 4.0.
    # Compare with a tolerance instead of exact float equality.
    assert variance == pytest.approx(4.0)

def test_sample_size_calculation(power_test_data):
    """Protein1 needs 63 samples for alpha=0.05, power=0.8 and a fold change threshold of 1."""
    result = sample_size_calculation(
        differentially_expressed_proteins_df=power_test_data,
        significant_proteins_df=power_test_data,
        significant_proteins_only=False,
        fc_threshold=1,
        alpha=0.05,
        power=0.8,
        group1="Group1",
        group2="Group2",
        selected_protein_group="Protein1",
        intensity_name=None,
    )

    # Assert the whole dict so that the output key is pinned as well as the value;
    # ceil(2 * (1.95996 + 0.84162) ** 2 * 4.0) == 63.
    assert result == {"required_sample_size": 63}






2 changes: 2 additions & 0 deletions ui/runs/form_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
data_analysis.DimensionReductionUMAP: data_analysis_forms.DimensionReductionUMAPForm,
data_analysis.ProteinGraphPeptidesToIsoform: data_analysis_forms.ProteinGraphPeptidesToIsoformForm,
data_analysis.ProteinGraphVariationGraph: data_analysis_forms.ProteinGraphVariationGraphForm,
data_analysis.PowerAnalysisPowerCalculation: data_analysis_forms.PowerAnalysisPowerCalculationForm,
data_analysis.PowerAnalysisSampleSizeCalculation: data_analysis_forms.PowerAnalysisSampleSizeCalculationForm,
data_analysis.SelectPeptidesForProtein: data_analysis_forms.SelectPeptidesForProteinForm,
data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
Expand Down
Loading