-
Notifications
You must be signed in to change notification settings - Fork 1
Sample size calculation #519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from 13 commits
73e13a2
f133c87
49c7f0e
6d8c9a8
0b95cf0
22c293d
fd756df
b22b6e7
c6a2f3b
032286c
e90fab3
01eba42
d3cf9d8
3ce4ae1
f78b0b9
e3dd1c3
2e3de5a
a46a074
3446be3
cb25777
52ef105
ac9e783
e54c767
2faa972
25cf2b2
ae4e8cb
5c63008
0adc15c
dcba877
eb32984
1adda1b
d0ec174
776dc55
7131d3b
6e2daa3
6778796
01e9d5f
412dfd1
7b6c159
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| import logging | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import math | ||
| from scipy import stats | ||
| from statsmodels.stats.power import TTestIndPower | ||
|
|
||
|
|
||
| def variance_protein_group_calculation( | ||
| intensity_df: pd.DataFrame, | ||
| protein_id: str, | ||
| group1: str, | ||
| group2: str, | ||
| intensity_name: str = None, | ||
| ) -> float: | ||
| """ | ||
| Function to calculate the variance of a protein group for the two classes and return the maximum variance. | ||
|
|
||
| :param intensity_df: The dataframe containing the protein group intensities. | ||
| :param protein_id: The protein ID. | ||
| :param group1: The name of the first group. | ||
| :param group2: The name of the second group. | ||
| :param intensity_name: The name of the column containing the protein group intensities. | ||
| :return: The variance of the protein group. | ||
| """ | ||
|
|
||
| if intensity_name is None: | ||
| intensity_name = "Normalised iBAQ" | ||
|
||
| protein_group = intensity_df[intensity_df["Protein ID"] == protein_id] | ||
|
|
||
| group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values | ||
| group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values | ||
|
|
||
| variance_group1 = np.var(group1_intensities, ddof=1) | ||
| variance_group2 = np.var(group2_intensities, ddof=1) | ||
|
|
||
| max_variance = max(variance_group1, variance_group2) | ||
|
|
||
| return max_variance | ||
|
|
||
| def sample_size_calculation( | ||
| differentially_expressed_proteins_df: pd.DataFrame, | ||
| significant_proteins_df: pd.DataFrame, | ||
| significant_proteins_only: bool, | ||
selenabr marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| fc_threshold: float, | ||
| alpha: float, | ||
| power: float, | ||
| group1: str, | ||
| group2: str, | ||
| selected_protein_group: str, | ||
| intensity_name: str = None | ||
| ) -> float: | ||
selenabr marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| """ | ||
| Function to calculate the required sample size for a selected protein to achieve the required power . | ||
|
|
||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||
| :param fc_threshold: The fold change threshold. | ||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||
| :param power: The power of the test. | ||
| :param group1: The name of the first group. | ||
| :param group2: The name of the second group. | ||
| :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. | ||
| :param intensity_name: The name of the column containing the protein group intensities. | ||
| :return: The required sample size. | ||
| """ | ||
|
|
||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||
| raise ValueError("Please select a valid protein group.") | ||
| protein_group = selected_protein_group | ||
| z_alpha = stats.norm.ppf(1 - alpha / 2) | ||
| z_beta = stats.norm.ppf(power) | ||
|
|
||
| variance_protein_group = variance_protein_group_calculation( | ||
| intensity_df=differentially_expressed_proteins_df, | ||
| protein_id=protein_group, | ||
| group1=group1, | ||
| group2=group2, | ||
| intensity_name=intensity_name, | ||
| ) | ||
|
|
||
| required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group) | ||
| required_sample_size = math.ceil(required_sample_size) | ||
| print(required_sample_size) | ||
|
|
||
| return dict(required_sample_size=required_sample_size) | ||
|
|
||
|
|
||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are these test commented out?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, the first tests up to line 202 shouldn't be commented out. They tested the new methods on the old branch, and they worked. I think I commented them out because the methods didn't work on the dev branch due to the new changes... |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
|
|
||
|
|
||
| from protzilla.data_analysis.power_analysis import variance_protein_group_calculation, sample_size_calculation | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def power_test_data(): | ||
| test_differentially_expressed_proteins_list = ( | ||
| ["Sample1", "Protein1", "Gene1", 20, "Group1"], | ||
| ["Sample1", "Protein2", "Gene1", 16, "Group1"], | ||
| ["Sample1", "Protein3", "Gene1", 1, "Group1"], | ||
| ["Sample1", "Protein4", "Gene1", 14, "Group1"], | ||
| ["Sample2", "Protein1", "Gene1", 20, "Group1"], | ||
| ["Sample2", "Protein2", "Gene1", 15, "Group1"], | ||
| ["Sample2", "Protein3", "Gene1", 2, "Group1"], | ||
| ["Sample2", "Protein4", "Gene1", 15, "Group1"], | ||
| ["Sample3", "Protein1", "Gene1", 22, "Group1"], | ||
| ["Sample3", "Protein2", "Gene1", 14, "Group1"], | ||
| ["Sample3", "Protein3", "Gene1", 3, "Group1"], | ||
| ["Sample3", "Protein4", "Gene1", 16, "Group1"], | ||
| ["Sample4", "Protein1", "Gene1", 8, "Group2"], | ||
| ["Sample4", "Protein2", "Gene1", 15, "Group2"], | ||
| ["Sample4", "Protein3", "Gene1", 1, "Group2"], | ||
| ["Sample4", "Protein4", "Gene1", 9, "Group2"], | ||
| ["Sample5", "Protein1", "Gene1", 10, "Group2"], | ||
| ["Sample5", "Protein2", "Gene1", 14, "Group2"], | ||
| ["Sample5", "Protein3", "Gene1", 2, "Group2"], | ||
| ["Sample5", "Protein4", "Gene1", 10, "Group2"], | ||
| ["Sample6", "Protein1", "Gene1", 12, "Group2"], | ||
| ["Sample6", "Protein2", "Gene1", 13, "Group2"], | ||
| ["Sample6", "Protein3", "Gene1", 3, "Group2"], | ||
| ["Sample6", "Protein4", "Gene1", 11, "Group2"], | ||
| ) | ||
|
|
||
| test_differentially_expressed_proteins_df = pd.DataFrame( | ||
| data=test_differentially_expressed_proteins_list, | ||
| columns=["Sample", "Protein ID", "Gene", "Normalised iBAQ", "Group"], | ||
| ) | ||
| return test_differentially_expressed_proteins_df | ||
|
|
||
|
|
||
| def test_variance_protein_group_calculation( | ||
| power_test_data | ||
| ): | ||
| intensity_df = power_test_data | ||
|
|
||
| protein_id = "Protein1" | ||
| group1 = "Group1" | ||
| group2 = "Group2" | ||
|
|
||
| variance = variance_protein_group_calculation( | ||
| intensity_df, protein_id, group1, group2 | ||
| ) | ||
| print(variance) | ||
| assert variance == 4.0 | ||
|
|
||
| def test_sample_size_calculation( | ||
| power_test_data | ||
|
|
||
| ): | ||
| test_alpha = 0.05 | ||
| test_power = 0.8 | ||
| test_fc_threshold = 1 | ||
| test_selected_protein_group = "Protein1" | ||
|
|
||
|
|
||
| required_sample_size = sample_size_calculation( | ||
| differentially_expressed_proteins_df=power_test_data, | ||
| significant_proteins_df=power_test_data, | ||
| fc_threshold=test_fc_threshold, | ||
| power=test_power, | ||
| alpha=test_alpha, | ||
| group1= "Group1", | ||
| group2= "Group2", | ||
| selected_protein_group=test_selected_protein_group, | ||
| significant_proteins_only=False, | ||
| intensity_name=None | ||
| ) | ||
| print(required_sample_size) | ||
| required_sample_size_int = next(iter(required_sample_size.values()),None) | ||
| assert required_sample_size_int == 63 | ||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Assumes that data has to be normalized before feeding into the step. Otherwise the column doesn't exist. I would say that is an unnecessary limitation that is not transparent to the user