-
Notifications
You must be signed in to change notification settings - Fork 1
Sample size calculation #519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from 15 commits
73e13a2
f133c87
49c7f0e
6d8c9a8
0b95cf0
22c293d
fd756df
b22b6e7
c6a2f3b
032286c
e90fab3
01eba42
d3cf9d8
3ce4ae1
f78b0b9
e3dd1c3
2e3de5a
a46a074
3446be3
cb25777
52ef105
ac9e783
e54c767
2faa972
25cf2b2
ae4e8cb
5c63008
0adc15c
dcba877
eb32984
1adda1b
d0ec174
776dc55
7131d3b
6e2daa3
6778796
01e9d5f
412dfd1
7b6c159
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,288 @@ | ||||||
| import logging | ||||||
|
|
||||||
| import numpy as np | ||||||
| import pandas as pd | ||||||
| import math | ||||||
| from scipy import stats | ||||||
| from statsmodels.stats.power import TTestIndPower | ||||||
|
|
||||||
|
|
||||||
| def variance_protein_group_calculation_max( | ||||||
| intensity_df: pd.DataFrame, | ||||||
| protein_id: str, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| intensity_name: str = None, | ||||||
| ) -> float: | ||||||
| """ | ||||||
| Function to calculate the variance of a protein group for the two classes and return the maximum variance. | ||||||
|
|
||||||
| :param intensity_df: The dataframe containing the protein group intensities. | ||||||
| :param protein_id: The protein ID. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The variance of the protein group. | ||||||
| """ | ||||||
|
|
||||||
| if intensity_name is None: | ||||||
| intensity_name = "Normalised iBAQ" | ||||||
|
||||||
| protein_group = intensity_df[intensity_df["Protein ID"] == protein_id] | ||||||
|
|
||||||
| group1_intensities = protein_group[protein_group["Group"] == group1][intensity_name].values | ||||||
| group2_intensities = protein_group[protein_group["Group"] == group2][intensity_name].values | ||||||
|
|
||||||
| variance_group1 = np.var(group1_intensities, ddof=1) | ||||||
| variance_group2 = np.var(group2_intensities, ddof=1) | ||||||
|
|
||||||
| max_variance = max(variance_group1, variance_group2) | ||||||
|
|
||||||
| return max_variance | ||||||
|
|
||||||
| def sample_size_calculation( | ||||||
| differentially_expressed_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_only: bool, | ||||||
selenabr marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| fc_threshold: float, | ||||||
| alpha: float, | ||||||
| power: float, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| selected_protein_group: str, | ||||||
| intensity_name: str = None | ||||||
| ) -> float: | ||||||
selenabr marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| """ | ||||||
| Function to calculate the required sample size for a selected protein to achieve the required power . | ||||||
|
|
||||||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||||||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||||||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||||||
| :param fc_threshold: The fold change threshold. | ||||||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||||||
| :param power: The power of the test. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The required sample size. | ||||||
| """ | ||||||
|
|
||||||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||||||
| raise ValueError("Please select a valid protein group.") | ||||||
| protein_group = selected_protein_group | ||||||
| z_alpha = stats.norm.ppf(1 - alpha / 2) | ||||||
| z_beta = stats.norm.ppf(power) | ||||||
|
|
||||||
| variance_protein_group = variance_protein_group_calculation_max( | ||||||
| intensity_df=differentially_expressed_proteins_df, | ||||||
| protein_id=protein_group, | ||||||
| group1=group1, | ||||||
| group2=group2, | ||||||
| intensity_name=intensity_name, | ||||||
| ) | ||||||
|
|
||||||
| required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * variance_protein_group) | ||||||
| required_sample_size = math.ceil(required_sample_size) | ||||||
| print(required_sample_size) | ||||||
|
|
||||||
| return dict(required_sample_size=required_sample_size) | ||||||
|
|
||||||
| def check_sample_size_calculation_with_libfunc( | ||||||
| differentially_expressed_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_only: bool, | ||||||
| fc_threshold: float, | ||||||
| alpha: float, | ||||||
| power: float, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| selected_protein_group: str, | ||||||
| intensity_name: str = None | ||||||
| ) -> float: | ||||||
| """ | ||||||
| Function to calculate the required sample size for a selected protein to achieve the required power . | ||||||
|
|
||||||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||||||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||||||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||||||
| :param fc_threshold: The fold change threshold. | ||||||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||||||
| :param power: The power of the test. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The required sample size. | ||||||
| """ | ||||||
|
|
||||||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||||||
| raise ValueError("Please select a valid protein group.") | ||||||
|
|
||||||
| protein_group = differentially_expressed_proteins_df[differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] | ||||||
|
|
||||||
| group1_intensities = np.log2(protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values) | ||||||
| group2_intensities = np.log2(protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values) | ||||||
| variance_group1 = np.var(group1_intensities, ddof=1) | ||||||
| variance_group2 = np.var(group2_intensities, ddof=1) | ||||||
|
|
||||||
| sd_pooled = math.sqrt((variance_group1 + variance_group2) / 2) | ||||||
| mean_diff = abs(group1_intensities.mean() - group2_intensities.mean()) | ||||||
| effect_size = (group1_intensities.mean() - group2_intensities.mean())/sd_pooled | ||||||
|
|
||||||
| obj = TTestIndPower() | ||||||
| required_sample_size = obj.solve_power( | ||||||
| effect_size=effect_size, | ||||||
| alpha=alpha, | ||||||
| power=power, nobs1=None, ratio=1.0, alternative='two-sided') | ||||||
| print(required_sample_size) | ||||||
|
|
||||||
| required_sample_size = math.ceil(required_sample_size) | ||||||
|
|
||||||
| return dict(required_sample_size=required_sample_size) | ||||||
| #required_sample_size = 2.27; pooled_sd = 0.23; effect_size = 4.39, mean_diff = 1.014 | ||||||
|
|
||||||
| #impl: required_sample_size = 0.814; fc_threshold = 1.014; variance = 0.0534 | ||||||
| def check_sample_size_calculation_implemented( | ||||||
| differentially_expressed_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_only: bool, | ||||||
| fc_threshold: float, | ||||||
| alpha: float, | ||||||
| power: float, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| selected_protein_group: str, | ||||||
| intensity_name: str = None | ||||||
| ) -> float: | ||||||
| """ | ||||||
| Function to calculate the required sample size for a selected protein to achieve the required power . | ||||||
|
|
||||||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||||||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||||||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||||||
| :param fc_threshold: The fold change threshold. | ||||||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||||||
| :param power: The power of the test. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The required sample size. | ||||||
| """ | ||||||
|
|
||||||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||||||
| raise ValueError("Please select a valid protein group.") | ||||||
|
|
||||||
| z_alpha = stats.norm.ppf(1 - alpha / 2) | ||||||
| z_beta = stats.norm.ppf(power) | ||||||
| protein_group = differentially_expressed_proteins_df[ | ||||||
| differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] | ||||||
|
|
||||||
| group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values | ||||||
| group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values | ||||||
| fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) | ||||||
| variance_group1 = np.var(group1_intensities, ddof=1) | ||||||
| variance_group2 = np.var(group2_intensities, ddof=1) | ||||||
|
|
||||||
| pooled_variance = (variance_group1 + variance_group2) / 2 | ||||||
| required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) | ||||||
| required_sample_size = math.ceil(required_sample_size) | ||||||
| print(required_sample_size) | ||||||
|
|
||||||
| return dict(required_sample_size=required_sample_size) | ||||||
|
|
||||||
| def check_sample_size_calculation_implemented_without_log( | ||||||
| differentially_expressed_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_only: bool, | ||||||
| fc_threshold: float, | ||||||
| alpha: float, | ||||||
| power: float, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| selected_protein_group: str, | ||||||
| intensity_name: str = None | ||||||
| ) -> float: | ||||||
| """ | ||||||
| Function to calculate the required sample size for a selected protein to achieve the required power . | ||||||
|
|
||||||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||||||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||||||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||||||
| :param fc_threshold: The fold change threshold. | ||||||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||||||
| :param power: The power of the test. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param selected_protein_group: The selected protein group for which the required sample size is to be calculated. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The required sample size. | ||||||
| """ | ||||||
|
|
||||||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||||||
| raise ValueError("Please select a valid protein group.") | ||||||
|
|
||||||
| z_alpha = stats.norm.ppf(1 - alpha / 2) | ||||||
| z_beta = stats.norm.ppf(power) | ||||||
| protein_group = differentially_expressed_proteins_df[ | ||||||
| differentially_expressed_proteins_df["Protein ID"] == selected_protein_group] | ||||||
|
|
||||||
| group1_intensities = protein_group[protein_group["Group"] == group1]["Normalised iBAQ"].values | ||||||
| group2_intensities = protein_group[protein_group["Group"] == group2]["Normalised iBAQ"].values | ||||||
| fc_threshold = abs(group1_intensities.mean() - group2_intensities.mean()) | ||||||
| variance_group1 = np.var(group1_intensities, ddof=1) | ||||||
| variance_group2 = np.var(group2_intensities, ddof=1) | ||||||
|
|
||||||
| pooled_variance = (variance_group1 + variance_group2) / 2 | ||||||
| required_sample_size = (2 * ((z_alpha + z_beta)/ fc_threshold) ** 2 * pooled_variance) | ||||||
| required_sample_size = math.ceil(required_sample_size) | ||||||
| print(required_sample_size) | ||||||
|
|
||||||
| return dict(required_sample_size=required_sample_size) | ||||||
|
|
||||||
| def power_calculation( | ||||||
| differentially_expressed_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_df: pd.DataFrame, | ||||||
| significant_proteins_only: bool, | ||||||
| alpha: float, | ||||||
| fc_threshold: float, | ||||||
| group1: str, | ||||||
| group2: str, | ||||||
| selected_protein_group: str, | ||||||
| intensity_name: str = None | ||||||
| ) -> float: | ||||||
|
|
||||||
| """ | ||||||
| Function to calculate the power of the t-test for a selected protein group. | ||||||
|
|
||||||
| :param differentially_expressed_proteins_df: The dataframe containing the differentially expressed proteins from t-test output. | ||||||
| :param significant_proteins_df: The dataframe containing the significant proteins from t-test output. | ||||||
| :param significant_proteins_only: A boolean to display only significant proteins for selection to the user. | ||||||
| :param alpha: The significance level. The value for alpha is taken from the t-test by default. | ||||||
| :param fc_threshold: The fold change threshold. | ||||||
| :param group1: The name of the first group. | ||||||
| :param group2: The name of the second group. | ||||||
| :param selected_protein_group: The selected protein group for which the power is to be calculated. | ||||||
| :param intensity_name: The name of the column containing the protein group intensities. | ||||||
| :return: The power of the test. | ||||||
| """ | ||||||
| if selected_protein_group not in significant_proteins_df['Protein ID'].values and selected_protein_group not in differentially_expressed_proteins_df['Protein ID'].values: | ||||||
| raise ValueError("Please select a valid protein group.") | ||||||
| protein_group = selected_protein_group | ||||||
| z_alpha = stats.norm.ppf(1 - alpha / 2) | ||||||
|
|
||||||
| variance_protein_group = variance_protein_group_calculation_max( | ||||||
| intensity_df=differentially_expressed_proteins_df, | ||||||
| protein_id=protein_group, | ||||||
| group1=group1, | ||||||
| group2=group2, | ||||||
| intensity_name=intensity_name, | ||||||
| ) | ||||||
| sample_size = differentially_expressed_proteins_df.groupby('Group')['Sample'].count() | ||||||
| z_beta = fc_threshold * np.sqrt(sample_size/(2*variance_protein_group**2))-z_alpha | ||||||
|
||||||
| z_beta = fc_threshold * np.sqrt(sample_size/(2*variance_protein_group**2))-z_alpha | |
| z_beta = fc_threshold * np.sqrt(sample_size / (2 * variance_protein_group)) - z_alpha |
I think the square is too much since we are already dealing with variances and not standard deviations. (Also some minor formatting issues)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Assumes that data has to be normalized before feeding into the step. Otherwise the column doesn't exist. I would say that is an unnecessary limitation that is not transparent to the user