Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 115 additions & 31 deletions protzilla/data_analysis/classification.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warum werden die Parameter alle so spezifiziert? Damit grenzt man ja die Flexibilität der Funktionen schon ein, weil man jetzt nicht mehr alle Parameter, die der Random Forest Classifier von sklearn bietet, nutzen kann.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich habe eigentlich versucht, dass keine Funktionalität oder Flexibilität verloren geht, sondern nur die **kwargs ausspezifiziert, wenn es konkrete Parameter sind, die verwendet werden, oder entfernt, wenn sie überhaupt nicht genutzt werden. Wenn ich etwas übersehen habe, dann ändere ich das gerne. Kannst du mir dafür eine konkrete Stelle nennen, an der Flexibilität verloren geht?
Zum Hintergrund, warum ich das gemacht habe: Ich fand es sehr unverständlich, welche Parameter bei den kwargs erwartet werden, und missverständlich, wenn die Funktion kwargs angenommen hat, obwohl mit diesen nichts getan wird. So wollte ich in den Funktionen klarer machen, welche eigentlich die Parameter sind, die genutzt werden.

Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ def perform_classification(
clf_parameters,
scoring,
model_selection_scoring="accuracy",
test_validate_split=None,
**parameters,
train_validate_split=None,
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
if validation_strategy == "Manual" and grid_search_method == "Manual":
X_train, X_val, y_train, y_val = perform_train_test_split(
input_df, labels_df, test_size=test_validate_split
input_df, labels_df, test_size=train_validate_split
)
model = clf.set_params(**clf_parameters)
model.fit(X_train, y_train)
Expand All @@ -54,7 +57,7 @@ def perform_classification(
return "Please select a cross validation strategy"
elif validation_strategy != "Manual" and grid_search_method == "Manual":
model = clf.set_params(**clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits,n_repeats,random_state_cv=random_state_cv, p_samples=p_samples)
scores = cross_validate(
model, input_df, labels_df, scoring=scoring, cv=cv, return_train_score=True
)
Expand All @@ -66,7 +69,7 @@ def perform_classification(
return model, model_evaluation_df
elif validation_strategy != "Manual" and grid_search_method != "Manual":
clf_parameters = create_dict_with_lists_as_values(clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits, n_repeats, random_state_cv=random_state_cv, p_samples=p_samples)
model = perform_grid_search_cv(
grid_search_method,
clf,
Expand All @@ -83,21 +86,34 @@ def perform_classification(
)
return model.best_estimator_, model_evaluation_df


def random_forest(
input_df: pd.DataFrame,
metadata_df: pd.DataFrame,
labels_column: str,
positive_label: str = None,
n_estimators=100,
criterion="gini",
max_depth=None,
bootstrap=True,
random_state=42,
n_estimators: int = 100,
criterion: str = "gini",
max_depth: int = None,
bootstrap: bool = True,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,
random_state: int = 42,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring: str = "accuracy",
train_val_split: float = 0.25,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using a random forest classifier from sklearn.
Expand All @@ -109,9 +125,8 @@ def random_forest(
:param labels_column: The column name in the `metadata_df` dataframe that contains
the target variable (labels) for classification.
:type labels_column: str
:param train_test_split: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type train_test_split: int, optional
:param positive_label: The label that should be considered as the positive class.
:type positive_label: str, optional
:param n_estimators: The number of decision trees to be used in the random forest.
:type n_estimators: int, optional
:param criterion: The impurity measure used for tree construction.
Expand All @@ -121,16 +136,35 @@ def random_forest(
:type max_depth: int or None, optional
:param bootstrap: Whether bootstrap samples should be used when building trees.
:type bootstrap: bool, optional
:param test_size: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type test_size: float, optional
:param split_stratify: If not None, data is split in a stratified fashion, using this as
the class labels.
:type split_stratify: str, optional
:param shuffle: Whether to shuffle the data before splitting.
:type shuffle: bool, optional
:param random_state: The random seed for reproducibility.
:type random_state: int
:type random_state: int, optional
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
during validation.
:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:param model_selection_scoring: The scoring metric used to select the best model.
:type model_selection_scoring: str, optional
:param train_val_split: The proportion of data to be used for validation from the train part of the train-test-split. Default is 0.25.
:type train_val_split: float, optional
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param n_splits: The number of folds in a KFold.
:type n_splits: int, optional
:param n_repeats: The number of times cross-validator needs to be repeated.
:type n_repeats: int, optional
:param random_state_cv: The random seed for reproducibility.
:type random_state_cv: int, optional
:param p_samples: The number of samples to be used in the cross-validation.
:type p_samples: float, optional
:return: A RandomForestClassifier instance, a dataframe consisting of the model's
training parameters and the validation score, along with four dataframes
containing the respective test and training samples and labels.
Expand All @@ -155,7 +189,9 @@ def random_forest(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify,
)

clf = RandomForestClassifier()
Expand All @@ -179,7 +215,12 @@ def random_forest(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand All @@ -206,14 +247,28 @@ def svm(
gamma="scale", # only relevant ‘rbf’, ‘poly’ and ‘sigmoid’.
coef0=0.0, # relevant for "poly" and "sigmoid"
probability=True,
tol=0.001,
tolerance=0.001,
class_weight=None,
max_iter=-1,
random_state=42,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using the support vector machine classifier from sklearn.
Expand All @@ -225,6 +280,8 @@ def svm(
:param labels_column: The column name in the `metadata_df` dataframe that contains
the target variable (labels) for classification.
:type labels_column: str
:param positive_label: The label that should be considered as the positive class.
:type positive_label: str, optional
:param C: Regularization parameter
:type C: float
:param kernel: Specifies the kernel type.
Expand All @@ -245,14 +302,34 @@ def svm(
:type max_iter: int
:param random_state: The random seed for reproducibility.
:type random_state: int
:param test_size: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type test_size: float, optional
:param split_stratify: If not None, data is split in a stratified fashion, using this as
the class labels.
:type split_stratify: str, optional
:param shuffle: Whether to shuffle the data before splitting.
:type shuffle: bool, optional

:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
during validation.
:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:param model_selection_scoring: The scoring metric used to select the best model.
:type model_selection_scoring: str, optional
:param train_val_split: The proportion of data to be used for validation from the train part of the train-test-split. Default is 0.25.
:type train_val_split: float, optional
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param n_splits: The number of folds in a KFold.
:type n_splits: int, optional
:param n_repeats: The number of times cross-validator needs to be repeated.
:type n_repeats: int, optional
:param random_state_cv: The random seed for reproducibility.
:type random_state_cv: int, optional
:param p_samples: The number of samples to be used in the cross-validation.
:type p_samples: float, optional
:return: A dict containing: a SVC instance, a dataframe consisting of the model's
training parameters and the validation score, along with four dataframes
containing the respective test and training samples and labels.
Expand All @@ -276,7 +353,9 @@ def svm(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify
)

clf = SVC()
Expand All @@ -287,7 +366,7 @@ def svm(
gamma=gamma,
coef0=coef0,
probability=probability,
tol=tol,
tol=tolerance,
class_weight=class_weight,
max_iter=max_iter,
random_state=random_state,
Expand All @@ -303,7 +382,12 @@ def svm(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_analysis/classification_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def perform_cross_validation(
shuffle="yes",
random_state_cv=42,
p_samples=None,
**parameters,
):
shuffle = shuffle == "yes"
random_state_cv = None if not shuffle else random_state_cv
Expand Down Expand Up @@ -213,7 +212,6 @@ def perform_train_test_split(
random_state=42,
shuffle=True,
split_stratify="yes",
**kwargs,
):
# by default this contains already filtered samples from metadata, we need to remove those
labels_df = labels_df[labels_df.index.isin(input_df.index)]
Expand Down
13 changes: 5 additions & 8 deletions protzilla/data_analysis/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def k_means(
n_init: int = 10,
max_iter: int = 300,
tolerance: float = 1e-4,
**kwargs,
):
"""
A method that uses k-means to partition a number of samples in k clusters. The
Expand Down Expand Up @@ -106,8 +105,7 @@ def k_means(
clf,
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
labels_df=labels_df["Encoded Label"]
)

# create dataframes for ouput dict
Expand Down Expand Up @@ -159,7 +157,7 @@ def expectation_maximisation(
init_params: str = "kmeans",
max_iter: int = 100,
random_state=42,
**kwargs,
model_selection_scoring=None,
):
"""
Performs expectation maximization clustering with a Gaussian Mixture Model, using
Expand Down Expand Up @@ -236,7 +234,7 @@ def expectation_maximisation(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand Down Expand Up @@ -264,7 +262,7 @@ def hierarchical_agglomerative_clustering(
n_clusters: int = 2,
metric: str = "euclidean",
linkage: str = "ward",
**kwargs,
model_selection_scoring=None,
):
"""
Performs Agglomerative Clustering by recursively merging a pair of clusters of
Expand Down Expand Up @@ -327,7 +325,7 @@ def hierarchical_agglomerative_clustering(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand All @@ -348,7 +346,6 @@ def perform_clustering(
scoring,
labels_df=None,
model_selection_scoring=None,
**parameters,
):
if model_selection == "Manual":
model = clf.set_params(**clf_parameters)
Expand Down
4 changes: 1 addition & 3 deletions protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,7 @@ def prot_quant_plot(
:param similarity_measure: method to compare the chosen proteingroup with all others. The two
methods are "cosine similarity" and "euclidean distance".
:param similarity: similarity score of the chosen similarity measurement method.


:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
:return: returns a dictionary containing a list with a plotly figure
"""

wide_df = long_to_wide(input_df) if is_long_format(input_df) else input_df
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ def GO_analysis_with_Enrichr(
background_path=None,
background_number=None,
background_biomart=None,
**kwargs,
):
"""
A method that performs online over-representation analysis for a given set of proteins
Expand Down Expand Up @@ -680,7 +679,6 @@ def GO_analysis_offline(
direction="both",
background_path=None,
background_number=None,
**kwargs,
):
"""
A method that performs offline over-representation analysis for a given set of proteins
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis_gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def gsea_preranked(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Ranks proteins by a provided value column according to ranking_direction and
Expand Down Expand Up @@ -294,7 +293,6 @@ def gsea(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Performs Gene Set Enrichment Analysis (GSEA) on a dataframe with protein IDs, samples and intensities.
Expand Down
Loading
Loading