Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 115 additions & 31 deletions protzilla/data_analysis/classification.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warum werden die Parameter alle so spezifiziert? Damit grenzt man ja die Flexibilität der Funktionen schon ein, weil man jetzt nicht mehr alle Parameter, die der Random Forest Classifier von sklearn bietet, nutzen kann.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich habe eigentlich versucht, dass keine Funktionalität oder Flexibilität verloren geht, sondern nur die **kwargs ausspezifiziert, wenn es konkrete Parameter sind, die verwendet werden, oder entfernt, wenn sie überhaupt nicht genutzt werden. Wenn ich etwas übersehen habe, dann ändere ich das gerne. Kannst du mir dafür eine konkrete Stelle nennen, an der Flexibilität verloren geht?
Zum Hintergrund, warum ich das gemacht habe: Ich fand es sehr unverständlich, welche Parameter bei den kwargs erwartet werden, und missverständlich, wenn die Funktion kwargs angenommen hat, obwohl mit diesen nichts getan wird. So wollte ich in den Funktionen klarer machen, welche eigentlich die Parameter sind, die genutzt werden.

Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ def perform_classification(
clf_parameters,
scoring,
model_selection_scoring="accuracy",
test_validate_split=None,
**parameters,
train_validate_split=None,
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
if validation_strategy == "Manual" and grid_search_method == "Manual":
X_train, X_val, y_train, y_val = perform_train_test_split(
input_df, labels_df, test_size=test_validate_split
input_df, labels_df, test_size=train_validate_split
)
model = clf.set_params(**clf_parameters)
model.fit(X_train, y_train)
Expand All @@ -54,7 +57,7 @@ def perform_classification(
return "Please select a cross validation strategy"
elif validation_strategy != "Manual" and grid_search_method == "Manual":
model = clf.set_params(**clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits,n_repeats,random_state_cv=random_state_cv, p_samples=p_samples)
scores = cross_validate(
model, input_df, labels_df, scoring=scoring, cv=cv, return_train_score=True
)
Expand All @@ -66,7 +69,7 @@ def perform_classification(
return model, model_evaluation_df
elif validation_strategy != "Manual" and grid_search_method != "Manual":
clf_parameters = create_dict_with_lists_as_values(clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits, n_repeats, random_state_cv=random_state_cv, p_samples=p_samples)
model = perform_grid_search_cv(
grid_search_method,
clf,
Expand All @@ -83,21 +86,34 @@ def perform_classification(
)
return model.best_estimator_, model_evaluation_df


def random_forest(
input_df: pd.DataFrame,
metadata_df: pd.DataFrame,
labels_column: str,
positive_label: str = None,
n_estimators=100,
criterion="gini",
max_depth=None,
bootstrap=True,
random_state=42,
n_estimators: int = 100,
criterion: str = "gini",
max_depth: int = None,
bootstrap: bool = True,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,
random_state: int = 42,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring: str = "accuracy",
train_val_split: float = 0.25,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using a random forest classifier from sklearn.
Expand All @@ -109,9 +125,8 @@ def random_forest(
:param labels_column: The column name in the `metadata_df` dataframe that contains
the target variable (labels) for classification.
:type labels_column: str
:param train_test_split: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type train_test_split: int, optional
:param positive_label: The label that should be considered as the positive class.
:type positive_label: str, optional
:param n_estimators: The number of decision trees to be used in the random forest.
:type n_estimators: int, optional
:param criterion: The impurity measure used for tree construction.
Expand All @@ -121,16 +136,35 @@ def random_forest(
:type max_depth: int or None, optional
:param bootstrap: Whether bootstrap samples should be used when building trees.
:type bootstrap: bool, optional
:param test_size: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type test_size: float, optional
:param split_stratify: If not None, data is split in a stratified fashion, using this as
the class labels.
:type split_stratify: str, optional
:param shuffle: Whether to shuffle the data before splitting.
:type shuffle: bool, optional
:param random_state: The random seed for reproducibility.
:type random_state: int
:type random_state: int, optional
:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
during validation.
:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:param model_selection_scoring: The scoring metric used to select the best model.
:type model_selection_scoring: str, optional
:param train_val_split: The proportion of data to be used for validation from the train part of the train-test-split. Default is 0.25.
:type train_val_split: float, optional
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param n_splits: The number of folds in a KFold.
:type n_splits: int, optional
:param n_repeats: The number of times cross-validator needs to be repeated.
:type n_repeats: int, optional
:param random_state_cv: The random seed for reproducibility.
:type random_state_cv: int, optional
:param p_samples: The number of samples to be used in the cross-validation.
:type p_samples: float, optional
:return: A RandomForestClassifier instance, a dataframe consisting of the model's
training parameters and the validation score, along with four dataframes
containing the respective test and training samples and labels.
Expand All @@ -155,7 +189,9 @@ def random_forest(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify,
)

clf = RandomForestClassifier()
Expand All @@ -179,7 +215,12 @@ def random_forest(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand All @@ -206,14 +247,28 @@ def svm(
gamma="scale", # only relevant ‘rbf’, ‘poly’ and ‘sigmoid’.
coef0=0.0, # relevant for "poly" and "sigmoid"
probability=True,
tol=0.001,
tolerance=0.001,
class_weight=None,
max_iter=-1,
random_state=42,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using the support vector machine classifier from sklearn.
Expand All @@ -225,6 +280,8 @@ def svm(
:param labels_column: The column name in the `metadata_df` dataframe that contains
the target variable (labels) for classification.
:type labels_column: str
:param positive_label: The label that should be considered as the positive class.
:type positive_label: str, optional
:param C: Regularization parameter
:type C: float
:param kernel: Specifies the kernel type.
Expand All @@ -245,14 +302,34 @@ def svm(
:type max_iter: int
:param random_state: The random seed for reproducibility.
:type random_state: int
:param test_size: The proportion of data to be used for testing. Default is
0.2 (80-20 train-test split).
:type test_size: float, optional
:param split_stratify: If not None, data is split in a stratified fashion, using this as
the class labels.
:type split_stratify: str, optional
:param shuffle: Whether to shuffle the data before splitting.
:type shuffle: bool, optional

:param model_selection: The model selection method for hyperparameter tuning.
:type model_selection: str
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param scoring: The scoring metric(s) used to evaluate the model's performance
during validation.
:type scoring: list[str]
:param **kwargs: Additional keyword arguments to be passed to the function.
:param model_selection_scoring: The scoring metric used to select the best model.
:type model_selection_scoring: str, optional
:param train_val_split: The proportion of data to be used for validation from the train part of the train-test-split. Default is 0.25.
:type train_val_split: float, optional
:param validation_strategy: The strategy for model validation.
:type validation_strategy: str
:param n_splits: The number of folds in a KFold.
:type n_splits: int, optional
:param n_repeats: The number of times cross-validator needs to be repeated.
:type n_repeats: int, optional
:param random_state_cv: The random seed for reproducibility.
:type random_state_cv: int, optional
:param p_samples: The number of samples to be used in the cross-validation.
:type p_samples: float, optional
:return: A dict containing: a SVC instance, a dataframe consisting of the model's
training parameters and the validation score, along with four dataframes
containing the respective test and training samples and labels.
Expand All @@ -276,7 +353,9 @@ def svm(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify
)

clf = SVC()
Expand All @@ -287,7 +366,7 @@ def svm(
gamma=gamma,
coef0=coef0,
probability=probability,
tol=tol,
tol=tolerance,
class_weight=class_weight,
max_iter=max_iter,
random_state=random_state,
Expand All @@ -303,7 +382,12 @@ def svm(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_analysis/classification_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def perform_cross_validation(
shuffle="yes",
random_state_cv=42,
p_samples=None,
**parameters,
):
shuffle = shuffle == "yes"
random_state_cv = None if not shuffle else random_state_cv
Expand Down Expand Up @@ -213,7 +212,6 @@ def perform_train_test_split(
random_state=42,
shuffle=True,
split_stratify="yes",
**kwargs,
):
# by default this contains already filtered samples from metadata, we need to remove those
labels_df = labels_df[labels_df.index.isin(input_df.index)]
Expand Down
13 changes: 5 additions & 8 deletions protzilla/data_analysis/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def k_means(
n_init: int = 10,
max_iter: int = 300,
tolerance: float = 1e-4,
**kwargs,
):
"""
A method that uses k-means to partition a number of samples in k clusters. The
Expand Down Expand Up @@ -106,8 +105,7 @@ def k_means(
clf,
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
labels_df=labels_df["Encoded Label"]
)

# create dataframes for ouput dict
Expand Down Expand Up @@ -159,7 +157,7 @@ def expectation_maximisation(
init_params: str = "kmeans",
max_iter: int = 100,
random_state=42,
**kwargs,
model_selection_scoring=None,
):
"""
Performs expectation maximization clustering with a Gaussian Mixture Model, using
Expand Down Expand Up @@ -236,7 +234,7 @@ def expectation_maximisation(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand Down Expand Up @@ -264,7 +262,7 @@ def hierarchical_agglomerative_clustering(
n_clusters: int = 2,
metric: str = "euclidean",
linkage: str = "ward",
**kwargs,
model_selection_scoring=None,
):
"""
Performs Agglomerative Clustering by recursively merging a pair of clusters of
Expand Down Expand Up @@ -327,7 +325,7 @@ def hierarchical_agglomerative_clustering(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand All @@ -348,7 +346,6 @@ def perform_clustering(
scoring,
labels_df=None,
model_selection_scoring=None,
**parameters,
):
if model_selection == "Manual":
model = clf.set_params(**clf_parameters)
Expand Down
4 changes: 1 addition & 3 deletions protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,7 @@ def prot_quant_plot(
:param similarity_measure: method to compare the chosen proteingroup with all others. The two
methods are "cosine similarity" and "euclidean distance".
:param similarity: similarity score of the chosen similarity measurement method.


:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
:return: returns a dictionary containing a list with a plotly figure
"""

wide_df = long_to_wide(input_df) if is_long_format(input_df) else input_df
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ def GO_analysis_with_Enrichr(
background_path=None,
background_number=None,
background_biomart=None,
**kwargs,
):
"""
A method that performs online over-representation analysis for a given set of proteins
Expand Down Expand Up @@ -680,7 +679,6 @@ def GO_analysis_offline(
direction="both",
background_path=None,
background_number=None,
**kwargs,
):
"""
A method that performs offline over-representation analysis for a given set of proteins
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis_gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def gsea_preranked(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Ranks proteins by a provided value column according to ranking_direction and
Expand Down Expand Up @@ -294,7 +293,6 @@ def gsea(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Performs Gene Set Enrichment Analysis (GSEA) on a dataframe with protein IDs, samples and intensities.
Expand Down
Loading
Loading