Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 58 additions & 14 deletions protzilla/data_analysis/classification.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warum werden die Parameter alle so spezifiziert? Damit grenzt man ja die Flexibilität der Funktionen schon ein, weil man jetzt nicht mehr alle Parameter, die der Random Forest Classifier von sklearn bietet, nutzen kann.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich habe eigentlich versucht, dass keine Funktionalität oder Flexibilität verloren geht, sondern nur die **kwargs ausspezifiziert, wenn es konkrete Parameter sind, die verwendet werden, oder entfernt, wenn sie überhaupt nicht genutzt werden. Wenn ich etwas übersehen habe, dann ändere ich das gerne. Kannst du mir dafür eine konkrete Stelle nennen, an der Flexibilität verloren geht?
Zum Hintergrund, warum ich das gemacht habe: Ich fand es sehr unverständlich, welche Parameter bei den kwargs erwartet werden, und missverständlich, wenn die Funktion kwargs angenommen hat, obwohl mit diesen nichts getan wird. So wollte ich in den Funktionen klarer machen, welche Parameter eigentlich genutzt werden.

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ def perform_classification(
scoring,
model_selection_scoring="accuracy",
test_validate_split=None,
**parameters,
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
if validation_strategy == "Manual" and grid_search_method == "Manual":
X_train, X_val, y_train, y_val = perform_train_test_split(
Expand All @@ -54,7 +57,7 @@ def perform_classification(
return "Please select a cross validation strategy"
elif validation_strategy != "Manual" and grid_search_method == "Manual":
model = clf.set_params(**clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits,n_repeats,random_state_cv=random_state_cv, p_samples=p_samples)
scores = cross_validate(
model, input_df, labels_df, scoring=scoring, cv=cv, return_train_score=True
)
Expand All @@ -66,7 +69,7 @@ def perform_classification(
return model, model_evaluation_df
elif validation_strategy != "Manual" and grid_search_method != "Manual":
clf_parameters = create_dict_with_lists_as_values(clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits, n_repeats, random_state_cv=random_state_cv, p_samples=p_samples)
model = perform_grid_search_cv(
grid_search_method,
clf,
Expand All @@ -83,7 +86,6 @@ def perform_classification(
)
return model.best_estimator_, model_evaluation_df


def random_forest(
input_df: pd.DataFrame,
metadata_df: pd.DataFrame,
Expand All @@ -93,11 +95,25 @@ def random_forest(
criterion="gini",
max_depth=None,
bootstrap=True,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,
random_state=42,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using a random forest classifier from sklearn.
Expand Down Expand Up @@ -155,7 +171,9 @@ def random_forest(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify,
)

clf = RandomForestClassifier()
Expand All @@ -179,7 +197,12 @@ def random_forest(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand All @@ -206,14 +229,28 @@ def svm(
gamma="scale", # only relevant ‘rbf’, ‘poly’ and ‘sigmoid’.
coef0=0.0, # relevant for "poly" and "sigmoid"
probability=True,
tol=0.001,
tolerance=0.001,
class_weight=None,
max_iter=-1,
random_state=42,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using the support vector machine classifier from sklearn.
Expand Down Expand Up @@ -276,7 +313,9 @@ def svm(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify
)

clf = SVC()
Expand All @@ -287,7 +326,7 @@ def svm(
gamma=gamma,
coef0=coef0,
probability=probability,
tol=tol,
tol=tolerance,
class_weight=class_weight,
max_iter=max_iter,
random_state=random_state,
Expand All @@ -303,7 +342,12 @@ def svm(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_analysis/classification_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def perform_cross_validation(
shuffle="yes",
random_state_cv=42,
p_samples=None,
**parameters,
):
shuffle = shuffle == "yes"
random_state_cv = None if not shuffle else random_state_cv
Expand Down Expand Up @@ -213,7 +212,6 @@ def perform_train_test_split(
random_state=42,
shuffle=True,
split_stratify="yes",
**kwargs,
):
# by default this contains already filtered samples from metadata, we need to remove those
labels_df = labels_df[labels_df.index.isin(input_df.index)]
Expand Down
13 changes: 5 additions & 8 deletions protzilla/data_analysis/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def k_means(
n_init: int = 10,
max_iter: int = 300,
tolerance: float = 1e-4,
**kwargs,
):
"""
A method that uses k-means to partition a number of samples in k clusters. The
Expand Down Expand Up @@ -106,8 +105,7 @@ def k_means(
clf,
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
labels_df=labels_df["Encoded Label"]
)

# create dataframes for ouput dict
Expand Down Expand Up @@ -159,7 +157,7 @@ def expectation_maximisation(
init_params: str = "kmeans",
max_iter: int = 100,
random_state=42,
**kwargs,
model_selection_scoring=None,
):
"""
Performs expectation maximization clustering with a Gaussian Mixture Model, using
Expand Down Expand Up @@ -236,7 +234,7 @@ def expectation_maximisation(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand Down Expand Up @@ -264,7 +262,7 @@ def hierarchical_agglomerative_clustering(
n_clusters: int = 2,
metric: str = "euclidean",
linkage: str = "ward",
**kwargs,
model_selection_scoring=None,
):
"""
Performs Agglomerative Clustering by recursively merging a pair of clusters of
Expand Down Expand Up @@ -327,7 +325,7 @@ def hierarchical_agglomerative_clustering(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand All @@ -348,7 +346,6 @@ def perform_clustering(
scoring,
labels_df=None,
model_selection_scoring=None,
**parameters,
):
if model_selection == "Manual":
model = clf.set_params(**clf_parameters)
Expand Down
4 changes: 1 addition & 3 deletions protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,7 @@ def prot_quant_plot(
:param similarity_measure: method to compare the chosen proteingroup with all others. The two
methods are "cosine similarity" and "euclidean distance".
:param similarity: similarity score of the chosen similarity measurement method.


:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
:return: returns a dictionary containing a list with a plotly figure
"""

wide_df = long_to_wide(input_df) if is_long_format(input_df) else input_df
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ def GO_analysis_with_Enrichr(
background_path=None,
background_number=None,
background_biomart=None,
**kwargs,
):
"""
A method that performs online over-representation analysis for a given set of proteins
Expand Down Expand Up @@ -680,7 +679,6 @@ def GO_analysis_offline(
direction="both",
background_path=None,
background_number=None,
**kwargs,
):
"""
A method that performs offline over-representation analysis for a given set of proteins
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis_gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def gsea_preranked(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Ranks proteins by a provided value column according to ranking_direction and
Expand Down Expand Up @@ -294,7 +293,6 @@ def gsea(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Performs Gene Set Enrichment Analysis (GSEA) on a dataframe with protein IDs, samples and intensities.
Expand Down
25 changes: 9 additions & 16 deletions protzilla/data_preprocessing/filter_proteins.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich vermute die build_pie_bar_plot Funktion wurde sonst nirgends aufgerufen? Aber ist es vielleicht nicht sinnvoll, wenn man eine generische allgemeine Funktion für diese Art von Plot hat? Für den Fall, dass man sie nochmal verwenden möchte. Also warum hast du das zusammengeführt?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grundsätzlich bin ich voll bei dir und fände eine allgemeine Funktion auch sehr sinnvoll. Allerdings ist die Funktion an dieser Stelle so spezifisch (bspw. durch die Labels ["Proteins kept", "Proteins filtered"]), dass man sie sowieso kein zweites Mal verwenden kann. Hintergrund war einfach nur, dass ich die zweite Funktion entfernen wollte, weil sie nichts tut.

Eigentlich ist die Funktion dadurch, dass ich nur Parameter umbenannt habe, an anderer Stelle genauso gut wiederverwendbar wie vorher.

Fändest du es trotzdem besser, wenn ich wieder eine zusätzliche Funktion einfüge?

Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot

from ..utilities.transform_dfs import long_to_wide


Expand Down Expand Up @@ -30,9 +31,7 @@ def by_samples_missing(
filtered_proteins_list = (
transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist()
)
filtered_df = protein_df[
(protein_df["Protein ID"].isin(remaining_proteins_list))
]
filtered_df = protein_df[(protein_df["Protein ID"].isin(remaining_proteins_list))]
filtered_peptide_df = None
if peptide_df is not None:
filtered_peptide_df = peptide_df[
Expand All @@ -46,32 +45,26 @@ def by_samples_missing(
)


def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type):
def by_samples_missing_plot(
output_remaining_proteins, output_filtered_proteins, graph_type
):
if graph_type == "Pie chart":
fig = create_pie_plot(
values_of_sectors=[
len(remaining_proteins),
len(filtered_proteins),
len(output_remaining_proteins),
len(output_filtered_proteins),
],
names_of_sectors=["Proteins kept", "Proteins filtered"],
heading="Number of Filtered Proteins",
)
elif graph_type == "Bar chart":
fig = create_bar_plot(
values_of_sectors=[
len(remaining_proteins),
len(filtered_proteins),
len(output_remaining_proteins),
len(output_filtered_proteins),
],
names_of_sectors=["Proteins kept", "Proteins filtered"],
heading="Number of Filtered Proteins",
y_title="Number of Proteins",
)
return [fig]


def by_samples_missing_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["remaining_proteins"],
method_outputs["filtered_proteins"],
graph_type,
)
20 changes: 8 additions & 12 deletions protzilla/data_preprocessing/filter_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,22 +133,18 @@ def by_proteins_missing(
)


def by_protein_intensity_sum_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_protein_intensity_sum_plot(
output_protein_df, output_filtered_samples, graph_type
):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def by_proteins_missing_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_proteins_missing_plot(output_protein_df, output_filtered_samples, graph_type):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def by_protein_count_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_protein_count_plot(output_protein_df, output_filtered_samples, graph_type):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def _build_pie_bar_plot(result_df, filtered_sampels, graph_type):
Expand Down
Loading