Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 58 additions & 14 deletions protzilla/data_analysis/classification.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warum werden die Parameter alle so spezifiziert? Damit grenzt man ja die Flexibilität der Funktionen schon ein, weil man jetzt nicht mehr alle Parameter, die der Random Forest Classifier von sklearn bietet, nutzen kann.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich habe eigentlich versucht, dass keine Funktionalität oder Flexibilität verloren geht, sondern nur die **kwargs ausspezifiziert, wenn es konkrete Parameter sind, die verwendet werden, oder entfernt, wenn sie überhaupt nicht genutzt werden. Wenn ich etwas übersehen habe, dann ändere ich das gerne. Kannst du mir dafür eine konkrete Stelle nennen, an der Flexibilität verloren geht?
Zum Hintergrund, warum ich das gemacht habe: Ich fand es sehr unverständlich, welche Parameter bei den kwargs erwartet werden, und missverständlich, wenn die Funktion kwargs angenommen hat, obwohl mit diesen nichts getan wird. So wollte ich in den Funktionen klarer machen, welche Parameter eigentlich genutzt werden.

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ def perform_classification(
scoring,
model_selection_scoring="accuracy",
test_validate_split=None,
**parameters,
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
if validation_strategy == "Manual" and grid_search_method == "Manual":
X_train, X_val, y_train, y_val = perform_train_test_split(
Expand All @@ -54,7 +57,7 @@ def perform_classification(
return "Please select a cross validation strategy"
elif validation_strategy != "Manual" and grid_search_method == "Manual":
model = clf.set_params(**clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits,n_repeats,random_state_cv=random_state_cv, p_samples=p_samples)
scores = cross_validate(
model, input_df, labels_df, scoring=scoring, cv=cv, return_train_score=True
)
Expand All @@ -66,7 +69,7 @@ def perform_classification(
return model, model_evaluation_df
elif validation_strategy != "Manual" and grid_search_method != "Manual":
clf_parameters = create_dict_with_lists_as_values(clf_parameters)
cv = perform_cross_validation(validation_strategy, **parameters)
cv = perform_cross_validation(validation_strategy, n_splits, n_repeats, random_state_cv=random_state_cv, p_samples=p_samples)
model = perform_grid_search_cv(
grid_search_method,
clf,
Expand All @@ -83,7 +86,6 @@ def perform_classification(
)
return model.best_estimator_, model_evaluation_df


def random_forest(
input_df: pd.DataFrame,
metadata_df: pd.DataFrame,
Expand All @@ -93,11 +95,25 @@ def random_forest(
criterion="gini",
max_depth=None,
bootstrap=True,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,
random_state=42,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using a random forest classifier from sklearn.
Expand Down Expand Up @@ -155,7 +171,9 @@ def random_forest(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify,
)

clf = RandomForestClassifier()
Expand All @@ -179,7 +197,12 @@ def random_forest(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand All @@ -206,14 +229,28 @@ def svm(
gamma="scale", # only relevant ‘rbf’, ‘poly’ and ‘sigmoid’.
coef0=0.0, # relevant for "poly" and "sigmoid"
probability=True,
tol=0.001,
tolerance=0.001,
class_weight=None,
max_iter=-1,
random_state=42,

#test_split_parameters
test_size: float = 0.2,
split_stratify: str = "yes",
shuffle: bool = True,

#classification_parameters
model_selection: str = "Grid search",
validation_strategy: str = "Cross Validation",
scoring: list[str] = ["accuracy"],
**kwargs,
model_selection_scoring = "accuracy",
train_val_split: float | None = None,
validation_strategy: str = "Cross Validation",

#cross_validation_parameters
n_splits: int = 5,
n_repeats: int = 10,
random_state_cv: int = 42,
p_samples = None,
):
"""
Perform classification using the support vector machine classifier from sklearn.
Expand Down Expand Up @@ -276,7 +313,9 @@ def svm(
X_train, X_test, y_train, y_test = perform_train_test_split(
input_df_wide,
labels_df["Encoded Label"],
**kwargs,
test_size,
shuffle=shuffle,
split_stratify=split_stratify
)

clf = SVC()
Expand All @@ -287,7 +326,7 @@ def svm(
gamma=gamma,
coef0=coef0,
probability=probability,
tol=tol,
tol=tolerance,
class_weight=class_weight,
max_iter=max_iter,
random_state=random_state,
Expand All @@ -303,7 +342,12 @@ def svm(
clf,
clf_parameters,
scoring,
**kwargs,
model_selection_scoring,
train_val_split,
n_splits,
n_repeats,
random_state_cv,
p_samples,
)

X_test.reset_index(inplace=True)
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_analysis/classification_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def perform_cross_validation(
shuffle="yes",
random_state_cv=42,
p_samples=None,
**parameters,
):
shuffle = shuffle == "yes"
random_state_cv = None if not shuffle else random_state_cv
Expand Down Expand Up @@ -213,7 +212,6 @@ def perform_train_test_split(
random_state=42,
shuffle=True,
split_stratify="yes",
**kwargs,
):
# by default this contains already filtered samples from metadata, we need to remove those
labels_df = labels_df[labels_df.index.isin(input_df.index)]
Expand Down
13 changes: 5 additions & 8 deletions protzilla/data_analysis/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ def k_means(
n_init: int = 10,
max_iter: int = 300,
tolerance: float = 1e-4,
**kwargs,
):
"""
A method that uses k-means to partition a number of samples in k clusters. The
Expand Down Expand Up @@ -106,8 +105,7 @@ def k_means(
clf,
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
labels_df=labels_df["Encoded Label"]
)

# create dataframes for ouput dict
Expand Down Expand Up @@ -159,7 +157,7 @@ def expectation_maximisation(
init_params: str = "kmeans",
max_iter: int = 100,
random_state=42,
**kwargs,
model_selection_scoring=None,
):
"""
Performs expectation maximization clustering with a Gaussian Mixture Model, using
Expand Down Expand Up @@ -236,7 +234,7 @@ def expectation_maximisation(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand Down Expand Up @@ -264,7 +262,7 @@ def hierarchical_agglomerative_clustering(
n_clusters: int = 2,
metric: str = "euclidean",
linkage: str = "ward",
**kwargs,
model_selection_scoring=None,
):
"""
Performs Agglomerative Clustering by recursively merging a pair of clusters of
Expand Down Expand Up @@ -327,7 +325,7 @@ def hierarchical_agglomerative_clustering(
clf_parameters,
scoring,
labels_df=labels_df["Encoded Label"],
**kwargs,
model_selection_scoring = model_selection_scoring,
)

cluster_labels_df = pd.DataFrame(
Expand All @@ -348,7 +346,6 @@ def perform_clustering(
scoring,
labels_df=None,
model_selection_scoring=None,
**parameters,
):
if model_selection == "Manual":
model = clf.set_params(**clf_parameters)
Expand Down
4 changes: 1 addition & 3 deletions protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,7 @@ def prot_quant_plot(
:param similarity_measure: method to compare the chosen proteingroup with all others. The two
methods are "cosine similarity" and "euclidean distance".
:param similarity: similarity score of the chosen similarity measurement method.


:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
:return: returns a dictionary containing a list with a plotly figure
"""

wide_df = long_to_wide(input_df) if is_long_format(input_df) else input_df
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,6 @@ def GO_analysis_with_Enrichr(
background_path=None,
background_number=None,
background_biomart=None,
**kwargs,
):
"""
A method that performs online over-representation analysis for a given set of proteins
Expand Down Expand Up @@ -680,7 +679,6 @@ def GO_analysis_offline(
direction="both",
background_path=None,
background_number=None,
**kwargs,
):
"""
A method that performs offline over-representation analysis for a given set of proteins
Expand Down
2 changes: 0 additions & 2 deletions protzilla/data_integration/enrichment_analysis_gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def gsea_preranked(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Ranks proteins by a provided value column according to ranking_direction and
Expand Down Expand Up @@ -294,7 +293,6 @@ def gsea(
weighted_score=1.0,
seed=123,
threads=4,
**kwargs,
):
"""
Performs Gene Set Enrichment Analysis (GSEA) on a dataframe with protein IDs, samples and intensities.
Expand Down
25 changes: 9 additions & 16 deletions protzilla/data_preprocessing/filter_proteins.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ich vermute die build_pie_bar_plot Funktion wurde sonst nirgends aufgerufen? Aber ist es vielleicht nicht sinnvoll, wenn man eine generische allgemeine Funktion für diese Art von Plot hat? Für den Fall, dass man sie nochmal verwenden möchte. Also warum hast du das zusammengeführt?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grundsätzlich bin ich voll bei dir und fände eine allgemeine Funktion auch sehr sinnvoll. Allerdings ist die Funktion an dieser Stelle so spezifisch (bspw. durch die Labels ["Proteins kept", "Proteins filtered"]), dass man sie sowieso kein zweites Mal verwenden kann. Hintergrund war einfach nur, dass ich die zweite Funktion entfernen wollte, weil sie nichts tut.

Eigentlich ist die Funktion dadurch, dass ich nur Parameter umbenannt habe, an anderer Stelle genauso gut wiederverwendbar wie vorher.

Fändest du es trotzdem besser, wenn ich wieder eine zusätzliche Funktion einfüge?

Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from protzilla.data_preprocessing.plots import create_bar_plot, create_pie_plot

from ..utilities.transform_dfs import long_to_wide


Expand Down Expand Up @@ -30,9 +31,7 @@ def by_samples_missing(
filtered_proteins_list = (
transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist()
)
filtered_df = protein_df[
(protein_df["Protein ID"].isin(remaining_proteins_list))
]
filtered_df = protein_df[(protein_df["Protein ID"].isin(remaining_proteins_list))]
filtered_peptide_df = None
if peptide_df is not None:
filtered_peptide_df = peptide_df[
Expand All @@ -46,32 +45,26 @@ def by_samples_missing(
)


def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type):
def by_samples_missing_plot(
output_remaining_proteins, output_filtered_proteins, graph_type
):
if graph_type == "Pie chart":
fig = create_pie_plot(
values_of_sectors=[
len(remaining_proteins),
len(filtered_proteins),
len(output_remaining_proteins),
len(output_filtered_proteins),
],
names_of_sectors=["Proteins kept", "Proteins filtered"],
heading="Number of Filtered Proteins",
)
elif graph_type == "Bar chart":
fig = create_bar_plot(
values_of_sectors=[
len(remaining_proteins),
len(filtered_proteins),
len(output_remaining_proteins),
len(output_filtered_proteins),
],
names_of_sectors=["Proteins kept", "Proteins filtered"],
heading="Number of Filtered Proteins",
y_title="Number of Proteins",
)
return [fig]


def by_samples_missing_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["remaining_proteins"],
method_outputs["filtered_proteins"],
graph_type,
)
20 changes: 8 additions & 12 deletions protzilla/data_preprocessing/filter_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,22 +133,18 @@ def by_proteins_missing(
)


def by_protein_intensity_sum_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_protein_intensity_sum_plot(
output_protein_df, output_filtered_samples, graph_type
):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def by_proteins_missing_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_proteins_missing_plot(output_protein_df, output_filtered_samples, graph_type):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def by_protein_count_plot(method_inputs, method_outputs, graph_type):
return _build_pie_bar_plot(
method_outputs["protein_df"], method_outputs["filtered_samples"], graph_type
)
def by_protein_count_plot(output_protein_df, output_filtered_samples, graph_type):
return _build_pie_bar_plot(output_protein_df, output_filtered_samples, graph_type)


def _build_pie_bar_plot(result_df, filtered_sampels, graph_type):
Expand Down
Loading