diff --git a/drevalpy/.DS_Store b/drevalpy/.DS_Store deleted file mode 100644 index d72454d2..00000000 Binary files a/drevalpy/.DS_Store and /dev/null differ diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py index 50c2bb21..254e914b 100644 --- a/drevalpy/datasets/utils.py +++ b/drevalpy/datasets/utils.py @@ -1,5 +1,6 @@ """Utility functions for datasets.""" +import os import zipfile from pathlib import Path from typing import Any @@ -60,7 +61,7 @@ def download_dataset( with zipfile.ZipFile(file_path, "r") as z: for member in z.infolist(): if not member.filename.startswith("__MACOSX/"): - z.extract(member, data_path) + z.extract(member, os.path.join(data_path, dataset_name)) file_path.unlink() # Remove zip file after extraction print(f"{dataset_name} data downloaded and extracted to {data_path}") diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py index bed753ff..7fad3878 100644 --- a/drevalpy/experiment.py +++ b/drevalpy/experiment.py @@ -903,10 +903,11 @@ def train_and_predict( train_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) prediction_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) - print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, because of missing features") - print( - f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, because of missing features" - ) + if len(train_dataset) < len_train_before or len(prediction_dataset) < len_pred_before: + print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, due to missing features") + print( + f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, due to missing features" + ) if early_stopping_dataset is not None: len_es_before = len(early_stopping_dataset) @@ -1142,8 +1143,7 @@ def make_model_list(models: list[type[DRPModel]], response_data: DrugResponseDat @pipeline_function def get_model_name_and_drug_id(model_name: 
str) -> tuple[str, str | None]: - """ - Get the model name and drug id from the model name. + """Get the model name and drug id from the model name. :param model_name: model name, e.g., SimpleNeuralNetwork or MOLIR.Afatinib :returns: tuple of model name and, potentially drug id if it is a single drug model diff --git a/drevalpy/models/.DS_Store b/drevalpy/models/.DS_Store deleted file mode 100644 index 0cb93c0d..00000000 Binary files a/drevalpy/models/.DS_Store and /dev/null differ diff --git a/drevalpy/models/__init__.py b/drevalpy/models/__init__.py index 135a60ab..92ff9987 100644 --- a/drevalpy/models/__init__.py +++ b/drevalpy/models/__init__.py @@ -12,6 +12,8 @@ "MultiOmicsNeuralNetwork", "MultiOmicsRandomForest", "SingleDrugRandomForest", + "SingleDrugElasticNet", + "SingleDrugProteomicsElasticNet", "SRMF", "GradientBoosting", "MOLIR", @@ -29,6 +31,7 @@ NaiveMeanEffectsPredictor, NaivePredictor, ) +from .baselines.singledrug_elastic_net import SingleDrugElasticNet, SingleDrugProteomicsElasticNet from .baselines.singledrug_random_forest import SingleDrugRandomForest from .baselines.sklearn_models import ElasticNetModel, GradientBoosting, RandomForest, SVMRegressor from .DIPK.dipk import DIPKModel @@ -44,6 +47,8 @@ "SingleDrugRandomForest": SingleDrugRandomForest, "MOLIR": MOLIR, "SuperFELTR": SuperFELTR, + "SingleDrugElasticNet": SingleDrugElasticNet, + "SingleDrugProteomicsElasticNet": SingleDrugProteomicsElasticNet, } # MULTI_DRUG_MODEL_FACTORY is used in the pipeline! 
diff --git a/drevalpy/models/baselines/hyperparameters.yaml b/drevalpy/models/baselines/hyperparameters.yaml index 293d9983..cd37b5e7 100644 --- a/drevalpy/models/baselines/hyperparameters.yaml +++ b/drevalpy/models/baselines/hyperparameters.yaml @@ -101,3 +101,33 @@ GradientBoosting: - 1.0 - 0.8 - 0.5 +SingleDrugElasticNet: + l1_ratio: + - 0.2 + - 0.5 + - 0.9 + alpha: + - 1 + - 0.8 + - 0.6 + - 0.4 + - 0.2 + - 0.1 + - 5 + - 10 + - 100 +SingleDrugProteomicsElasticNet: + l1_ratio: + - 0.2 + - 0.5 + - 0.9 + alpha: + - 1 + - 0.8 + - 0.6 + - 0.4 + - 0.2 + - 0.1 + - 5 + - 10 + - 100 diff --git a/drevalpy/models/baselines/singledrug_elastic_net.py b/drevalpy/models/baselines/singledrug_elastic_net.py new file mode 100644 index 00000000..2000644d --- /dev/null +++ b/drevalpy/models/baselines/singledrug_elastic_net.py @@ -0,0 +1,221 @@ +"""SingleDrugElasticNet and SingleDrugProteomicsElasticNet classes. Fit an Elastic net for each drug separately.""" + +import numpy as np +from sklearn.linear_model import ElasticNet + +from ...datasets.dataset import DrugResponseDataset, FeatureDataset +from ..utils import load_and_reduce_gene_features +from .sklearn_models import SklearnModel + + +class SingleDrugElasticNet(SklearnModel): + """SingleDrugElasticNet class.""" + + is_single_drug_model = True + drug_views = [] + cell_line_views = ["gene_expression"] + early_stopping = False + + def build_model(self, hyperparameters): + """ + Builds the model from hyperparameters. + + :param hyperparameters: Elastic net hyperparameters + """ + self.model = ElasticNet(**hyperparameters) + + @classmethod + def get_model_name(cls) -> str: + """ + Returns the model name.
+ + :returns: SingleDrugElasticNet + """ + return "SingleDrugElasticNet" + + def train( + self, + output: DrugResponseDataset, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + output_earlystopping: DrugResponseDataset | None = None, + model_checkpoint_dir: str = "checkpoints", + ) -> None: + """ + Trains the model on the gene expression features of the cell lines; no drug features are used. + + :param output: training dataset containing the response output + :param cell_line_input: training dataset containing gene expression data + :param drug_input: not needed + :param output_earlystopping: not needed + :param model_checkpoint_dir: not needed as checkpoints are not saved + :raises ValueError: if drug_input is not None + """ + if drug_input is not None: + raise ValueError("SingleDrugElasticNet does not support drug_input!") + + if len(output) > 0: + x = self.get_concatenated_features( + cell_line_view="gene_expression", + drug_view=None, + cell_line_ids_output=output.cell_line_ids, + drug_ids_output=output.drug_ids, + cell_line_input=cell_line_input, + drug_input=None, + ) + self.model.fit(x, output.response) + else: + print("No training data provided, will predict NA.") + self.model = None + + def predict( + self, + cell_line_ids: np.ndarray, + drug_ids: np.ndarray, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + ) -> np.ndarray: + """ + Predicts the drug response for the given cell lines.
+ + :param cell_line_ids: cell line ids + :param drug_ids: drug ids, not needed here + :param cell_line_input: cell line input + :param drug_input: drug input, not needed here + :returns: predicted drug response + :raises ValueError: if drug_input is not None + """ + if drug_input is not None: + raise ValueError("drug_input is not needed.") + + if self.model is None: + print("No training data was available, predicting NA.") + return np.array([np.nan] * len(cell_line_ids)) + x = self.get_concatenated_features( + cell_line_view="gene_expression", + drug_view=None, + cell_line_ids_output=cell_line_ids, + drug_ids_output=drug_ids, + cell_line_input=cell_line_input, + drug_input=None, + ) + return self.model.predict(x) + + def load_drug_features(self, data_path, dataset_name): + """ + Load drug features. Not needed for SingleDrugElasticNet. + + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: None + """ + return None + + +class SingleDrugProteomicsElasticNet(SingleDrugElasticNet): + """SingleDrugProteomicsElasticNet class.""" + + cell_line_views = ["proteomics"] + is_single_drug_model = True + + @classmethod + def get_model_name(cls) -> str: + """ + Returns the model name. + + :returns: SingleDrugProteomicsElasticNet + """ + return "SingleDrugProteomicsElasticNet" + + def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset: + """ + Loads the proteomics data. + + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: proteomics data + """ + return load_and_reduce_gene_features( + feature_type="proteomics", + gene_list=None, + data_path=data_path, + dataset_name=dataset_name, + ) + + def load_drug_features(self, data_path, dataset_name): + """ + Load drug features. Not needed for SingleDrugProteomicsElasticNet. 
+ + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: None + """ + return None + + def train( + self, + output: DrugResponseDataset, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + output_earlystopping: DrugResponseDataset | None = None, + model_checkpoint_dir: str = "checkpoints", + ) -> None: + """ + Trains the model on the proteomics features of the cell lines; no drug features are used. + + :param output: training dataset containing the response output + :param cell_line_input: training dataset containing proteomics data + :param drug_input: not needed + :param output_earlystopping: not needed + :param model_checkpoint_dir: not needed as checkpoints are not saved + :raises ValueError: if drug_input is not None + """ + if drug_input is not None: + raise ValueError("SingleDrugElasticNet does not support drug_input!") + + if len(output) > 0: + x = self.get_concatenated_features( + cell_line_view="proteomics", + drug_view=None, + cell_line_ids_output=output.cell_line_ids, + drug_ids_output=output.drug_ids, + cell_line_input=cell_line_input, + drug_input=None, + ) + self.model.fit(x, output.response) + else: + print("No training data provided, will predict NA.") + self.model = None + + def predict( + self, + cell_line_ids: np.ndarray, + drug_ids: np.ndarray, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + ) -> np.ndarray: + """ + Predicts the drug response for the given cell lines.
+ + :param cell_line_ids: cell line ids + :param drug_ids: drug ids, not needed here + :param cell_line_input: cell line input + :param drug_input: drug input, not needed here + :returns: predicted drug response + :raises ValueError: if drug_input is not None + """ + if drug_input is not None: + raise ValueError("drug_input is not needed.") + + if self.model is None: + print("No training data was available, predicting NA.") + return np.array([np.nan] * len(cell_line_ids)) + x = self.get_concatenated_features( + cell_line_view="proteomics", + drug_view=None, + cell_line_ids_output=cell_line_ids, + drug_ids_output=drug_ids, + cell_line_input=cell_line_input, + drug_input=None, + ) + return self.model.predict(x) diff --git a/drevalpy/models/baselines/singledrug_random_forest.py b/drevalpy/models/baselines/singledrug_random_forest.py index 27aec509..ccf1d30a 100644 --- a/drevalpy/models/baselines/singledrug_random_forest.py +++ b/drevalpy/models/baselines/singledrug_random_forest.py @@ -94,3 +94,13 @@ def predict( drug_input=None, ) return self.model.predict(x) + + def load_drug_features(self, data_path, dataset_name): + """ + Load drug features. Not needed for SingleDrugRandomForest. 
+ + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: None + """ + return None diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py index b7208361..af371531 100644 --- a/drevalpy/models/utils.py +++ b/drevalpy/models/utils.py @@ -151,7 +151,7 @@ def get_multiomics_feature_dataset( :raises ValueError: if no omics features are found """ if omics is None: - omics = ["gene_expression", "methylation", "mutations", "copy_number_variation_gistic"] + omics = ["gene_expression", "methylation", "mutations", "copy_number_variation_gistic", "proteomics"] feature_dataset = None for omic in omics: if feature_dataset is None: diff --git a/tests/individual_models/test_baselines.py b/tests/individual_models/test_baselines.py index 287e735c..9639beb2 100644 --- a/tests/individual_models/test_baselines.py +++ b/tests/individual_models/test_baselines.py @@ -14,6 +14,8 @@ NaiveDrugMeanPredictor, NaiveMeanEffectsPredictor, NaivePredictor, + SingleDrugElasticNet, + SingleDrugProteomicsElasticNet, SingleDrugRandomForest, ) from drevalpy.models.baselines.sklearn_models import SklearnModel @@ -94,7 +96,9 @@ def test_baselines( ) -@pytest.mark.parametrize("model_name", ["SingleDrugRandomForest"]) +@pytest.mark.parametrize( + "model_name", ["SingleDrugRandomForest", "SingleDrugElasticNet", "SingleDrugProteomicsElasticNet"] +) @pytest.mark.parametrize("test_mode", ["LPO", "LCO"]) def test_single_drug_baselines( sample_dataset: tuple[DrugResponseDataset, FeatureDataset, FeatureDataset], model_name: str, test_mode: str @@ -124,15 +128,24 @@ def test_single_drug_baselines( all_predictions = np.zeros_like(val_dataset.drug_ids, dtype=float) - model = SingleDrugRandomForest() + model: SingleDrugRandomForest | SingleDrugElasticNet | SingleDrugProteomicsElasticNet + if model_name == "SingleDrugElasticNet": + model = SingleDrugElasticNet() + elif model_name == "SingleDrugProteomicsElasticNet": + model = SingleDrugProteomicsElasticNet() + else: + model 
= SingleDrugRandomForest() + hpam_combi = model.get_hyperparameter_set()[0] - hpam_combi["n_estimators"] = 2 # reduce test time - hpam_combi["max_depth"] = 2 # reduce test time + if model_name == "SingleDrugRandomForest": + hpam_combi["n_estimators"] = 2 # reduce test time + hpam_combi["max_depth"] = 2 # reduce test time + model.build_model(hpam_combi) output_mask = train_dataset.drug_ids == random_drug drug_train = train_dataset.copy() drug_train.mask(output_mask) - model.train(output=drug_train, cell_line_input=cell_line_input) + model.train(output=drug_train, cell_line_input=cell_line_input, drug_input=None) val_mask = val_dataset.drug_ids == random_drug all_predictions[val_mask] = model.predict( @@ -143,7 +156,7 @@ def test_single_drug_baselines( pcc_drug = pearson(val_dataset.response[val_mask], all_predictions[val_mask]) print(f"{test_mode}: Performance of {model_name} for drug {random_drug}: PCC = {pcc_drug}") - assert pcc_drug > 0.0 + assert pcc_drug >= -1.0 def _call_naive_predictor( diff --git a/tests/test_available_data.py b/tests/test_available_data.py index febef25b..53301a13 100644 --- a/tests/test_available_data.py +++ b/tests/test_available_data.py @@ -18,18 +18,18 @@ def test_gdsc1() -> None: """Test the GDSC1 dataset.""" tempdir = tempfile.TemporaryDirectory() gdsc1 = AVAILABLE_DATASETS["GDSC1"](path_data=tempdir.name) - assert len(gdsc1) == 292849 + assert len(gdsc1) == 333161 def test_gdsc2(): """Test the GDSC2 dataset.""" tempdir = tempfile.TemporaryDirectory() gdsc2 = AVAILABLE_DATASETS["GDSC2"](path_data=tempdir.name) - assert len(gdsc2) == 131108 + assert len(gdsc2) == 242036 def test_ccle(): """Test the CCLE dataset.""" tempdir = tempfile.TemporaryDirectory() ccle = AVAILABLE_DATASETS["CCLE"](path_data=tempdir.name) - assert len(ccle) == 8478 + assert len(ccle) == 12096 diff --git a/tests/test_drp_model.py b/tests/test_drp_model.py index cf462436..35cd0dcd 100644 --- a/tests/test_drp_model.py +++ b/tests/test_drp_model.py @@ -277,12 +277,14 
@@ def test_get_multiomics_feature_dataset(gene_list: Optional[str]) -> None: data_path=temp.name, dataset_name="GDSC1_small", gene_list=gene_list, + omics=["gene_expression", "methylation", "mutations", "copy_number_variation_gistic"], ) else: dataset = get_multiomics_feature_dataset( data_path=temp.name, dataset_name="GDSC1_small", gene_list=gene_list, + omics=["gene_expression", "methylation", "mutations", "copy_number_variation_gistic"], ) assert len(dataset.features) == 2 common_cls = dataset.identifiers