daisybio · PascalIversen · Feb 11, 2025 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/drevalpy/.DS_Store b/drevalpy/.DS_Store
diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py
@@ -1,5 +1,6 @@
 """Utility functions for datasets."""
 
+import os
 import zipfile
 from pathlib import Path
 from typing import Any
@@ -60,7 +61,7 @@ def download_dataset(
         with zipfile.ZipFile(file_path, "r") as z:
             for member in z.infolist():
                 if not member.filename.startswith("__MACOSX/"):
-                    z.extract(member, data_path)
+                    z.extract(member, os.path.join(data_path, dataset_name))
         file_path.unlink()  # Remove zip file after extraction
 
         print(f"{dataset_name} data downloaded and extracted to {data_path}")

diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py
@@ -903,10 +903,11 @@ def train_and_predict(
 
     train_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep)
     prediction_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep)
-    print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, because of missing features")
-    print(
-        f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, because of missing features"
-    )
+    if len(train_dataset) < len_train_before or len(prediction_dataset) < len_pred_before:
+        print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, due to missing features")
+        print(
+            f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, due to missing features"
+        )
 
     if early_stopping_dataset is not None:
         len_es_before = len(early_stopping_dataset)
@@ -1142,8 +1143,7 @@ def make_model_list(models: list[type[DRPModel]], response_data: DrugResponseDat
 
 @pipeline_function
 def get_model_name_and_drug_id(model_name: str) -> tuple[str, str | None]:
-    """
-    Get the model name and drug id from the model name.
+    """Get the model name and drug id from the model name.
 
     :param model_name: model name, e.g., SimpleNeuralNetwork or MOLIR.Afatinib
     :returns: tuple of model name and, potentially drug id if it is a single drug model

diff --git a/drevalpy/models/.DS_Store b/drevalpy/models/.DS_Store
diff --git a/drevalpy/models/__init__.py b/drevalpy/models/__init__.py
@@ -12,6 +12,8 @@
     "MultiOmicsNeuralNetwork",
     "MultiOmicsRandomForest",
     "SingleDrugRandomForest",
+    "SingleDrugElasticNet",
+    "SingleDrugProteomicsElasticNet",
     "SRMF",
     "GradientBoosting",
     "MOLIR",
@@ -29,6 +31,7 @@
     NaiveMeanEffectsPredictor,
     NaivePredictor,
 )
+from .baselines.singledrug_elastic_net import SingleDrugElasticNet, SingleDrugProteomicsElasticNet
 from .baselines.singledrug_random_forest import SingleDrugRandomForest
 from .baselines.sklearn_models import ElasticNetModel, GradientBoosting, RandomForest, SVMRegressor
 from .DIPK.dipk import DIPKModel
@@ -44,6 +47,8 @@
     "SingleDrugRandomForest": SingleDrugRandomForest,
     "MOLIR": MOLIR,
     "SuperFELTR": SuperFELTR,
+    "SingleDrugElasticNet": SingleDrugElasticNet,
+    "SingleDrugProteomicsElasticNet": SingleDrugProteomicsElasticNet,
 }
 
 # MULTI_DRUG_MODEL_FACTORY is used in the pipeline!

diff --git a/drevalpy/models/baselines/hyperparameters.yaml b/drevalpy/models/baselines/hyperparameters.yaml
@@ -101,3 +101,33 @@ GradientBoosting:
     - 1.0
     - 0.8
     - 0.5
+SingleDrugElasticNet:
+  l1_ratio:
+    - 0.2
+    - 0.5
+    - 0.9
+  alpha:
+    - 1
+    - 0.8
+    - 0.6
+    - 0.4
+    - 0.2
+    - 0.1
+    - 5
+    - 10
+    - 100
+SingleDrugProteomicsElasticNet:
+  l1_ratio:
+    - 0.2
+    - 0.5
+    - 0.9
+  alpha:
+    - 1
+    - 0.8
+    - 0.6
+    - 0.4
+    - 0.2
+    - 0.1
+    - 5
+    - 10
+    - 100
diff --git a/drevalpy/models/baselines/singledrug_elastic_net.py b/drevalpy/models/baselines/singledrug_elastic_net.py
@@ -0,0 +1,221 @@
+"""SingleDrugElasticNet and SingleDrugProteomicsElasticNet classes. Fit an Elastic net for each drug separately."""
+
+import numpy as np
+from sklearn.linear_model import ElasticNet
+
+from ...datasets.dataset import DrugResponseDataset, FeatureDataset
+from ..utils import load_and_reduce_gene_features
+from .sklearn_models import SklearnModel
+
+
+class SingleDrugElasticNet(SklearnModel):
+    """Elastic net baseline that is fitted on cell line features only (single-drug model)."""
+
+    is_single_drug_model = True
+    drug_views = []
+    cell_line_views = ["gene_expression"]
+    early_stopping = False
+
+    def build_model(self, hyperparameters):
+        """
+        Builds the model from hyperparameters.
+
+        :param hyperparameters: keyword arguments passed on to sklearn's ElasticNet
+        """
+        self.model = ElasticNet(**hyperparameters)
+
+    @classmethod
+    def get_model_name(cls) -> str:
+        """
+        Returns the model name.
+
+        :returns: SingleDrugElasticNet
+        """
+        return "SingleDrugElasticNet"
+
+    def train(
+        self,
+        output: DrugResponseDataset,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+        output_earlystopping: DrugResponseDataset | None = None,
+        model_checkpoint_dir: str = "checkpoints",
+    ) -> None:
+        """
+        Trains the model on the cell line features; if there is no training data, the model is set to None.
+
+        :param output: training dataset containing the response output
+        :param cell_line_input: training dataset containing the features of the cell line view
+        :param drug_input: not needed
+        :param output_earlystopping: not needed
+        :param model_checkpoint_dir: not needed as checkpoints are not saved
+        :raises ValueError: if drug_input is not None
+        """
+        if drug_input is not None:
+            raise ValueError("SingleDrugElasticNet does not support drug_input!")
+
+        if len(output) > 0:
+            x = self.get_concatenated_features(
+                cell_line_view=self.cell_line_views[0],  # the single view declared on the class
+                drug_view=None,
+                cell_line_ids_output=output.cell_line_ids,
+                drug_ids_output=output.drug_ids,
+                cell_line_input=cell_line_input,
+                drug_input=None,
+            )
+            self.model.fit(x, output.response)
+        else:
+            print("No training data provided, will predict NA.")
+            self.model = None  # predict() checks for None and then returns NaN for every sample
+
+    def predict(
+        self,
+        cell_line_ids: np.ndarray,
+        drug_ids: np.ndarray,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+    ) -> np.ndarray:
+        """
+        Predicts the drug response for the given cell lines; predicts NaN if the model is None.
+
+        :param cell_line_ids: cell line ids
+        :param drug_ids: drug ids, only forwarded to the feature concatenation
+        :param cell_line_input: cell line input
+        :param drug_input: drug input, not needed here
+        :returns: predicted drug response
+        :raises ValueError: if drug_input is not None
+        """
+        if drug_input is not None:
+            raise ValueError("drug_input is not needed.")
+
+        if self.model is None:
+            print("No training data was available, predicting NA.")
+            return np.full(len(cell_line_ids), np.nan)
+        x = self.get_concatenated_features(
+            cell_line_view=self.cell_line_views[0],  # same view as in train()
+            drug_view=None,
+            cell_line_ids_output=cell_line_ids,
+            drug_ids_output=drug_ids,
+            cell_line_input=cell_line_input,
+            drug_input=None,
+        )
+        return self.model.predict(x)
+
+    def load_drug_features(self, data_path, dataset_name):
+        """
+        Load drug features. Not needed for SingleDrugElasticNet.
+
+        :param data_path: path to the data
+        :param dataset_name: name of the dataset
+        :returns: None
+        """
+        return None
+
+
+class SingleDrugProteomicsElasticNet(SingleDrugElasticNet):
+    """SingleDrugElasticNet variant that uses the proteomics cell line view."""
+
+    cell_line_views = ["proteomics"]
+    is_single_drug_model = True
+
+    @classmethod
+    def get_model_name(cls) -> str:
+        """
+        Returns the model name.
+
+        :returns: SingleDrugProteomicsElasticNet
+        """
+        return "SingleDrugProteomicsElasticNet"
+
+    def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
+        """
+        Loads the proteomics data.
+
+        :param data_path: path to the data
+        :param dataset_name: name of the dataset
+        :returns: proteomics data
+        """
+        return load_and_reduce_gene_features(
+            feature_type="proteomics",
+            gene_list=None,
+            data_path=data_path,
+            dataset_name=dataset_name,
+        )
+
+    def load_drug_features(self, data_path, dataset_name):
+        """
+        Load drug features. Not needed for SingleDrugProteomicsElasticNet.
+
+        :param data_path: path to the data
+        :param dataset_name: name of the dataset
+        :returns: None
+        """
+        return None
+
+    def train(
+        self,
+        output: DrugResponseDataset,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+        output_earlystopping: DrugResponseDataset | None = None,
+        model_checkpoint_dir: str = "checkpoints",
+    ) -> None:
+        """
+        Trains the model on the proteomics features; if there is no training data, the model is set to None.
+
+        :param output: training dataset containing the response output
+        :param cell_line_input: training dataset containing proteomics data
+        :param drug_input: not needed
+        :param output_earlystopping: not needed
+        :param model_checkpoint_dir: not needed as checkpoints are not saved
+        :raises ValueError: if drug_input is not None
+        """
+        if drug_input is not None:
+            raise ValueError("SingleDrugProteomicsElasticNet does not support drug_input!")
+
+        if len(output) > 0:
+            x = self.get_concatenated_features(
+                cell_line_view=self.cell_line_views[0],  # "proteomics", as declared on this class
+                drug_view=None,
+                cell_line_ids_output=output.cell_line_ids,
+                drug_ids_output=output.drug_ids,
+                cell_line_input=cell_line_input,
+                drug_input=None,
+            )
+            self.model.fit(x, output.response)
+        else:
+            print("No training data provided, will predict NA.")
+            self.model = None  # predict() checks for None and then returns NaN for every sample
+
+    def predict(
+        self,
+        cell_line_ids: np.ndarray,
+        drug_ids: np.ndarray,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+    ) -> np.ndarray:
+        """
+        Predicts the drug response for the given cell lines; predicts NaN if the model is None.
+
+        :param cell_line_ids: cell line ids
+        :param drug_ids: drug ids, only forwarded to the feature concatenation
+        :param cell_line_input: cell line input
+        :param drug_input: drug input, not needed here
+        :returns: predicted drug response
+        :raises ValueError: if drug_input is not None
+        """
+        if drug_input is not None:
+            raise ValueError("drug_input is not needed.")
+
+        if self.model is None:
+            print("No training data was available, predicting NA.")
+            return np.full(len(cell_line_ids), np.nan)
+        x = self.get_concatenated_features(
+            cell_line_view=self.cell_line_views[0],  # same view as in train()
+            drug_view=None,
+            cell_line_ids_output=cell_line_ids,
+            drug_ids_output=drug_ids,
+            cell_line_input=cell_line_input,
+            drug_input=None,
+        )
+        return self.model.predict(x)
diff --git a/drevalpy/models/baselines/singledrug_random_forest.py b/drevalpy/models/baselines/singledrug_random_forest.py
@@ -94,3 +94,13 @@ def predict(
             drug_input=None,
         )
         return self.model.predict(x)
+
+    def load_drug_features(self, data_path, dataset_name):
+        """
+        Skip loading drug features, which SingleDrugRandomForest does not need.
+
+        :param data_path: path to the data, not used here
+        :param dataset_name: name of the dataset, not used here
+        :returns: None
+        """
+        return None
diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py
@@ -151,7 +151,7 @@ def get_multiomics_feature_dataset(
     :raises ValueError: if no omics features are found
     """
     if omics is None:
-        omics = ["gene_expression", "methylation", "mutations", "copy_number_variation_gistic"]
+        omics = ["gene_expression", "methylation", "mutations", "copy_number_variation_gistic", "proteomics"]
     feature_dataset = None
     for omic in omics:
         if feature_dataset is None: