daisybio · PascalIversen · Feb 12, 2025 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -13,6 +13,6 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Run Labeler
-        uses: crazy-max/ghaction-github-labeler@v5.1.0
+        uses: crazy-max/ghaction-github-labeler@v5.2.0
         with:
           skip-delete: true
diff --git a/docs/conf.py b/docs/conf.py
@@ -56,9 +56,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "1.1.3"
+version = "1.1.4"
 # The full version, including alpha/beta/rc tags.
-release = "1.1.3"
+release = "1.1.4"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -100,6 +100,9 @@ prediction. This is why we also offer the possibility to compare your model to a
 the mean IC50 of all drugs in the training set. We also offer two more advanced naive predictors:
 **NaiveCellLineMeanPredictor** and **NaiveDrugMeanPredictor**. The former predicts the mean IC50 of a cell line in
 the training set and the latter predicts the mean IC50 of a drug in the training set.
+Finally, as the strongest naive baseline, we offer the **NaiveMeanEffectsPredictor**
+which combines the effects of cell lines and drugs.
+It is equivalent to the **NaiveCellLineMeanPredictor** and **NaiveDrugMeanPredictor** for the LDO and LCO settings, respectively.
 
 Available Models
 ------------------
@@ -119,6 +122,8 @@ For ``--models``, you can also perform randomization and robustness tests. The `
 +----------------------------+----------------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | NaiveDrugMeanPredictor     | Baseline Method            | Multi-Drug Model                     | Predicts the mean response of a drug in the training set.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
 +----------------------------+----------------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| NaiveMeanEffectPredictor   | Baseline Method            | Multi-Drug Model                     | Predicts using ANOVA-like mean effect model of cell lines and drugs                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
++----------------------------+----------------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | ElasticNet                 | Baseline Method            | Multi-Drug Model                     | Fits an `Sklearn Elastic Net <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html>`_, `Lasso <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html>`_, or `Ridge <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html>`_ model on gene expression data and drug fingerprints (concatenated input matrix).                                                                                                                                                                                    |
 +----------------------------+----------------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 | GradientBoosting           | Baseline Method            | Multi-Drug Model                     | Fits an `Sklearn Gradient Boosting Regressor <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html>`_ gene expression data and drug fingerprints.                                                                                                                                                                                                                                                                                                                                                                                             |

diff --git a/drevalpy/.DS_Store b/drevalpy/.DS_Store
diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py
@@ -1,5 +1,6 @@
 """Utility functions for datasets."""
 
+import os
 import zipfile
 from pathlib import Path
 from typing import Any
@@ -60,7 +61,7 @@ def download_dataset(
         with zipfile.ZipFile(file_path, "r") as z:
             for member in z.infolist():
                 if not member.filename.startswith("__MACOSX/"):
-                    z.extract(member, data_path)
+                    z.extract(member, os.path.join(data_path, dataset_name))
         file_path.unlink()  # Remove zip file after extraction
 
         print(f"{dataset_name} data downloaded and extracted to {data_path}")

diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py
@@ -94,7 +94,7 @@ def drug_response_experiment(
     if baselines is None:
         baselines = []
     cross_study_datasets = cross_study_datasets or []
-    result_path = os.path.join(path_out, run_id, test_mode)
+    result_path = os.path.join(path_out, run_id, response_data._name, test_mode)
     split_path = os.path.join(result_path, "splits")
     result_folder_exists = os.path.exists(result_path)
     if result_folder_exists and overwrite:
@@ -903,10 +903,11 @@ def train_and_predict(
 
     train_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep)
     prediction_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep)
-    print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, because of missing features")
-    print(
-        f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, because of missing features"
-    )
+    if len(train_dataset) < len_train_before or len(prediction_dataset) < len_pred_before:
+        print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}, due to missing features")
+        print(
+            f"Reduced prediction dataset from {len_pred_before} to {len(prediction_dataset)}, due to missing features"
+        )
 
     if early_stopping_dataset is not None:
         len_es_before = len(early_stopping_dataset)
@@ -1142,8 +1143,7 @@ def make_model_list(models: list[type[DRPModel]], response_data: DrugResponseDat
 
 @pipeline_function
 def get_model_name_and_drug_id(model_name: str) -> tuple[str, str | None]:
-    """
-    Get the model name and drug id from the model name.
+    """Get the model name and drug id from the model name.
 
     :param model_name: model name, e.g., SimpleNeuralNetwork or MOLIR.Afatinib
     :returns: tuple of model name and, potentially drug id if it is a single drug model

diff --git a/drevalpy/models/.DS_Store b/drevalpy/models/.DS_Store
diff --git a/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py b/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py
@@ -94,6 +94,10 @@ def train(
                 "ignore",
                 message=".*does not have many workers which may be a bottleneck.*",
             )
+            warnings.filterwarnings(
+                "ignore",
+                message="Starting from v1\\.9\\.0, `tensorboardX` has been removed.*",
+            )
             self.model.fit(
                 output_train=output,
                 cell_line_input=cell_line_input,

diff --git a/drevalpy/models/SimpleNeuralNetwork/utils.py b/drevalpy/models/SimpleNeuralNetwork/utils.py
@@ -1,5 +1,6 @@
 """Utility functions for the simple neural network models."""
 
+import os
 import secrets
 from typing import Any
 
@@ -229,11 +230,14 @@ def fit(
         monitor = "train_loss" if (val_loader is None) else "val_loss"
 
         early_stop_callback = EarlyStopping(monitor=monitor, mode="min", patience=patience)
-        name = "version-" + "".join(
-            [secrets.choice("0123456789abcdef") for i in range(20)]
-        )  # preventing conflicts of filenames
+
+        unique_subfolder = os.path.join(model_checkpoint_dir, "run_" + secrets.token_hex(8))
+        os.makedirs(unique_subfolder, exist_ok=True)
+
+        # prevent conflicts
+        name = "version-" + "".join([secrets.choice("0123456789abcdef") for _ in range(10)])
         self.checkpoint_callback = pl.callbacks.ModelCheckpoint(
-            dirpath=model_checkpoint_dir,
+            dirpath=unique_subfolder,
             monitor=monitor,
             mode="min",
             save_top_k=1,
@@ -262,7 +266,7 @@ def fit(
 
         # load best model
         if self.checkpoint_callback.best_model_path is not None:
-            checkpoint = torch.load(self.checkpoint_callback.best_model_path)  # noqa: S614
+            checkpoint = torch.load(self.checkpoint_callback.best_model_path, weights_only=True)  # noqa: S614
             self.load_state_dict(checkpoint["state_dict"])
         else:
             print("checkpoint_callback: No best model found, using the last model.")

diff --git a/drevalpy/models/__init__.py b/drevalpy/models/__init__.py
@@ -4,13 +4,16 @@
     "NaivePredictor",
     "NaiveDrugMeanPredictor",
     "NaiveCellLineMeanPredictor",
+    "NaiveMeanEffectsPredictor",
     "ElasticNetModel",
     "RandomForest",
     "SVMRegressor",
     "SimpleNeuralNetwork",
     "MultiOmicsNeuralNetwork",
     "MultiOmicsRandomForest",
     "SingleDrugRandomForest",
+    "SingleDrugElasticNet",
+    "SingleDrugProteomicsElasticNet",
     "SRMF",
     "GradientBoosting",
     "MOLIR",
@@ -22,7 +25,13 @@
 ]
 
 from .baselines.multi_omics_random_forest import MultiOmicsRandomForest
-from .baselines.naive_pred import NaiveCellLineMeanPredictor, NaiveDrugMeanPredictor, NaivePredictor
+from .baselines.naive_pred import (
+    NaiveCellLineMeanPredictor,
+    NaiveDrugMeanPredictor,
+    NaiveMeanEffectsPredictor,
+    NaivePredictor,
+)
+from .baselines.singledrug_elastic_net import SingleDrugElasticNet, SingleDrugProteomicsElasticNet
 from .baselines.singledrug_random_forest import SingleDrugRandomForest
 from .baselines.sklearn_models import ElasticNetModel, GradientBoosting, RandomForest, SVMRegressor
 from .DIPK.dipk import DIPKModel
@@ -38,13 +47,16 @@
     "SingleDrugRandomForest": SingleDrugRandomForest,
     "MOLIR": MOLIR,
     "SuperFELTR": SuperFELTR,
+    "SingleDrugElasticNet": SingleDrugElasticNet,
+    "SingleDrugProteomicsElasticNet": SingleDrugProteomicsElasticNet,
 }
 
 # MULTI_DRUG_MODEL_FACTORY is used in the pipeline!
 MULTI_DRUG_MODEL_FACTORY: dict[str, type[DRPModel]] = {
     "NaivePredictor": NaivePredictor,
     "NaiveDrugMeanPredictor": NaiveDrugMeanPredictor,
     "NaiveCellLineMeanPredictor": NaiveCellLineMeanPredictor,
+    "NaiveMeanEffectsPredictor": NaiveMeanEffectsPredictor,
     "ElasticNet": ElasticNetModel,
     "RandomForest": RandomForest,
     "SVR": SVMRegressor,

diff --git a/drevalpy/models/baselines/hyperparameters.yaml b/drevalpy/models/baselines/hyperparameters.yaml
@@ -1,6 +1,7 @@
 NaivePredictor:
 NaiveDrugMeanPredictor:
 NaiveCellLineMeanPredictor:
+NaiveMeanEffectsPredictor:
 ElasticNet:
   l1_ratio:
     - 0
@@ -100,3 +101,33 @@ GradientBoosting:
     - 1.0
     - 0.8
     - 0.5
+SingleDrugElasticNet:
+  l1_ratio:
+    - 0.2
+    - 0.5
+    - 0.9
+  alpha:
+    - 1
+    - 0.8
+    - 0.6
+    - 0.4
+    - 0.2
+    - 0.1
+    - 5
+    - 10
+    - 100
+SingleDrugProteomicsElasticNet:
+  l1_ratio:
+    - 0.2
+    - 0.5
+    - 0.9
+  alpha:
+    - 1
+    - 0.8
+    - 0.6
+    - 0.4
+    - 0.2
+    - 0.1
+    - 5
+    - 10
+    - 100
diff --git a/drevalpy/models/baselines/naive_pred.py b/drevalpy/models/baselines/naive_pred.py
@@ -4,6 +4,9 @@
 The naive predictor models are simple models that predict the mean of the response values. The NaivePredictor
 predicts the overall mean of the response, the NaiveCellLineMeanPredictor predicts the mean of the response per cell
 line, and the NaiveDrugMeanPredictor predicts the mean of the response per drug.
+The NaiveMeanEffectsPredictor predicts the response as the overall mean plus the cell line effect
+plus the drug effect and should be the strongest naive baseline.
+
 """
 
 import numpy as np
@@ -334,3 +337,145 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase
         :returns: FeatureDataset containing the drug ids
         """
         return load_drug_ids_from_csv(data_path, dataset_name)
+
+
+class NaiveMeanEffectsPredictor(DRPModel):
+    """
+    ANOVA-like predictor model.
+
+    Predicts the response as:
+    response = overall_mean + cell_line_effect + drug_effect.
+
+    Here:
+        - cell_line_effect = (cell line mean - overall_mean)
+        - drug_effect = (drug mean - overall_mean)
+
+    This formulation ensures that the overall mean is not counted twice.
+    """
+
+    cell_line_views = ["cell_line_id"]
+    drug_views = ["drug_id"]
+
+    def __init__(self) -> None:
+        """
+        Initializes the NaiveMeanEffectsPredictor model.
+
+        The overall dataset mean is initialized to None and the cell line and drug effects
+        are initialized to empty dictionaries; all three are filled in by train().
+        """
+        super().__init__()
+        self.dataset_mean = None
+        self.cell_line_effects = {}
+        self.drug_effects = {}
+
+    @classmethod
+    def get_model_name(cls) -> str:
+        """
+        Returns the name of the model.
+
+        :return: The name of the model as a string.
+        """
+        return "NaiveMeanEffectsPredictor"
+
+    def build_model(self, hyperparameters: dict) -> None:
+        """
+        Builds the model.
+
+        This model does not require any hyperparameter tuning.
+
+        :param hyperparameters: Dictionary of hyperparameters (not used).
+        """
+        pass
+
+    def train(
+        self,
+        output: DrugResponseDataset,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+        output_earlystopping: DrugResponseDataset | None = None,
+        model_checkpoint_dir: str = "checkpoints",
+    ) -> None:
+        """
+        Trains with overall mean, cell line effects, and drug effects.
+
+        :param output: Training dataset containing the response output.
+        :param cell_line_input: Feature dataset containing cell line IDs.
+        :param drug_input: Feature dataset containing drug IDs. Must not be None.
+        :param output_earlystopping: Not used.
+        :param model_checkpoint_dir: Not used.
+        :raises ValueError: If drug_input is None.
+        """
+        if drug_input is None:
+            raise ValueError("drug_input (drug_id) is required for NaiveMeanEffectsPredictor.")
+
+        # Compute the overall mean response.
+        self.dataset_mean = np.mean(output.response)
+
+        # Mean response per cell line; ids and their id-features are paired via unique().
+        cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids)
+        cell_line_means = {}
+        for cl_output, cl_feature in zip(unique(output.cell_line_ids), unique(cell_line_ids), strict=True):
+            responses_cl = output.response[cl_feature == output.cell_line_ids]
+            if len(responses_cl) > 0:
+                cell_line_means[cl_output] = np.mean(responses_cl)
+
+        # Mean response per drug; ids and their id-features are paired via unique().
+        drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids)
+        drug_means = {}
+        for drug_output, drug_feature in zip(unique(output.drug_ids), unique(drug_ids), strict=True):
+            responses_drug = output.response[drug_feature == output.drug_ids]
+            if len(responses_drug) > 0:
+                drug_means[drug_output] = np.mean(responses_drug)
+
+        # Compute the effects as deviations from the overall mean.
+        self.cell_line_effects = {cl: (mean - self.dataset_mean) for cl, mean in cell_line_means.items()}
+        self.drug_effects = {drug: (mean - self.dataset_mean) for drug, mean in drug_means.items()}
+
+    def predict(
+        self,
+        cell_line_ids: np.ndarray,
+        drug_ids: np.ndarray,
+        cell_line_input: FeatureDataset,
+        drug_input: FeatureDataset | None = None,
+    ) -> np.ndarray:
+        """
+        Predicts responses for given cell line and drug pairs.
+
+        The prediction is computed as:
+            prediction = overall_mean + cell_line_effect + drug_effect
+
+        If a cell line or drug has not been seen during training, their effect is set to zero.
+
+        :param cell_line_ids: Array of cell line IDs.
+        :param drug_ids: Array of drug IDs.
+        :param cell_line_input: Not used.
+        :param drug_input: Not used.
+        :return: NumPy array of predicted responses.
+        :raises ValueError: If the model has not been trained yet.
+        """
+        if self.dataset_mean is None:
+            raise ValueError("NaiveMeanEffectsPredictor must be trained before calling predict.")
+        cl_eff, drug_eff = self.cell_line_effects, self.drug_effects
+        return np.array(
+            [self.dataset_mean + cl_eff.get(cl, 0) + drug_eff.get(d, 0) for cl, d in zip(cell_line_ids, drug_ids)]
+        )
+
+    def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
+        """
+        Loads the cell line features.
+
+        :param data_path: Path to the data.
+        :param dataset_name: Name of the dataset.
+        :return: FeatureDataset containing the cell line IDs.
+        """
+        return load_cl_ids_from_csv(data_path, dataset_name)
+
+    def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
+        """
+        Loads the drug features.
+
+        :param data_path: Path to the data.
+        :param dataset_name: Name of the dataset.
+        :return: FeatureDataset containing the drug IDs.
+        """
+        return load_drug_ids_from_csv(data_path, dataset_name)