Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5959cb8
cell line and drug names now as FINAL
PascalIversen Feb 13, 2025
8a16b5d
ctrv12 loaders
PascalIversen Feb 13, 2025
d5e4c02
l
PascalIversen Feb 13, 2025
edceb51
str
PascalIversen Feb 20, 2025
08c384b
Empty commit for testing or triggering CI/CD
PascalIversen Feb 20, 2025
1d7850e
fixed tests for data
JudithBernett Feb 20, 2025
d0eeddd
loader back to other dezipping
PascalIversen Feb 20, 2025
0d0ee7d
merge
PascalIversen Feb 20, 2025
f64ef55
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
afe05aa
fixing tests
PascalIversen Feb 21, 2025
4cbcd7b
fixing tests 2
PascalIversen Feb 21, 2025
f3aa3e8
fixed unused imports
JudithBernett Feb 21, 2025
f3756b2
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
b389eb4
removing nan responses
JudithBernett Feb 21, 2025
e856285
fix double import
PascalIversen Feb 21, 2025
947aa4d
fixing rest
JudithBernett Feb 21, 2025
ed1b5e1
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
fdcd917
suppressed multirow warnings after first row and max_epochs for SNN for…
PascalIversen Feb 21, 2025
c1307e6
merge
PascalIversen Feb 21, 2025
934c4ba
fixed individual tests
JudithBernett Feb 21, 2025
39515f9
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
9413eaf
mypy trainer_params
PascalIversen Feb 21, 2025
e61a81f
iterate features and load drug fingerprints tests were wrong
JudithBernett Feb 21, 2025
eb20386
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
a3b183e
mypy fix?
JudithBernett Feb 21, 2025
285af01
mypy fix no 999999999
PascalIversen Feb 21, 2025
5222521
merge
PascalIversen Feb 21, 2025
e9b76e5
increase timeout
PascalIversen Feb 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ data/GDSC1
data/GDSC2
data/CCLE
data/Toy_Data
data/CTRPv1
data/CTRPv2

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
8 changes: 5 additions & 3 deletions drevalpy/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def from_csv(
- response: the drug response values as floating point values
- cell_line_ids: a string identifier for cell lines
- drug_ids: a string identifier for drugs
- predictions: an optional column containing a predicted value TODO what exactly?
- predictions: an optional column containing drug response predictions

:param input_file: Path to the csv file containing the data to be loaded
:param dataset_name: Optional name to associate the dataset with, default = "unknown"
Expand All @@ -64,6 +64,8 @@ def from_csv(
:returns: DrugResponseDataset object containing data from provided csv file.
"""
data = pd.read_csv(input_file)
data["drug_id"] = data["drug_id"].astype(str)

if "predictions" in data.columns:
predictions = data["predictions"].values
else:
Expand Down Expand Up @@ -152,9 +154,9 @@ def __init__(
"""
super().__init__()
if len(response) != len(cell_line_ids):
raise AssertionError("Response and cell_line_ids have different lengths.")
raise AssertionError("Response and cell line identifiers have different lengths.")
if len(response) != len(drug_ids):
raise AssertionError("Response and drug_ids have different lengths.")
raise AssertionError("Response and drug identifiers have different lengths.")
if predictions is not None and len(response) != len(predictions):
raise AssertionError("Response and predictions have different lengths.")
self._response = response
Expand Down
85 changes: 67 additions & 18 deletions drevalpy/datasets/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from ..pipeline_function import pipeline_function
from .curvecurator import fit_curves
from .dataset import DrugResponseDataset
from .utils import download_dataset
from .utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, download_dataset


def load_gdsc1(
path_data: str = "data",
measure: str = "LN_IC50",
file_name: str = "response_GDSC1.csv",
measure: str = "LN_IC50_curvecurator",
file_name: str = "GDSC1.csv",
dataset_name: str = "GDSC1",
) -> DrugResponseDataset:
"""
Expand All @@ -32,18 +32,18 @@ def load_gdsc1(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_GDSC2.csv"):
def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "GDSC2.csv"):
"""
Loads the GDSC2 dataset.

Expand All @@ -57,7 +57,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str


def load_ccle(
path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_CCLE.csv"
path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "CCLE.csv"
) -> DrugResponseDataset:
"""
Loads the CCLE dataset.
Expand All @@ -73,18 +73,18 @@ def load_ccle(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponseDataset:
def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
"""
Loads small Toy dataset, subsampled from GDSC1.

Expand All @@ -94,20 +94,67 @@ def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponse
:return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
"""
dataset_name = "Toy_Data"
measure = "response" # overwrite this explicitly to avoid problems, should be changed in the future
path = os.path.join(path_data, dataset_name, "toy_data.csv")
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)
response_data = pd.read_csv(path)
response_data = pd.read_csv(path, dtype={"pubchem_id": str})

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["cell_line_id"].values,
drug_ids=response_data["drug_id"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load a CTRP (Cancer Therapeutics Response Portal) dataset of the given version.

    Downloads and extracts the dataset into ``path_data`` first if it is not already present.

    :param version: The version of the CTRP dataset to load, e.g. "1" or "2"
    :param path_data: Path to the directory that contains (or will contain) the CTRPv{version} folder
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    dataset_name = "CTRPv" + version
    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
    if not os.path.exists(path):
        download_dataset(dataset_name, path_data, redownload=True)
    # Read pubchem_id as str so drug identifiers are consistent across all dataset loaders.
    response_data = pd.read_csv(path, dtype={"pubchem_id": str})

    return DrugResponseDataset(
        response=response_data[measure].values,
        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
        drug_ids=response_data[DRUG_IDENTIFIER].values,
        dataset_name=dataset_name,
    )


def load_ctrpv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load the CTRPv1 dataset.

    :param path_data: Path to location of CTRPv1 dataset
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    return _load_ctrpv("1", path_data, measure)


def load_ctrpv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load CTRPv2 dataset.

    :param path_data: Path to location of CTRPv2 dataset
    :param measure: The name of the column containing the measure to predict, default: LN_IC50_curvecurator

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    return _load_ctrpv(version="2", path_data=path_data, measure=measure)


def load_custom(path_data: str | Path, measure: str = "response") -> DrugResponseDataset:
"""
Load custom dataset.
Expand All @@ -125,6 +172,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
"GDSC2": load_gdsc2,
"CCLE": load_ccle,
"Toy_Data": load_toy,
"CTRPv1": load_ctrpv1,
"CTRPv2": load_ctrpv2,
}


Expand Down
13 changes: 8 additions & 5 deletions drevalpy/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import numpy as np
import requests

DRUG_IDENTIFIER = "pubchem_id"
CELL_LINE_IDENTIFIER = "cell_line_name"


def download_dataset(
dataset_name: str,
Expand All @@ -26,18 +29,18 @@ def download_dataset(
file_name = f"{dataset_name}.zip"
file_path = Path(data_path) / file_name
extracted_folder_path = file_path.with_suffix("")

timeout = 120
# Check if the extracted data exists and skip download if not redownloading
if extracted_folder_path.exists() and not redownload:
print(f"{dataset_name} is already extracted, skipping download.")
else:
url = "https://zenodo.org/doi/10.5281/zenodo.12633909"
# Fetch the latest record
response = requests.get(url, timeout=60)
response = requests.get(url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
latest_url = response.links["linkset"]["url"]
response = requests.get(latest_url, timeout=60)
response = requests.get(latest_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
data = response.json()
Expand All @@ -50,7 +53,7 @@ def download_dataset(
file_url = name_to_url[file_name]
# Download the file
print(f"Downloading {dataset_name} from {file_url}...")
response = requests.get(file_url, timeout=60)
response = requests.get(file_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: " f"{response.status_code}")

Expand All @@ -61,7 +64,7 @@ def download_dataset(
with zipfile.ZipFile(file_path, "r") as z:
for member in z.infolist():
if not member.filename.startswith("__MACOSX/"):
z.extract(member, os.path.join(data_path, dataset_name))
z.extract(member, os.path.join(data_path))
file_path.unlink() # Remove zip file after extraction

print(f"{dataset_name} data downloaded and extracted to {data_path}")
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ SimpleNeuralNetwork:
- 128
- 64
- 16
max_epochs:
- 100

MultiOmicsNeuralNetwork:
dropout_prob:
Expand All @@ -44,3 +46,5 @@ MultiOmicsNeuralNetwork:
- 32
methylation_pca_components:
- 100
max_epochs:
- 100
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1,
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1 if platform.system() == "Windows" else 8,
Expand Down
11 changes: 5 additions & 6 deletions drevalpy/models/SimpleNeuralNetwork/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,16 +180,15 @@ def fit(
:param model_checkpoint_dir: directory to save the model checkpoints
:raises ValueError: if drug_input is missing
"""
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

if trainer_params is None:
trainer_params = {
"max_epochs": 100,
"progress_bar_refresh_rate": 500,
"max_epochs": 70,
}
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

train_dataset = RegressionDataset(
output=output_train,
Expand Down
2 changes: 1 addition & 1 deletion drevalpy/models/baselines/hyperparameters.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
NaivePredictor:
NaiveDrugMeanPredictor:
NaiveCellLineMeanPredictor:
NaiveANOVAPredictor:
NaiveMeanEffectsPredictor:
ElasticNet:
l1_ratio:
- 0
Expand Down
25 changes: 13 additions & 12 deletions drevalpy/models/baselines/naive_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
import numpy as np

from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset
from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER
from drevalpy.models.drp_model import DRPModel
from drevalpy.models.utils import load_cl_ids_from_csv, load_drug_ids_from_csv, unique


class NaivePredictor(DRPModel):
"""Naive predictor model that predicts the overall mean of the response."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -109,8 +110,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase
class NaiveDrugMeanPredictor(DRPModel):
"""Naive predictor model that predicts the mean of the response per drug."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -159,7 +160,7 @@ def train(
"""
if drug_input is None:
raise ValueError("drug_input (drug_id) is required for the NaiveDrugMeanPredictor.")
drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids)
drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids)
self.dataset_mean = np.mean(output.response)
self.drug_means = {}

Expand Down Expand Up @@ -226,8 +227,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase
class NaiveCellLineMeanPredictor(DRPModel):
"""Naive predictor model that predicts the mean of the response per cell line."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -274,7 +275,7 @@ def train(
:param output_earlystopping: not needed
:param model_checkpoint_dir: not needed
"""
cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids)
cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids)
self.dataset_mean = np.mean(output.response)
self.cell_line_means = {}

Expand Down Expand Up @@ -353,8 +354,8 @@ class NaiveMeanEffectsPredictor(DRPModel):
This formulation ensures that the overall mean is not counted twice.
"""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -412,15 +413,15 @@ def train(
self.dataset_mean = np.mean(output.response)

# Obtain cell line features.
cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids)
cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids)
cell_line_means = {}
for cl_output, cl_feature in zip(unique(output.cell_line_ids), unique(cell_line_ids), strict=True):
responses_cl = output.response[cl_feature == output.cell_line_ids]
if len(responses_cl) > 0:
cell_line_means[cl_output] = np.mean(responses_cl)

# Obtain drug features.
drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids)
drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids)
drug_means = {}
for drug_output, drug_feature in zip(unique(output.drug_ids), unique(drug_ids), strict=True):
responses_drug = output.response[drug_feature == output.drug_ids]
Expand Down
Loading
Loading