Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5959cb8
cell line and drug names now as FINAL
PascalIversen Feb 13, 2025
8a16b5d
ctrv12 loaders
PascalIversen Feb 13, 2025
d5e4c02
l
PascalIversen Feb 13, 2025
edceb51
str
PascalIversen Feb 20, 2025
08c384b
Empty commit for testing or triggering CI/CD
PascalIversen Feb 20, 2025
1d7850e
fixed tests for data
JudithBernett Feb 20, 2025
d0eeddd
loader back to other dezipping
PascalIversen Feb 20, 2025
0d0ee7d
merge
PascalIversen Feb 20, 2025
f64ef55
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
afe05aa
fixing tests
PascalIversen Feb 21, 2025
4cbcd7b
fixing tests 2
PascalIversen Feb 21, 2025
f3aa3e8
fixed unused imports
JudithBernett Feb 21, 2025
f3756b2
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
b389eb4
removing nan responses
JudithBernett Feb 21, 2025
e856285
fix double import
PascalIversen Feb 21, 2025
947aa4d
fixing rest
JudithBernett Feb 21, 2025
ed1b5e1
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
fdcd917
suppressed multirow warnings after first row and max_epochs for SNN for…
PascalIversen Feb 21, 2025
c1307e6
merge
PascalIversen Feb 21, 2025
934c4ba
fixed individual tests
JudithBernett Feb 21, 2025
39515f9
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
9413eaf
mypy trainer_params
PascalIversen Feb 21, 2025
e61a81f
iterate features and load drug fingerprints tests were wrong
JudithBernett Feb 21, 2025
eb20386
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
a3b183e
mypy fix?
JudithBernett Feb 21, 2025
285af01
mypy fix no 999999999
PascalIversen Feb 21, 2025
5222521
merge
PascalIversen Feb 21, 2025
e9b76e5
increase timeout
PascalIversen Feb 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ data/GDSC1
data/GDSC2
data/CCLE
data/Toy_Data
data/CTRPv1
data/CTRPv2

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
8 changes: 5 additions & 3 deletions drevalpy/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def from_csv(
- response: the drug response values as floating point values
- cell_line_ids: a string identifier for cell lines
- drug_ids: a string identifier for drugs
- predictions: an optional column containing a predicted value TODO what exactly?
- predictions: an optional column containing drug response predictions

:param input_file: Path to the csv file containing the data to be loaded
:param dataset_name: Optional name to associate the dataset with, default = "unknown"
Expand All @@ -64,6 +64,8 @@ def from_csv(
:returns: DrugResponseDataset object containing data from provided csv file.
"""
data = pd.read_csv(input_file)
data["drug_id"] = data["drug_id"].astype(str)

if "predictions" in data.columns:
predictions = data["predictions"].values
else:
Expand Down Expand Up @@ -152,9 +154,9 @@ def __init__(
"""
super().__init__()
if len(response) != len(cell_line_ids):
raise AssertionError("Response and cell_line_ids have different lengths.")
raise AssertionError("Response and cell line identifiers have different lengths.")
if len(response) != len(drug_ids):
raise AssertionError("Response and drug_ids have different lengths.")
raise AssertionError("Response and drug identifiers have different lengths.")
if predictions is not None and len(response) != len(predictions):
raise AssertionError("Response and predictions have different lengths.")
self._response = response
Expand Down
85 changes: 67 additions & 18 deletions drevalpy/datasets/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from ..pipeline_function import pipeline_function
from .curvecurator import fit_curves
from .dataset import DrugResponseDataset
from .utils import download_dataset
from .utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, download_dataset


def load_gdsc1(
path_data: str = "data",
measure: str = "LN_IC50",
file_name: str = "response_GDSC1.csv",
measure: str = "LN_IC50_curvecurator",
file_name: str = "GDSC1.csv",
dataset_name: str = "GDSC1",
) -> DrugResponseDataset:
"""
Expand All @@ -32,18 +32,18 @@ def load_gdsc1(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_GDSC2.csv"):
def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "GDSC2.csv"):
"""
Loads the GDSC2 dataset.

Expand All @@ -57,7 +57,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str


def load_ccle(
path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_CCLE.csv"
path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "CCLE.csv"
) -> DrugResponseDataset:
"""
Loads the CCLE dataset.
Expand All @@ -73,18 +73,18 @@ def load_ccle(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponseDataset:
def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
"""
Loads small Toy dataset, subsampled from GDSC1.

Expand All @@ -94,20 +94,67 @@ def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponse
:return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
"""
dataset_name = "Toy_Data"
measure = "response" # overwrite this explicitly to avoid problems, should be changed in the future
path = os.path.join(path_data, dataset_name, "toy_data.csv")
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)
response_data = pd.read_csv(path)
response_data = pd.read_csv(path, dtype={"pubchem_id": str})

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["cell_line_id"].values,
drug_ids=response_data["drug_id"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load a CTRP (Cancer Therapeutics Response Portal) dataset of the given version.

    Downloads and extracts the dataset into ``path_data`` first if it is not already present.

    :param version: The version of the CTRP dataset to load, e.g. "1" or "2"
    :param path_data: Path to the directory that contains (or will contain) the CTRPv{version} folder
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    dataset_name = "CTRPv" + version
    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
    if not os.path.exists(path):
        download_dataset(dataset_name, path_data, redownload=True)
    # Read pubchem_id as str so drug identifiers are consistent across all dataset loaders.
    response_data = pd.read_csv(path, dtype={"pubchem_id": str})

    return DrugResponseDataset(
        response=response_data[measure].values,
        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
        drug_ids=response_data[DRUG_IDENTIFIER].values,
        dataset_name=dataset_name,
    )


def load_ctrpv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load the CTRPv1 dataset.

    :param path_data: Path to location of CTRPv1 dataset
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    return _load_ctrpv("1", path_data, measure)


def load_ctrpv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load CTRPv2 dataset.

    :param path_data: Path to location of CTRPv2 dataset
    :param measure: The name of the column containing the measure to predict, default: LN_IC50_curvecurator

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    return _load_ctrpv(version="2", path_data=path_data, measure=measure)


def load_custom(path_data: str | Path, measure: str = "response") -> DrugResponseDataset:
"""
Load custom dataset.
Expand All @@ -125,6 +172,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
"GDSC2": load_gdsc2,
"CCLE": load_ccle,
"Toy_Data": load_toy,
"CTRPv1": load_ctrpv1,
"CTRPv2": load_ctrpv2,
}


Expand Down
13 changes: 8 additions & 5 deletions drevalpy/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import numpy as np
import requests

DRUG_IDENTIFIER = "pubchem_id"
CELL_LINE_IDENTIFIER = "cell_line_name"


def download_dataset(
dataset_name: str,
Expand All @@ -26,18 +29,18 @@ def download_dataset(
file_name = f"{dataset_name}.zip"
file_path = Path(data_path) / file_name
extracted_folder_path = file_path.with_suffix("")

timeout = 120
# Check if the extracted data exists and skip download if not redownloading
if extracted_folder_path.exists() and not redownload:
print(f"{dataset_name} is already extracted, skipping download.")
else:
url = "https://zenodo.org/doi/10.5281/zenodo.12633909"
# Fetch the latest record
response = requests.get(url, timeout=60)
response = requests.get(url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
latest_url = response.links["linkset"]["url"]
response = requests.get(latest_url, timeout=60)
response = requests.get(latest_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
data = response.json()
Expand All @@ -50,7 +53,7 @@ def download_dataset(
file_url = name_to_url[file_name]
# Download the file
print(f"Downloading {dataset_name} from {file_url}...")
response = requests.get(file_url, timeout=60)
response = requests.get(file_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: " f"{response.status_code}")

Expand All @@ -61,7 +64,7 @@ def download_dataset(
with zipfile.ZipFile(file_path, "r") as z:
for member in z.infolist():
if not member.filename.startswith("__MACOSX/"):
z.extract(member, os.path.join(data_path, dataset_name))
z.extract(member, os.path.join(data_path))
file_path.unlink() # Remove zip file after extraction

print(f"{dataset_name} data downloaded and extracted to {data_path}")
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ SimpleNeuralNetwork:
- 128
- 64
- 16
max_epochs:
- 100

MultiOmicsNeuralNetwork:
dropout_prob:
Expand All @@ -44,3 +46,5 @@ MultiOmicsNeuralNetwork:
- 32
methylation_pca_components:
- 100
max_epochs:
- 100
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1,
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1 if platform.system() == "Windows" else 8,
Expand Down
11 changes: 5 additions & 6 deletions drevalpy/models/SimpleNeuralNetwork/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,16 +180,15 @@ def fit(
:param model_checkpoint_dir: directory to save the model checkpoints
:raises ValueError: if drug_input is missing
"""
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

if trainer_params is None:
trainer_params = {
"max_epochs": 100,
"progress_bar_refresh_rate": 500,
"max_epochs": 70,
}
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

train_dataset = RegressionDataset(
output=output_train,
Expand Down
2 changes: 1 addition & 1 deletion drevalpy/models/baselines/hyperparameters.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
NaivePredictor:
NaiveDrugMeanPredictor:
NaiveCellLineMeanPredictor:
NaiveANOVAPredictor:
NaiveMeanEffectsPredictor:
ElasticNet:
l1_ratio:
- 0
Expand Down
25 changes: 13 additions & 12 deletions drevalpy/models/baselines/naive_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
import numpy as np

from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset
from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER
from drevalpy.models.drp_model import DRPModel
from drevalpy.models.utils import load_cl_ids_from_csv, load_drug_ids_from_csv, unique


class NaivePredictor(DRPModel):
"""Naive predictor model that predicts the overall mean of the response."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -109,8 +110,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase
class NaiveDrugMeanPredictor(DRPModel):
"""Naive predictor model that predicts the mean of the response per drug."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -159,7 +160,7 @@ def train(
"""
if drug_input is None:
raise ValueError("drug_input (drug_id) is required for the NaiveDrugMeanPredictor.")
drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids)
drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids)
self.dataset_mean = np.mean(output.response)
self.drug_means = {}

Expand Down Expand Up @@ -226,8 +227,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase
class NaiveCellLineMeanPredictor(DRPModel):
"""Naive predictor model that predicts the mean of the response per cell line."""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -274,7 +275,7 @@ def train(
:param output_earlystopping: not needed
:param model_checkpoint_dir: not needed
"""
cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids)
cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids)
self.dataset_mean = np.mean(output.response)
self.cell_line_means = {}

Expand Down Expand Up @@ -353,8 +354,8 @@ class NaiveMeanEffectsPredictor(DRPModel):
This formulation ensures that the overall mean is not counted twice.
"""

cell_line_views = ["cell_line_id"]
drug_views = ["drug_id"]
cell_line_views = [CELL_LINE_IDENTIFIER]
drug_views = [DRUG_IDENTIFIER]

def __init__(self):
"""
Expand Down Expand Up @@ -412,15 +413,15 @@ def train(
self.dataset_mean = np.mean(output.response)

# Obtain cell line features.
cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids)
cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids)
cell_line_means = {}
for cl_output, cl_feature in zip(unique(output.cell_line_ids), unique(cell_line_ids), strict=True):
responses_cl = output.response[cl_feature == output.cell_line_ids]
if len(responses_cl) > 0:
cell_line_means[cl_output] = np.mean(responses_cl)

# Obtain drug features.
drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids)
drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids)
drug_means = {}
for drug_output, drug_feature in zip(unique(output.drug_ids), unique(drug_ids), strict=True):
responses_drug = output.response[drug_feature == output.drug_ids]
Expand Down
Loading
Loading