Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5959cb8
cell line and drug names now as FINAL
PascalIversen Feb 13, 2025
8a16b5d
ctrv12 loaders
PascalIversen Feb 13, 2025
d5e4c02
l
PascalIversen Feb 13, 2025
edceb51
str
PascalIversen Feb 20, 2025
08c384b
Empty commit for testing or triggering CI/CD
PascalIversen Feb 20, 2025
1d7850e
fixed tests for data
JudithBernett Feb 20, 2025
d0eeddd
loader back to other dezipping
PascalIversen Feb 20, 2025
0d0ee7d
merge
PascalIversen Feb 20, 2025
f64ef55
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
afe05aa
fixing tests
PascalIversen Feb 21, 2025
4cbcd7b
fixing tests 2
PascalIversen Feb 21, 2025
f3aa3e8
fixed unused imports
JudithBernett Feb 21, 2025
f3756b2
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
b389eb4
removing nan responses
JudithBernett Feb 21, 2025
e856285
fix double import
PascalIversen Feb 21, 2025
947aa4d
fixing rest
JudithBernett Feb 21, 2025
ed1b5e1
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
fdcd917
suppressed multirow warnings after first row and max_epochs for SNN for…
PascalIversen Feb 21, 2025
c1307e6
merge
PascalIversen Feb 21, 2025
934c4ba
fixed individual tests
JudithBernett Feb 21, 2025
39515f9
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
9413eaf
mypy trainer_params
PascalIversen Feb 21, 2025
e61a81f
iterate features and load drug fingerprints tests were wrong
JudithBernett Feb 21, 2025
eb20386
Merge branch 'loaders' of github.com:daisybio/drevalpy into loaders
JudithBernett Feb 21, 2025
a3b183e
mypy fix?
JudithBernett Feb 21, 2025
285af01
mypy fix no 999999999
PascalIversen Feb 21, 2025
5222521
merge
PascalIversen Feb 21, 2025
e9b76e5
increase timeout
PascalIversen Feb 21, 2025
334a5c2
Merge pull request #125 from daisybio/loaders
JudithBernett Feb 21, 2025
1027aea
updating everything
JudithBernett Feb 21, 2025
f557eac
fixed precommit and docs-build
JudithBernett Feb 21, 2025
56c2557
minor fixes visualization
JudithBernett Feb 21, 2025
b36765d
Merge pull request #130 from daisybio/updates
JudithBernett Feb 21, 2025
21db061
taking mean now
JudithBernett Feb 21, 2025
078a5e9
minor fix run_suite
JudithBernett Feb 21, 2025
342860b
Merge pull request #131 from daisybio/multi_features
JudithBernett Feb 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ data/GDSC1
data/GDSC2
data/CCLE
data/Toy_Data
data/CTRPv1
data/CTRPv2

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
4 changes: 3 additions & 1 deletion create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,10 @@ def draw_per_grouping_algorithm_plots(
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate reports from evaluation results")
parser.add_argument("--run_id", required=True, help="Run ID for the current execution")
parser.add_argument("--dataset", required=True, help="Dataset name for which to render the result file")
args = parser.parse_args()
run_id = args.run_id
dataset = args.dataset

# assert that the run_id folder exists
if not os.path.exists(f"results/{run_id}"):
Expand All @@ -280,7 +282,7 @@ def draw_per_grouping_algorithm_plots(
evaluation_results_per_drug,
evaluation_results_per_cell_line,
true_vs_pred,
) = parse_results(path_to_results=f"results/{run_id}")
) = parse_results(path_to_results=f"results/{run_id}", dataset=dataset)

# part of pipeline: EVALUATE_FINAL, COLLECT_RESULTS
(
Expand Down
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sphinx-autobuild==2024.10.3 ; python_version >= "3.11" and python_version < "3.13"
sphinx-autodoc-typehints==3.0.1 ; python_version >= "3.11" and python_version < "3.13"
sphinx-autodoc-typehints==3.1.0 ; python_version >= "3.11" and python_version < "3.13"
sphinx-click==6.0.0 ; python_version >= "3.11" and python_version < "3.13"
sphinx-rtd-theme==3.0.2 ; python_version >= "3.11" and python_version < "3.13"
8 changes: 5 additions & 3 deletions drevalpy/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def from_csv(
- response: the drug response values as floating point values
- cell_line_ids: a string identifier for cell lines
- drug_ids: a string identifier for drugs
- predictions: an optional column containing a predicted value TODO what exactly?
- predictions: an optional column containing drug response predictions

:param input_file: Path to the csv file containing the data to be loaded
:param dataset_name: Optional name to associate the dataset with, default = "unknown"
Expand All @@ -64,6 +64,8 @@ def from_csv(
:returns: DrugResponseDataset object containing data from provided csv file.
"""
data = pd.read_csv(input_file)
data["drug_id"] = data["drug_id"].astype(str)

if "predictions" in data.columns:
predictions = data["predictions"].values
else:
Expand Down Expand Up @@ -152,9 +154,9 @@ def __init__(
"""
super().__init__()
if len(response) != len(cell_line_ids):
raise AssertionError("Response and cell_line_ids have different lengths.")
raise AssertionError("Response and cell line identifiers have different lengths.")
if len(response) != len(drug_ids):
raise AssertionError("Response and drug_ids have different lengths.")
raise AssertionError("Response and drug identifiers have different lengths.")
if predictions is not None and len(response) != len(predictions):
raise AssertionError("Response and predictions have different lengths.")
self._response = response
Expand Down
85 changes: 67 additions & 18 deletions drevalpy/datasets/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from ..pipeline_function import pipeline_function
from .curvecurator import fit_curves
from .dataset import DrugResponseDataset
from .utils import download_dataset
from .utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, download_dataset


def load_gdsc1(
path_data: str = "data",
measure: str = "LN_IC50",
file_name: str = "response_GDSC1.csv",
measure: str = "LN_IC50_curvecurator",
file_name: str = "GDSC1.csv",
dataset_name: str = "GDSC1",
) -> DrugResponseDataset:
"""
Expand All @@ -32,18 +32,18 @@ def load_gdsc1(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_GDSC2.csv"):
def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "GDSC2.csv"):
"""
Loads the GDSC2 dataset.

Expand All @@ -57,7 +57,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str


def load_ccle(
path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_CCLE.csv"
path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "CCLE.csv"
) -> DrugResponseDataset:
"""
Loads the CCLE dataset.
Expand All @@ -73,18 +73,18 @@ def load_ccle(
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)

response_data = pd.read_csv(path)
response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "")
response_data = pd.read_csv(path, dtype={"pubchem_id": str})
response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "")

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["CELL_LINE_NAME"].values,
drug_ids=response_data["DRUG_NAME"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponseDataset:
def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
"""
Loads small Toy dataset, subsampled from GDSC1.

Expand All @@ -94,20 +94,67 @@ def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponse
:return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
"""
dataset_name = "Toy_Data"
measure = "response" # overwrite this explicitly to avoid problems, should be changed in the future
path = os.path.join(path_data, dataset_name, "toy_data.csv")
if not os.path.exists(path):
download_dataset(dataset_name, path_data, redownload=True)
response_data = pd.read_csv(path)
response_data = pd.read_csv(path, dtype={"pubchem_id": str})

return DrugResponseDataset(
response=response_data[measure].values,
cell_line_ids=response_data["cell_line_id"].values,
drug_ids=response_data["drug_id"].values,
cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
drug_ids=response_data[DRUG_IDENTIFIER].values,
dataset_name=dataset_name,
)


def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load a CTRP (Cancer Therapeutics Response Portal) dataset of the given version.

    Downloads the dataset first if it is not present at ``{path_data}/CTRPv{version}``.

    :param version: The version of the CTRP dataset to load, e.g. "1" or "2".
    :param path_data: Path to the directory containing the CTRPv{version} dataset folder.
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    dataset_name = "CTRPv" + version
    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
    if not os.path.exists(path):
        download_dataset(dataset_name, path_data, redownload=True)
    # pubchem_id is read as str so drug identifiers are not mangled into ints/floats
    response_data = pd.read_csv(path, dtype={"pubchem_id": str})

    return DrugResponseDataset(
        response=response_data[measure].values,
        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values,
        drug_ids=response_data[DRUG_IDENTIFIER].values,
        dataset_name=dataset_name,
    )


def load_ctrpv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load CTRPv1 dataset.

    :param path_data: Path to location of CTRPv1 dataset
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    return _load_ctrpv("1", path_data, measure)


def load_ctrpv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
    """
    Load CTRPv2 dataset.

    :param path_data: Path to location of CTRPv2 dataset
    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"

    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
    """
    # Delegate to the shared CTRP loader with the version pinned to "2".
    return _load_ctrpv("2", path_data=path_data, measure=measure)


def load_custom(path_data: str | Path, measure: str = "response") -> DrugResponseDataset:
"""
Load custom dataset.
Expand All @@ -125,6 +172,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
"GDSC2": load_gdsc2,
"CCLE": load_ccle,
"Toy_Data": load_toy,
"CTRPv1": load_ctrpv1,
"CTRPv2": load_ctrpv2,
}


Expand Down
13 changes: 8 additions & 5 deletions drevalpy/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import numpy as np
import requests

DRUG_IDENTIFIER = "pubchem_id"
CELL_LINE_IDENTIFIER = "cell_line_name"


def download_dataset(
dataset_name: str,
Expand All @@ -26,18 +29,18 @@ def download_dataset(
file_name = f"{dataset_name}.zip"
file_path = Path(data_path) / file_name
extracted_folder_path = file_path.with_suffix("")

timeout = 120
# Check if the extracted data exists and skip download if not redownloading
if extracted_folder_path.exists() and not redownload:
print(f"{dataset_name} is already extracted, skipping download.")
else:
url = "https://zenodo.org/doi/10.5281/zenodo.12633909"
# Fetch the latest record
response = requests.get(url, timeout=60)
response = requests.get(url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
latest_url = response.links["linkset"]["url"]
response = requests.get(latest_url, timeout=60)
response = requests.get(latest_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}")
data = response.json()
Expand All @@ -50,7 +53,7 @@ def download_dataset(
file_url = name_to_url[file_name]
# Download the file
print(f"Downloading {dataset_name} from {file_url}...")
response = requests.get(file_url, timeout=60)
response = requests.get(file_url, timeout=timeout)
if response.status_code != 200:
raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: " f"{response.status_code}")

Expand All @@ -61,7 +64,7 @@ def download_dataset(
with zipfile.ZipFile(file_path, "r") as z:
for member in z.infolist():
if not member.filename.startswith("__MACOSX/"):
z.extract(member, os.path.join(data_path, dataset_name))
z.extract(member, os.path.join(data_path))
file_path.unlink() # Remove zip file after extraction

print(f"{dataset_name} data downloaded and extracted to {data_path}")
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ SimpleNeuralNetwork:
- 128
- 64
- 16
max_epochs:
- 100

MultiOmicsNeuralNetwork:
dropout_prob:
Expand All @@ -44,3 +46,5 @@ MultiOmicsNeuralNetwork:
- 32
methylation_pca_components:
- 100
max_epochs:
- 100
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1,
Expand Down
4 changes: 4 additions & 0 deletions drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ def train(
cell_line_views=self.cell_line_views,
drug_views=self.drug_views,
output_earlystopping=output_earlystopping,
trainer_params={
"max_epochs": self.hyperparameters.get("max_epochs", 100),
"progress_bar_refresh_rate": 500,
},
batch_size=16,
patience=5,
num_workers=1 if platform.system() == "Windows" else 8,
Expand Down
11 changes: 5 additions & 6 deletions drevalpy/models/SimpleNeuralNetwork/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,16 +180,15 @@ def fit(
:param model_checkpoint_dir: directory to save the model checkpoints
:raises ValueError: if drug_input is missing
"""
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

if trainer_params is None:
trainer_params = {
"max_epochs": 100,
"progress_bar_refresh_rate": 500,
"max_epochs": 70,
}
if drug_input is None:
raise ValueError(
"Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork."
)

train_dataset = RegressionDataset(
output=output_train,
Expand Down
2 changes: 1 addition & 1 deletion drevalpy/models/baselines/hyperparameters.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
NaivePredictor:
NaiveDrugMeanPredictor:
NaiveCellLineMeanPredictor:
NaiveANOVAPredictor:
NaiveMeanEffectsPredictor:
ElasticNet:
l1_ratio:
- 0
Expand Down
Loading
Loading