diff --git a/.gitignore b/.gitignore index 6c1cfb9b..67ec4c67 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ data/GDSC1 data/GDSC2 data/CCLE data/Toy_Data +data/CTRPv1 +data/CTRPv2 # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py index bef2fa06..9f13dd19 100644 --- a/drevalpy/datasets/dataset.py +++ b/drevalpy/datasets/dataset.py @@ -55,7 +55,7 @@ def from_csv( - response: the drug response values as floating point values - cell_line_ids: a string identifier for cell lines - drug_ids: a string identifier for drugs - - predictions: an optional column containing a predicted value TODO what exactly? + - predictions: an optional column containing drug response predictions :param input_file: Path to the csv file containing the data to be loaded :param dataset_name: Optional name to associate the dataset with, default = "unknown" @@ -64,6 +64,8 @@ def from_csv( :returns: DrugResponseDataset object containing data from provided csv file. """ data = pd.read_csv(input_file) + data["drug_id"] = data["drug_id"].astype(str) + if "predictions" in data.columns: predictions = data["predictions"].values else: @@ -152,9 +154,9 @@ def __init__( """ super().__init__() if len(response) != len(cell_line_ids): - raise AssertionError("Response and cell_line_ids have different lengths.") + raise AssertionError("Response and cell line identifiers have different lengths.") if len(response) != len(drug_ids): - raise AssertionError("Response and drug_ids have different lengths.") + raise AssertionError("Response and drug identifiers have different lengths.") if predictions is not None and len(response) != len(predictions): raise AssertionError("Response and predictions have different lengths.") self._response = response diff --git a/drevalpy/datasets/loader.py b/drevalpy/datasets/loader.py index f8537e8a..1ecf92e6 100644 --- a/drevalpy/datasets/loader.py +++ b/drevalpy/datasets/loader.py @@ -9,13 +9,13 @@ from ..pipeline_function import pipeline_function from .curvecurator import fit_curves from .dataset import DrugResponseDataset -from .utils import download_dataset +from .utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER, download_dataset def load_gdsc1( path_data: str = "data", - measure: str = "LN_IC50", - file_name: str = "response_GDSC1.csv", + measure: str = "LN_IC50_curvecurator", + file_name: str = "GDSC1.csv", dataset_name: str = "GDSC1", ) -> DrugResponseDataset: """ @@ -32,18 +32,18 @@ def load_gdsc1( if not os.path.exists(path): download_dataset(dataset_name, path_data, redownload=True) - response_data = pd.read_csv(path) - response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "") + response_data = pd.read_csv(path, dtype={"pubchem_id": str}) + response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "") return DrugResponseDataset( response=response_data[measure].values, - cell_line_ids=response_data["CELL_LINE_NAME"].values, - drug_ids=response_data["DRUG_NAME"].values, + cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values, + drug_ids=response_data[DRUG_IDENTIFIER].values, dataset_name=dataset_name, ) -def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_GDSC2.csv"): +def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "GDSC2.csv"): """ Loads the GDSC2 dataset. 
@@ -57,7 +57,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50", file_name: str def load_ccle( -    path_data: str = "data", measure: str = "LN_IC50", file_name: str = "response_CCLE.csv" +    path_data: str = "data", measure: str = "LN_IC50_curvecurator", file_name: str = "CCLE.csv" ) -> DrugResponseDataset: """ Loads the CCLE dataset. @@ -73,18 +73,18 @@ def load_ccle( if not os.path.exists(path): download_dataset(dataset_name, path_data, redownload=True) -    response_data = pd.read_csv(path) -    response_data["DRUG_NAME"] = response_data["DRUG_NAME"].str.replace(",", "") +    response_data = pd.read_csv(path, dtype={"pubchem_id": str}) +    response_data[DRUG_IDENTIFIER] = response_data[DRUG_IDENTIFIER].str.replace(",", "") return DrugResponseDataset( response=response_data[measure].values, -        cell_line_ids=response_data["CELL_LINE_NAME"].values, -        drug_ids=response_data["DRUG_NAME"].values, +        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values, +        drug_ids=response_data[DRUG_IDENTIFIER].values, dataset_name=dataset_name, ) -def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponseDataset: +def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset: """ Loads small Toy dataset, subsampled from GDSC1. @@ -94,20 +94,67 @@ def load_toy(path_data: str = "data", measure: str = "response") -> DrugResponse :return: DrugResponseDataset containing response, cell line IDs, and drug IDs. """ dataset_name = "Toy_Data" -    measure = "response"  # overwrite this explicitly to avoid problems, should be changed in the future path = os.path.join(path_data, dataset_name, "toy_data.csv") if not os.path.exists(path): download_dataset(dataset_name, path_data, redownload=True) -    response_data = pd.read_csv(path) +    response_data = pd.read_csv(path, dtype={"pubchem_id": str}) return DrugResponseDataset( response=response_data[measure].values, -        cell_line_ids=response_data["cell_line_id"].values, -        drug_ids=response_data["drug_id"].values, +        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values, +        drug_ids=response_data[DRUG_IDENTIFIER].values, dataset_name=dataset_name, ) +def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset: +    """ +    Load a CTRP dataset (version 1 or 2). + +    :param version: The version of the CTRP dataset to load. +    :param path_data: Path to location of the CTRP dataset +    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator" + +    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs +    """ +    dataset_name = "CTRPv" + version +    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv") +    if not os.path.exists(path): +        download_dataset(dataset_name, path_data, redownload=True) +    response_data = pd.read_csv(path, dtype={"pubchem_id": str}) + +    return DrugResponseDataset( +        response=response_data[measure].values, +        cell_line_ids=response_data[CELL_LINE_IDENTIFIER].values, +        drug_ids=response_data[DRUG_IDENTIFIER].values, +        dataset_name=dataset_name, +    ) + + +def load_ctrpv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset: +    """ +    Load CTRPv1 dataset. 
+ +    :param path_data: Path to location of CTRPv1 dataset +    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator" + +    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs +    """ +    return _load_ctrpv("1", path_data, measure) + + +def load_ctrpv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset: +    """ +    Load CTRPv2 dataset. + +    :param path_data: Path to location of CTRPv2 dataset +    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator" + +    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs +    """ +    return _load_ctrpv("2", path_data, measure) + + def load_custom(path_data: str | Path, measure: str = "response") -> DrugResponseDataset: """ Load custom dataset. @@ -125,6 +172,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons "GDSC2": load_gdsc2, "CCLE": load_ccle, "Toy_Data": load_toy, +    "CTRPv1": load_ctrpv1, +    "CTRPv2": load_ctrpv2, } diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py index 254e914b..28a0d5cd 100644 --- a/drevalpy/datasets/utils.py +++ b/drevalpy/datasets/utils.py @@ -9,6 +9,9 @@ import numpy as np import requests +DRUG_IDENTIFIER = "pubchem_id" +CELL_LINE_IDENTIFIER = "cell_line_name" + def download_dataset( dataset_name: str, @@ -26,18 +29,18 @@ def download_dataset( file_name = f"{dataset_name}.zip" file_path = Path(data_path) / file_name extracted_folder_path = file_path.with_suffix("") - + timeout = 120 # Check if the extracted data exists and skip download if not redownloading if extracted_folder_path.exists() and not redownload: print(f"{dataset_name} is already extracted, skipping download.") else: url = "https://zenodo.org/doi/10.5281/zenodo.12633909"  # Fetch the latest record -        response = requests.get(url, timeout=60) +        response = requests.get(url, timeout=timeout) if response.status_code != 200: raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}") latest_url = response.links["linkset"]["url"] -        response = requests.get(latest_url, timeout=60) +        response = requests.get(latest_url, timeout=timeout) if response.status_code != 200: raise requests.exceptions.HTTPError(f"Error fetching record: {response.status_code}") data = response.json() @@ -50,7 +53,7 @@ def download_dataset( file_url = name_to_url[file_name] # Download the file print(f"Downloading {dataset_name} from {file_url}...") -        response = requests.get(file_url, timeout=60) +        response = requests.get(file_url, timeout=timeout) if response.status_code != 200: raise requests.exceptions.HTTPError(f"Error downloading file {dataset_name}: " f"{response.status_code}") @@ -61,7 +64,7 @@ def download_dataset( with zipfile.ZipFile(file_path, "r") as z: for member in z.infolist(): if not member.filename.startswith("__MACOSX/"): -                    z.extract(member, os.path.join(data_path, dataset_name)) +                    z.extract(member, os.path.join(data_path)) file_path.unlink()  # Remove zip file after extraction print(f"{dataset_name} data downloaded and extracted to {data_path}") diff --git a/drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml b/drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml index abd34123..3fbaa559 100644 --- a/drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml +++ b/drevalpy/models/SimpleNeuralNetwork/hyperparameters.yaml @@ -22,6 +22,8 @@ SimpleNeuralNetwork: - 128 - 64 - 16 +  max_epochs: +    - 100 MultiOmicsNeuralNetwork: dropout_prob: @@ 
-44,3 +46,5 @@ MultiOmicsNeuralNetwork: - 32 methylation_pca_components: - 100 + max_epochs: + - 100 diff --git a/drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py b/drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py index 8caebc45..ef374534 100644 --- a/drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py +++ b/drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py @@ -110,6 +110,10 @@ def train( cell_line_views=self.cell_line_views, drug_views=self.drug_views, output_earlystopping=output_earlystopping, + trainer_params={ + "max_epochs": self.hyperparameters.get("max_epochs", 100), + "progress_bar_refresh_rate": 500, + }, batch_size=16, patience=5, num_workers=1, diff --git a/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py b/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py index ee8765f0..ed4c015d 100644 --- a/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py +++ b/drevalpy/models/SimpleNeuralNetwork/simple_neural_network.py @@ -105,6 +105,10 @@ def train( cell_line_views=self.cell_line_views, drug_views=self.drug_views, output_earlystopping=output_earlystopping, + trainer_params={ + "max_epochs": self.hyperparameters.get("max_epochs", 100), + "progress_bar_refresh_rate": 500, + }, batch_size=16, patience=5, num_workers=1 if platform.system() == "Windows" else 8, diff --git a/drevalpy/models/SimpleNeuralNetwork/utils.py b/drevalpy/models/SimpleNeuralNetwork/utils.py index 0a6c5801..50ed7bba 100644 --- a/drevalpy/models/SimpleNeuralNetwork/utils.py +++ b/drevalpy/models/SimpleNeuralNetwork/utils.py @@ -180,16 +180,15 @@ def fit( :param model_checkpoint_dir: directory to save the model checkpoints :raises ValueError: if drug_input is missing """ - if drug_input is None: - raise ValueError( - "Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork." - ) - if trainer_params is None: trainer_params = { + "max_epochs": 100, "progress_bar_refresh_rate": 500, - "max_epochs": 70, } + if drug_input is None: + raise ValueError( + "Drug input (fingerprints) are required for SimpleNeuralNetwork and " "MultiOMICsNeuralNetwork." 
+ ) train_dataset = RegressionDataset( output=output_train, diff --git a/drevalpy/models/baselines/hyperparameters.yaml b/drevalpy/models/baselines/hyperparameters.yaml index cd37b5e7..dc746a68 100644 --- a/drevalpy/models/baselines/hyperparameters.yaml +++ b/drevalpy/models/baselines/hyperparameters.yaml @@ -1,7 +1,7 @@ NaivePredictor: NaiveDrugMeanPredictor: NaiveCellLineMeanPredictor: -NaiveANOVAPredictor: +NaiveMeanEffectsPredictor: ElasticNet: l1_ratio: - 0 diff --git a/drevalpy/models/baselines/naive_pred.py b/drevalpy/models/baselines/naive_pred.py index 75a50768..5198c54d 100644 --- a/drevalpy/models/baselines/naive_pred.py +++ b/drevalpy/models/baselines/naive_pred.py @@ -12,6 +12,7 @@ import numpy as np from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset +from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER from drevalpy.models.drp_model import DRPModel from drevalpy.models.utils import load_cl_ids_from_csv, load_drug_ids_from_csv, unique @@ -19,8 +20,8 @@ class NaivePredictor(DRPModel): """Naive predictor model that predicts the overall mean of the response.""" - cell_line_views = ["cell_line_id"] - drug_views = ["drug_id"] + cell_line_views = [CELL_LINE_IDENTIFIER] + drug_views = [DRUG_IDENTIFIER] def __init__(self): """ @@ -109,8 +110,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase class NaiveDrugMeanPredictor(DRPModel): """Naive predictor model that predicts the mean of the response per drug.""" - cell_line_views = ["cell_line_id"] - drug_views = ["drug_id"] + cell_line_views = [CELL_LINE_IDENTIFIER] + drug_views = [DRUG_IDENTIFIER] def __init__(self): """ @@ -159,7 +160,7 @@ def train( """ if drug_input is None: raise ValueError("drug_input (drug_id) is required for the NaiveDrugMeanPredictor.") - drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids) + drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids) self.dataset_mean = np.mean(output.response) self.drug_means = {} @@ -226,8 +227,8 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase class NaiveCellLineMeanPredictor(DRPModel): """Naive predictor model that predicts the mean of the response per cell line.""" - cell_line_views = ["cell_line_id"] - drug_views = ["drug_id"] + cell_line_views = [CELL_LINE_IDENTIFIER] + drug_views = [DRUG_IDENTIFIER] def __init__(self): """ @@ -274,7 +275,7 @@ def train( :param output_earlystopping: not needed :param model_checkpoint_dir: not needed """ - cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids) + cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids) self.dataset_mean = np.mean(output.response) self.cell_line_means = {} @@ -353,8 +354,8 @@ class NaiveMeanEffectsPredictor(DRPModel): This formulation ensures that the overall mean is not counted twice. """ - cell_line_views = ["cell_line_id"] - drug_views = ["drug_id"] + cell_line_views = [CELL_LINE_IDENTIFIER] + drug_views = [DRUG_IDENTIFIER] def __init__(self): """ @@ -412,7 +413,7 @@ def train( self.dataset_mean = np.mean(output.response) # Obtain cell line features. 
- cell_line_ids = cell_line_input.get_feature_matrix(view="cell_line_id", identifiers=output.cell_line_ids) + cell_line_ids = cell_line_input.get_feature_matrix(view=CELL_LINE_IDENTIFIER, identifiers=output.cell_line_ids) cell_line_means = {} for cl_output, cl_feature in zip(unique(output.cell_line_ids), unique(cell_line_ids), strict=True): responses_cl = output.response[cl_feature == output.cell_line_ids] @@ -420,7 +421,7 @@ def train( cell_line_means[cl_output] = np.mean(responses_cl) # Obtain drug features. - drug_ids = drug_input.get_feature_matrix(view="drug_id", identifiers=output.drug_ids) + drug_ids = drug_input.get_feature_matrix(view=DRUG_IDENTIFIER, identifiers=output.drug_ids) drug_means = {} for drug_output, drug_feature in zip(unique(output.drug_ids), unique(drug_ids), strict=True): responses_drug = output.response[drug_feature == output.drug_ids] diff --git a/drevalpy/models/baselines/singledrug_elastic_net.py b/drevalpy/models/baselines/singledrug_elastic_net.py index 2000644d..f6052d60 100644 --- a/drevalpy/models/baselines/singledrug_elastic_net.py +++ b/drevalpy/models/baselines/singledrug_elastic_net.py @@ -49,11 +49,7 @@ def train( :param drug_input: not needed :param output_earlystopping: not needed :param model_checkpoint_dir: not needed as checkpoints are not saved - :raises ValueError: if drug_input is not None """ - if drug_input is not None: - raise ValueError("SingleDrugElasticNet does not support drug_input!") - if len(output) > 0: x = self.get_concatenated_features( cell_line_view="gene_expression", @@ -168,11 +164,7 @@ def train( :param drug_input: not needed :param output_earlystopping: not needed :param model_checkpoint_dir: not needed as checkpoints are not saved - :raises ValueError: if drug_input is not None """ - if drug_input is not None: - raise ValueError("SingleDrugElasticNet does not support drug_input!") - if len(output) > 0: x = self.get_concatenated_features( cell_line_view="proteomics", diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py index af371531..7731aa3f 100644 --- a/drevalpy/models/utils.py +++ b/drevalpy/models/utils.py @@ -7,6 +7,7 @@ import pandas as pd from drevalpy.datasets.dataset import FeatureDataset +from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER def load_cl_ids_from_csv(path: str, dataset_name: str) -> FeatureDataset: @@ -18,7 +19,7 @@ def load_cl_ids_from_csv(path: str, dataset_name: str) -> FeatureDataset: :returns: FeatureDataset with the cell line ids """ cl_names = pd.read_csv(f"{path}/{dataset_name}/cell_line_names.csv", index_col=1) - return FeatureDataset(features={cl: {"cell_line_id": np.array([cl])} for cl in cl_names.index}) + return FeatureDataset(features={cl: {CELL_LINE_IDENTIFIER: np.array([cl])} for cl in cl_names.index}) def load_and_reduce_gene_features( @@ -88,13 +89,18 @@ def iterate_features(df: pd.DataFrame, feature_type: str) -> dict[str, dict[str, :param feature_type: type of feature, e.g., gene_expression, methylation, etc. 
:returns: dictionary with the features """ -    features = {} +    features: dict[str, dict[str, np.ndarray]] = {} for cl in df.index: +        if cl in features: +            continue rows = df.loc[cl] -        if len(rows.shape) > 1 and rows.shape[0] > 1:  # multiple rows returned +        if (len(rows.shape) > 1) and (rows.shape[0] > 1):  # multiple rows returned warnings.warn( -                f"Multiple rows returned for {cl} in feature {feature_type}, taking the first one.", stacklevel=2 +                f"Multiple rows returned for cell line {cl} (and possibly others) " +                f"in feature {feature_type}, taking the first one.", +                stacklevel=2, ) +            rows = rows.iloc[0] # convert to float values rows = rows.astype(float) @@ -111,24 +117,27 @@ def load_drug_ids_from_csv(data_path: str, dataset_name: str) -> FeatureDataset: :returns: FeatureDataset with the drug ids """ drug_names = pd.read_csv(f"{data_path}/{dataset_name}/drug_names.csv", index_col=0) -    return FeatureDataset(features={drug: {"drug_id": np.array([drug])} for drug in drug_names.index}) +    drug_names.index = drug_names.index.astype(str) +    return FeatureDataset(features={drug: {DRUG_IDENTIFIER: np.array([drug])} for drug in drug_names.index}) -def load_drug_fingerprint_features(data_path: str, dataset_name: str) -> FeatureDataset: +def load_drug_fingerprint_features(data_path: str, dataset_name: str, default_random: bool = True) -> FeatureDataset: """ Load drug features from fingerprints. :param data_path: path to the data, e.g., data/ :param dataset_name: name of the dataset, e.g., GDSC2 +    :param default_random: whether to substitute a random fingerprint for drugs whose fingerprint is unavailable (all zeros) :returns: FeatureDataset with the drug fingerprints """ -    if dataset_name == "Toy_Data": -        fingerprints = pd.read_csv(os.path.join(data_path, dataset_name, "fingerprints.csv"), index_col=0) -    else: -        fingerprints = pd.read_csv( -            os.path.join(data_path, dataset_name, "drug_fingerprints", "drug_name_to_demorgan_128_map.csv"), -            index_col=0, -        ).T +    fingerprints = pd.read_csv( +        os.path.join(data_path, dataset_name, "drug_fingerprints", "pubchem_id_to_demorgan_128_map.csv"), index_col=None +    ).T +    if default_random: +        for drug in fingerprints.index: +            if not np.all(fingerprints.loc[drug].values == 0): +                continue +            fingerprints.loc[drug] = np.random.randint(0, 2, size=len(fingerprints.loc[drug])) return FeatureDataset( features={drug: {"fingerprints": fingerprints.loc[drug].values} for drug in fingerprints.index} ) diff --git a/poetry.lock b/poetry.lock index 64628322..4744ed53 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5125,4 +5125,4 @@ multiprocessing = ["ray"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "bf0f826f8f395e1a264fd633e045bd60e57945e152008f1e7c8916c71e85a5a3" +content-hash = "f7374bb63fe04664d6585936fdb245bec2373eeed9bbc501871f62550239757b" diff --git a/pyproject.toml b/pyproject.toml index 17cd1dbd..e6f328ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ scikit-learn = ">=1.4,<1.7" pandas = "*" networkx = "*" pyyaml = "*" -pytorch-lightning = "*" +pytorch-lightning = ">=2.5" torch = ">=2.1,<=2.4" torch-geometric = "*" flaky = "*" diff --git a/tests/individual_models/conftest.py b/tests/individual_models/conftest.py index ea1025ef..3f2beb66 100644 --- a/tests/individual_models/conftest.py +++ b/tests/individual_models/conftest.py @@ -21,6 +21,7 @@ def sample_dataset() -> tuple[DrugResponseDataset, FeatureDataset, FeatureDatase """ path_data = "../data" drug_response = load_toy(path_data) +    drug_response.remove_nan_responses() cell_line_input = 
get_multiomics_feature_dataset(data_path=path_data, dataset_name="Toy_Data", gene_list=None) cell_line_ids = load_cl_ids_from_csv(path=path_data, dataset_name="Toy_Data") cell_line_input.add_features(cell_line_ids) diff --git a/tests/individual_models/test_baselines.py b/tests/individual_models/test_baselines.py index 9639beb2..d9a3fcca 100644 --- a/tests/individual_models/test_baselines.py +++ b/tests/individual_models/test_baselines.py @@ -28,6 +28,7 @@ "NaivePredictor", "NaiveDrugMeanPredictor", "NaiveCellLineMeanPredictor", + "NaiveMeanEffectsPredictor", "ElasticNet", "RandomForest", "SVR", @@ -86,6 +87,8 @@ def test_baselines( drug_input, test_mode, ) + elif model_name == "NaiveMeanEffectsPredictor": + _call_naive_mean_effects_predictor(train_dataset, val_dataset, cell_line_input, drug_input, test_mode) else: _call_other_baselines( model_name, @@ -97,7 +100,12 @@ def test_baselines( @pytest.mark.parametrize( - "model_name", ["SingleDrugRandomForest", "SingleDrugElasticNet", "SingleDrugProteomicsElasticNet"] + "model_name", + [ + "SingleDrugRandomForest", + "SingleDrugElasticNet", + "SingleDrugProteomicsElasticNet", + ], ) @pytest.mark.parametrize("test_mode", ["LPO", "LCO"]) def test_single_drug_baselines( @@ -120,9 +128,19 @@ def test_single_drug_baselines( train_dataset = split["train"] val_dataset = split["validation"] + cell_lines_to_keep = cell_line_input.identifiers + drugs_to_keep = drug_input.identifiers + + len_train_before = len(train_dataset) + len_pred_before = len(val_dataset) + train_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) + val_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) + print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}") + print(f"Reduced val dataset from {len_pred_before} to {len(val_dataset)}") + all_unique_drugs = np.unique(train_dataset.drug_ids) # randomly sample a drug to speed up testing - np.random.seed(42) + np.random.seed(123) np.random.shuffle(all_unique_drugs) random_drug = all_unique_drugs[:1] @@ -153,10 +171,13 @@ def test_single_drug_baselines( cell_line_ids=val_dataset.cell_line_ids[val_mask], cell_line_input=cell_line_input, ) - pcc_drug = pearson(val_dataset.response[val_mask], all_predictions[val_mask]) - print(f"{test_mode}: Performance of {model_name} for drug {random_drug}: PCC = {pcc_drug}") - - assert pcc_drug >= -1.0 + # check whether predictions are constant + if np.all(all_predictions[val_mask] == all_predictions[val_mask][0]): + print("Predictions are constant") + else: + pcc_drug = pearson(val_dataset.response[val_mask], all_predictions[val_mask]) + print(f"{test_mode}: Performance of {model_name} for drug {random_drug}: PCC = {pcc_drug}") + assert pcc_drug >= -1.0 def _call_naive_predictor( @@ -316,33 +337,22 @@ def _call_other_baselines( @pytest.mark.parametrize("test_mode", ["LPO", "LCO", "LDO"]) -def test_naive_anova_predictor( - sample_dataset: tuple[DrugResponseDataset, FeatureDataset, FeatureDataset], test_mode: str +def _call_naive_mean_effects_predictor( + train_dataset: DrugResponseDataset, + val_dataset: DrugResponseDataset, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset, + test_mode: str, ) -> None: """ Test the NaiveMeanEffectsPredictor model. 
- :param sample_dataset: from conftest.py + :param train_dataset: training dataset + :param val_dataset: validation dataset + :param cell_line_input: features cell lines + :param drug_input: features drugs :param test_mode: either LPO, LCO, or LDO """ - drug_response, cell_line_input, drug_input = sample_dataset - drug_response.split_dataset(n_cv_splits=5, mode=test_mode) - - assert drug_response.cv_splits is not None - split = drug_response.cv_splits[0] - train_dataset = split["train"] - val_dataset = split["validation"] - - cell_lines_to_keep = cell_line_input.identifiers - drugs_to_keep = drug_input.identifiers - - len_train_before = len(train_dataset) - len_pred_before = len(val_dataset) - train_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) - val_dataset.reduce_to(cell_line_ids=cell_lines_to_keep, drug_ids=drugs_to_keep) - print(f"Reduced training dataset from {len_train_before} to {len(train_dataset)}") - print(f"Reduced val dataset from {len_pred_before} to {len(val_dataset)}") - naive = NaiveMeanEffectsPredictor() naive.train(output=train_dataset, cell_line_input=cell_line_input, drug_input=drug_input) val_dataset._predictions = naive.predict( diff --git a/tests/individual_models/test_simple_neural_network.py b/tests/individual_models/test_simple_neural_network.py index 357b5a3d..8fca61e9 100644 --- a/tests/individual_models/test_simple_neural_network.py +++ b/tests/individual_models/test_simple_neural_network.py @@ -50,6 +50,7 @@ def test_simple_neural_network( hpams = model.get_hyperparameter_set() hpam_combi = hpams[0] hpam_combi["units_per_layer"] = [2, 2] + hpam_combi["max_epochs"] = 1 model.build_model(hyperparameters=hpam_combi) with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/test_available_data.py b/tests/test_available_data.py index 53301a13..32a441ed 100644 --- a/tests/test_available_data.py +++ b/tests/test_available_data.py @@ -11,25 +11,48 @@ def test_factory() -> None: assert "GDSC2" in AVAILABLE_DATASETS assert "CCLE" in AVAILABLE_DATASETS assert "Toy_Data" in AVAILABLE_DATASETS - assert len(AVAILABLE_DATASETS) == 4 + assert "CTRPv1" in AVAILABLE_DATASETS + assert "CTRPv2" in AVAILABLE_DATASETS + assert len(AVAILABLE_DATASETS) == 6 def test_gdsc1() -> None: """Test the GDSC1 dataset.""" tempdir = tempfile.TemporaryDirectory() gdsc1 = AVAILABLE_DATASETS["GDSC1"](path_data=tempdir.name) - assert len(gdsc1) == 333161 + assert len(gdsc1) == 316506 def test_gdsc2(): """Test the GDSC2 dataset.""" tempdir = tempfile.TemporaryDirectory() gdsc2 = AVAILABLE_DATASETS["GDSC2"](path_data=tempdir.name) - assert len(gdsc2) == 242036 + assert len(gdsc2) == 234436 def test_ccle(): """Test the CCLE dataset.""" tempdir = tempfile.TemporaryDirectory() ccle = AVAILABLE_DATASETS["CCLE"](path_data=tempdir.name) - assert len(ccle) == 12096 + assert len(ccle) == 11670 + + +def test_ctrpv1(): + """Test the CTRPv1 dataset.""" + tempdir = tempfile.TemporaryDirectory() + ctrpv1 = AVAILABLE_DATASETS["CTRPv1"](path_data=tempdir.name) + assert len(ctrpv1) == 60757 + + +def test_ctrpv2(): + """Test the CTRPv2 dataset.""" + tempdir = tempfile.TemporaryDirectory() + ctrpv2 = AVAILABLE_DATASETS["CTRPv2"](path_data=tempdir.name) + assert len(ctrpv2) == 395024 + + +def test_toy_data(): + """Test the Toy_Data dataset.""" + tempdir = tempfile.TemporaryDirectory() + toy_data = AVAILABLE_DATASETS["Toy_Data"](path_data=tempdir.name) + assert len(toy_data) == 3426 diff --git a/tests/test_drp_model.py b/tests/test_drp_model.py index 35cd0dcd..ca0b910a 100644 --- 
a/tests/test_drp_model.py +++ b/tests/test_drp_model.py @@ -178,27 +178,27 @@ def test_load_drugs_from_fingerprints() -> None: temp.name, "GDSC1_small", "drug_fingerprints", - "drug_name_to_demorgan_128_map.csv", + "pubchem_id_to_demorgan_128_map.csv", ) with open(temp_file, "w") as f: f.write( - ",Zibotentan,AZD1208,CI-1040,A-83-01,GSK269962A\n" - "0,1,1,1,1,1\n" - "1,1,1,0,0,1\n" - "2,0,1,1,0,1\n" - "3,1,0,1,1,1\n" - "4,1,1,0,1,1\n" + "3827738,5311510,46883536,73707530,16720766\n" + "1,1,1,1,1\n" + "1,1,0,0,1\n" + "0,1,1,0,1\n" + "1,0,1,1,1\n" + "1,1,0,1,1\n" ) drug_features_gdsc1 = load_drug_fingerprint_features(temp.name, "GDSC1_small") assert len(drug_features_gdsc1.features) == 5 assert drug_features_gdsc1.features.keys() == { - "Zibotentan", - "AZD1208", - "CI-1040", - "A-83-01", - "GSK269962A", + "3827738", + "5311510", + "46883536", + "73707530", + "16720766", } - assert np.all(drug_features_gdsc1.features["Zibotentan"]["fingerprints"] == [1, 1, 0, 1, 1]) + assert np.all(drug_features_gdsc1.features["3827738"]["fingerprints"] == [1, 1, 0, 1, 1]) @pytest.mark.parametrize( diff --git a/tests/test_run_suite.py b/tests/test_run_suite.py index 87d31afa..3bfd19cd 100644 --- a/tests/test_run_suite.py +++ b/tests/test_run_suite.py @@ -32,7 +32,7 @@ "response_transformation": "None", "multiprocessing": False, "path_data": "../data", - "model_checkpoint_dir": "None", + "model_checkpoint_dir": "TEMPORARY", } ], )
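For orientation, the loader changes above can be exercised with the minimal sketch below (not part of the diff). It assumes the CTRPv2 archive is published on the same Zenodo record as the other datasets, so the first call downloads and extracts it into path_data; the printed identifiers come from the new CELL_LINE_IDENTIFIER and DRUG_IDENTIFIER columns.

    from drevalpy.datasets.loader import load_ctrpv2
    from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER

    # Loaders now default to the CurveCurator-derived measure and key drugs by PubChem ID.
    ctrpv2 = load_ctrpv2(path_data="data", measure="LN_IC50_curvecurator")
    print(len(ctrpv2))               # number of (cell line, drug) response pairs
    print(ctrpv2.cell_line_ids[:5])  # values from the "cell_line_name" column
    print(ctrpv2.drug_ids[:5])       # values from the "pubchem_id" column, read as strings
    print(CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER)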
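Similarly, the reworked fingerprint loader can be smoke-tested as below. This is an illustrative sketch only: the data path and dataset name are placeholders, and it assumes the dataset folder already contains drug_fingerprints/pubchem_id_to_demorgan_128_map.csv as laid out by the download step above.

    from drevalpy.models.utils import load_drug_fingerprint_features

    # Fingerprints are keyed by PubChem ID; with default_random=True an all-zero
    # (i.e. missing) fingerprint row is replaced by random 0/1 bits.
    fingerprints = load_drug_fingerprint_features(data_path="data", dataset_name="GDSC2", default_random=True)
    some_drug = next(iter(fingerprints.features))
    print(some_drug, fingerprints.features[some_drug]["fingerprints"][:10])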