From 76e63277bc413cc78a12687d2874009801652e7f Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 12:37:48 +0000 Subject: [PATCH 01/11] Implement first PharmaFormer draft --- .../create_bpe_smiles_embeddings.py | 137 ++++++ drevalpy/models/PharmaFormer/__init__.py | 5 + .../models/PharmaFormer/hyperparameters.yaml | 33 ++ drevalpy/models/PharmaFormer/model_utils.py | 140 ++++++ drevalpy/models/PharmaFormer/pharmaformer.py | 461 ++++++++++++++++++ drevalpy/models/__init__.py | 3 + poetry.lock | 36 +- pyproject.toml | 1 + tests/models/test_global_models.py | 13 +- tests/test_featurizers.py | 40 ++ 10 files changed, 867 insertions(+), 2 deletions(-) create mode 100644 drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py create mode 100644 drevalpy/models/PharmaFormer/__init__.py create mode 100644 drevalpy/models/PharmaFormer/hyperparameters.yaml create mode 100644 drevalpy/models/PharmaFormer/model_utils.py create mode 100644 drevalpy/models/PharmaFormer/pharmaformer.py diff --git a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py new file mode 100644 index 00000000..6a13dc79 --- /dev/null +++ b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py @@ -0,0 +1,137 @@ +"""Preprocesses drug SMILES strings into BPE-encoded embeddings.""" + +import argparse +import codecs +import os +import tempfile +from pathlib import Path + +import numpy as np +import pandas as pd +from tqdm import tqdm + +try: + from subword_nmt.apply_bpe import BPE + from subword_nmt.learn_bpe import learn_bpe +except ImportError: + raise ImportError("Please install subword-nmt package for BPE SMILES featurizer: pip install subword-nmt") + + +def create_bpe_smiles_embeddings( + data_path: str, + dataset_name: str, + num_symbols: int = 10000, + max_length: int = 128, +) -> None: + """ + Create BPE-encoded SMILES embeddings for drugs. + + 1. Read drug_smiles.csv + 2. Learn BPE codes from all SMILES strings + 3. Apply BPE to each SMILES + 4. Convert to character ordinals + 5. Pad/truncate to max_length + 6. Save to drug_bpe_smiles.csv + + :param data_path: Path to the data folder + :param dataset_name: Name of the dataset to process + :param num_symbols: Number of BPE symbols to learn + :param max_length: Maximum length of encoded SMILES (padding/truncation) + :raises FileNotFoundError: If drug_smiles.csv is not found + :raises Exception: If a drug fails to process + """ + data_dir = Path(data_path).resolve() + dataset_dir = data_dir / dataset_name + + smiles_file = dataset_dir / "drug_smiles.csv" + bpe_codes_path = dataset_dir / "bpe.codes" + output_file = dataset_dir / "drug_bpe_smiles.csv" + + if not smiles_file.exists(): + raise FileNotFoundError(f"Error: {smiles_file} not found.") + + # Read SMILES data + smiles_df = pd.read_csv(smiles_file, dtype={"canonical_smiles": str, "pubchem_id": str}) + smiles_df = smiles_df.dropna(subset=["canonical_smiles"]) + + print(f"Learning BPE codes from {len(smiles_df)} SMILES strings...") + + # Create temporary file with SMILES strings for BPE learning + # learn_bpe expects one item per line + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as tmp_file: + tmp_smiles_file = tmp_file.name + for smiles in smiles_df["canonical_smiles"]: + tmp_file.write(f"{smiles}\n") + + # Learn BPE codes from SMILES corpus + try: + with codecs.open(tmp_smiles_file, encoding="utf-8") as f_in: + with codecs.open(bpe_codes_path, "w", encoding="utf-8") as f_out: + learn_bpe(f_in, f_out, num_symbols=num_symbols) + finally: + # Clean up temporary file + if os.path.exists(tmp_smiles_file): + os.remove(tmp_smiles_file) + + print(f"BPE codes saved to {bpe_codes_path}") + + # Load BPE encoder + with codecs.open(bpe_codes_path, encoding="utf-8") as f_in: + bpe = BPE(f_in) + + # Encode each SMILES string + embeddings_list = [] + drug_ids = [] + + print(f"Encoding {len(smiles_df)} SMILES strings...") + + for row in tqdm(smiles_df.itertuples(index=False), total=len(smiles_df)): + drug_id = row.pubchem_id + smiles = row.canonical_smiles + + try: + # Apply BPE + bpe_processed = bpe.process_line(smiles) + # Convert to character ordinals + encoded = [ord(char) for char in bpe_processed] + # Pad/truncate to max_length + if len(encoded) > max_length: + encoded = encoded[:max_length] + else: + encoded = np.pad(encoded, (0, max_length - len(encoded)), "constant").tolist() + + embeddings_list.append(encoded) + drug_ids.append(drug_id) + except Exception as e: + print(f"\nFailed to process drug {drug_id} with SMILES: {smiles}") + print(f"Error: {e}") + raise e + + # Create DataFrame with pubchem_id and encoded features + embeddings_df = pd.DataFrame(embeddings_list) + embeddings_df.columns = [f"feature_{i}" for i in range(max_length)] + embeddings_df.insert(0, "pubchem_id", drug_ids) + embeddings_df.to_csv(output_file, index=False) + + print(f"Finished processing. BPE-encoded SMILES saved to {output_file}") + + +def main(): + """Process drug SMILES and save BPE-encoded embeddings.""" + parser = argparse.ArgumentParser(description="Preprocess drug SMILES to BPE-encoded embeddings.") + parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.") + parser.add_argument("--data-path", type=str, default="data", help="Path to the data folder") + parser.add_argument("--num-symbols", type=int, default=10000, help="Number of BPE symbols to learn") + parser.add_argument("--max-length", type=int, default=128, help="Maximum length of encoded SMILES") + args = parser.parse_args() + + create_bpe_smiles_embeddings( + data_path=args.data_path, + dataset_name=args.dataset_name, + num_symbols=args.num_symbols, + max_length=args.max_length, + ) + + +if __name__ == "__main__": + main() diff --git a/drevalpy/models/PharmaFormer/__init__.py b/drevalpy/models/PharmaFormer/__init__.py new file mode 100644 index 00000000..f78b623a --- /dev/null +++ b/drevalpy/models/PharmaFormer/__init__.py @@ -0,0 +1,5 @@ +"""PharmaFormer model.""" + +from .pharmaformer import PharmaFormerModel + +__all__ = ["PharmaFormerModel"] diff --git a/drevalpy/models/PharmaFormer/hyperparameters.yaml b/drevalpy/models/PharmaFormer/hyperparameters.yaml new file mode 100644 index 00000000..2982f0c1 --- /dev/null +++ b/drevalpy/models/PharmaFormer/hyperparameters.yaml @@ -0,0 +1,33 @@ +--- +PharmaFormer: + gene_hidden_size: + - 2048 + - 4096 + drug_hidden_size: + - 128 + - 256 + feature_dim: + - 64 + - 128 + nhead: + - 4 + - 8 + num_layers: + - 2 + - 3 + dim_feedforward: + - 1024 + - 2048 + dropout: + - 0.1 + - 0.2 + batch_size: + - 64 + - 128 + lr: + - 0.00001 + - 0.0001 + epochs: + - 100 + patience: + - 10 diff --git a/drevalpy/models/PharmaFormer/model_utils.py b/drevalpy/models/PharmaFormer/model_utils.py new file mode 100644 index 00000000..a8e632fe --- /dev/null +++ b/drevalpy/models/PharmaFormer/model_utils.py @@ -0,0 +1,140 @@ +"""Neural network components for PharmaFormer model.""" + +import torch +import torch.nn as nn +import torch.nn.functional as functional + + +class FeatureExtractor(nn.Module): + """Feature extractor for gene expression and drug SMILES.""" + + def __init__(self, gene_input_size: int, gene_hidden_size: int, drug_hidden_size: int): + """ + Initialize the feature extractor. + + :param gene_input_size: Input size for gene expression features + :param gene_hidden_size: Hidden size for gene expression MLP + :param drug_hidden_size: Hidden size for drug SMILES MLP + """ + super().__init__() + self.gene_fc1 = nn.Linear(gene_input_size, gene_hidden_size) + self.gene_fc2 = nn.Linear(gene_hidden_size, gene_hidden_size) + self.smiles_fc = nn.Linear(128, drug_hidden_size) + + def forward(self, gene_expr: torch.Tensor, smiles: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the feature extractor. + + :param gene_expr: Gene expression features [batch_size, gene_input_size] + :param smiles: BPE-encoded SMILES features [batch_size, 128] + :return: Combined features [batch_size, gene_hidden_size + drug_hidden_size] + """ + gene_out = functional.relu(self.gene_fc1(gene_expr)) + gene_out = functional.relu(self.gene_fc2(gene_out)) + smiles_out = functional.relu(self.smiles_fc(smiles)) + combined_features = torch.cat((gene_out, smiles_out), dim=1) + return combined_features + + +class TransModel(nn.Module): + """Transformer model for processing combined features.""" + + def __init__( + self, + feature_dim: int, + nhead: int, + seq_len: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + num_layers: int = 3, + ): + """ + Initialize the transformer model. + + :param feature_dim: Dimension of each feature in the sequence + :param nhead: Number of attention heads + :param seq_len: Length of the input sequence + :param dim_feedforward: Dimension of feedforward network + :param dropout: Dropout rate + :param num_layers: Number of transformer encoder layers + """ + super().__init__() + encoder_layer = nn.TransformerEncoderLayer( + d_model=feature_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + batch_first=True, + ) + self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + self.output = nn.Sequential( + nn.Linear(seq_len * feature_dim, 1024), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(1024, 1), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the transformer model. + + :param x: Input tensor [batch_size, seq_len, feature_dim] + :return: Output predictions [batch_size, 1] + """ + x = self.transformer_encoder(x) + x = torch.flatten(x, 1) + return self.output(x) + + +class CombinedModel(nn.Module): + """Combined model integrating feature extraction and transformer.""" + + def __init__( + self, + gene_input_size: int, + gene_hidden_size: int, + drug_hidden_size: int, + feature_dim: int, + nhead: int, + num_layers: int = 3, + dim_feedforward: int = 2048, + dropout: float = 0.1, + ): + """ + Initialize the combined model. + + :param gene_input_size: Input size for gene expression features + :param gene_hidden_size: Hidden size for gene expression MLP + :param drug_hidden_size: Hidden size for drug SMILES MLP + :param feature_dim: Dimension of each feature in the transformer sequence + :param nhead: Number of attention heads + :param num_layers: Number of transformer encoder layers + :param dim_feedforward: Dimension of feedforward network + :param dropout: Dropout rate + """ + super().__init__() + self.feature_extractor = FeatureExtractor(gene_input_size, gene_hidden_size, drug_hidden_size) + self.feature_dim = feature_dim + self.seq_len = (gene_hidden_size + drug_hidden_size) // feature_dim + self.transformer = TransModel( + feature_dim=feature_dim, + nhead=nhead, + seq_len=self.seq_len, + num_layers=num_layers, + dim_feedforward=dim_feedforward, + dropout=dropout, + ) + + def forward(self, gene_expr: torch.Tensor, smiles: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the combined model. + + :param gene_expr: Gene expression features [batch_size, gene_input_size] + :param smiles: BPE-encoded SMILES features [batch_size, 128] + :return: Output predictions [batch_size, 1] + """ + features = self.feature_extractor(gene_expr, smiles) + batch_size = features.size(0) + features = features.view(batch_size, self.seq_len, self.feature_dim) + output = self.transformer(features) + return output diff --git a/drevalpy/models/PharmaFormer/pharmaformer.py b/drevalpy/models/PharmaFormer/pharmaformer.py new file mode 100644 index 00000000..385411f1 --- /dev/null +++ b/drevalpy/models/PharmaFormer/pharmaformer.py @@ -0,0 +1,461 @@ +"""PharmaFormer model for drug response prediction.""" + +import json +import os +import secrets +from typing import Any, cast + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import torch.optim as optim +from sklearn.preprocessing import MinMaxScaler, StandardScaler +from torch.utils.data import DataLoader, Dataset + +from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset +from drevalpy.models.drp_model import DRPModel +from drevalpy.models.utils import load_and_select_gene_features + +from .model_utils import CombinedModel + + +class _PharmaFormerDataset(Dataset): + """PyTorch Dataset for PharmaFormer model.""" + + def __init__( + self, + response: np.ndarray, + cell_line_ids: np.ndarray, + drug_ids: np.ndarray, + cell_line_features: FeatureDataset, + drug_features: FeatureDataset, + ): + """ + Initialize the dataset. + + :param response: Drug response values + :param cell_line_ids: Cell line identifiers + :param drug_ids: Drug identifiers + :param cell_line_features: FeatureDataset with cell line features + :param drug_features: FeatureDataset with drug features + """ + self.response = response + self.cell_line_ids = cell_line_ids + self.drug_ids = drug_ids + self.cell_line_features = cell_line_features + self.drug_features = drug_features + + def __len__(self) -> int: + """Return the length of the dataset. + + :return: Length of the dataset + """ + return len(self.response) + + def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Get a single item from the dataset. + + :param idx: Index of the item + :return: Tuple of (gene_features, drug_features, response) + """ + cell_line_id = self.cell_line_ids[idx] + drug_id = self.drug_ids[idx] + + gene_features = torch.tensor( + self.cell_line_features.features[cell_line_id]["gene_expression"], dtype=torch.float32 + ) + drug_features = torch.tensor(self.drug_features.features[drug_id]["bpe_smiles"], dtype=torch.float32) + response = torch.tensor(self.response[idx], dtype=torch.float32) + + return gene_features, drug_features, response + + +class PharmaFormerModel(DRPModel): + """PharmaFormer model for drug response prediction.""" + + cell_line_views = ["gene_expression"] + drug_views = ["bpe_smiles"] + early_stopping = True + + def __init__(self) -> None: + """Initialize the PharmaFormer model.""" + super().__init__() + self.DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model: CombinedModel | None = None + self.hyperparameters: dict[str, Any] = {} + self.gene_expression_scaler: StandardScaler | None = None + self.gene_expression_normalizer: MinMaxScaler | None = None + + @classmethod + def get_model_name(cls) -> str: + """ + Get the model name. + + :returns: PharmaFormer + """ + return "PharmaFormer" + + def build_model(self, hyperparameters: dict[str, Any]) -> None: + """ + Builds the PharmaFormer model with the specified hyperparameters. + + :param hyperparameters: Model hyperparameters including gene_hidden_size, drug_hidden_size, + feature_dim, nhead, num_layers, dim_feedforward, dropout, batch_size, lr, epochs, patience + """ + self.hyperparameters = hyperparameters + # Model will be built in train() when we know the input dimensions + + def train( + self, + output: DrugResponseDataset, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + output_earlystopping: DrugResponseDataset | None = None, + model_checkpoint_dir: str = "checkpoints", + ) -> None: + """ + Trains the model. + + :param output: training data associated with the response output + :param cell_line_input: input data associated with the cell line + :param drug_input: input data associated with the drug + :param output_earlystopping: early stopping data associated with the response output + :param model_checkpoint_dir: directory to save the model checkpoint + :raises ValueError: if drug_input is None or if early stopping data is missing + """ + if drug_input is None: + raise ValueError("PharmaFormer model requires drug features.") + if output_earlystopping is None: + raise ValueError("PharmaFormer model requires early stopping data.") + + # Get feature dimensions + train_gene_features = cell_line_input.get_feature_matrix( + view="gene_expression", identifiers=output.cell_line_ids + ) + gene_input_size = train_gene_features.shape[1] + + # Standardize and normalize gene expression (matching original PharmaFormer) + self.gene_expression_scaler = StandardScaler() + self.gene_expression_normalizer = MinMaxScaler() + + train_gene_scaled = self.gene_expression_scaler.fit_transform(train_gene_features) + self.gene_expression_normalizer.fit_transform(train_gene_scaled) + + # Apply transformations to all gene expression features + cell_line_input = cell_line_input.copy() + for cell_line_id in cell_line_input.features: + gene_expr = cell_line_input.features[cell_line_id]["gene_expression"] + gene_expr_scaled = self.gene_expression_scaler.transform(gene_expr.reshape(1, -1)) + gene_expr_normalized = self.gene_expression_normalizer.transform(gene_expr_scaled) + cell_line_input.features[cell_line_id]["gene_expression"] = gene_expr_normalized.flatten() + + # Build model with known input dimensions + self.model = CombinedModel( + gene_input_size=gene_input_size, + gene_hidden_size=self.hyperparameters["gene_hidden_size"], + drug_hidden_size=self.hyperparameters["drug_hidden_size"], + feature_dim=self.hyperparameters["feature_dim"], + nhead=self.hyperparameters["nhead"], + num_layers=self.hyperparameters.get("num_layers", 3), + dim_feedforward=self.hyperparameters.get("dim_feedforward", 2048), + dropout=self.hyperparameters.get("dropout", 0.1), + ).to(self.DEVICE) + + loss_func = nn.MSELoss() + optimizer = optim.Adam(self.model.parameters(), lr=self.hyperparameters["lr"]) + + # Create datasets + train_dataset = _PharmaFormerDataset( + response=output.response, + cell_line_ids=output.cell_line_ids, + drug_ids=output.drug_ids, + cell_line_features=cell_line_input, + drug_features=drug_input, + ) + early_stopping_dataset = _PharmaFormerDataset( + response=output_earlystopping.response, + cell_line_ids=output_earlystopping.cell_line_ids, + drug_ids=output_earlystopping.drug_ids, + cell_line_features=cell_line_input, + drug_features=drug_input, + ) + + train_loader = DataLoader( + train_dataset, + batch_size=self.hyperparameters["batch_size"], + shuffle=True, + ) + early_stopping_loader = DataLoader( + early_stopping_dataset, + batch_size=self.hyperparameters["batch_size"], + shuffle=False, + ) + + # Early stopping parameters + best_val_loss = float("inf") + epochs_without_improvement = 0 + + # Ensure the checkpoint directory exists + os.makedirs(model_checkpoint_dir, exist_ok=True) + version = "version-" + "".join([secrets.choice("0123456789abcdef") for _ in range(20)]) + + checkpoint_path = os.path.join(model_checkpoint_dir, f"{version}_best_PharmaFormer_model.pth") + + # Train model + print("Training PharmaFormer model") + for epoch in range(self.hyperparameters["epochs"]): + self.model.train() + epoch_loss = 0.0 + batch_count = 0 + + # Training phase + for gene_inputs, smiles_inputs, targets in train_loader: + gene_inputs = gene_inputs.to(self.DEVICE) + smiles_inputs = smiles_inputs.to(self.DEVICE) + targets = targets.to(self.DEVICE) + + # Forward pass + outputs = self.model(gene_inputs, smiles_inputs) + loss = loss_func(outputs.squeeze(), targets) + + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + epoch_loss += loss.detach().item() + batch_count += 1 + + epoch_loss /= batch_count + print(f"PharmaFormer: Epoch [{epoch + 1}/{self.hyperparameters['epochs']}] Training Loss: {epoch_loss:.4f}") + + # Validation phase for early stopping + self.model.eval() + val_loss = 0.0 + val_batch_count = 0 + with torch.no_grad(): + for gene_inputs, smiles_inputs, targets in early_stopping_loader: + gene_inputs = gene_inputs.to(self.DEVICE) + smiles_inputs = smiles_inputs.to(self.DEVICE) + targets = targets.to(self.DEVICE) + + outputs = self.model(gene_inputs, smiles_inputs) + loss = loss_func(outputs.squeeze(), targets) + + val_loss += loss.item() + val_batch_count += 1 + + val_loss /= val_batch_count + print(f"PharmaFormer: Epoch [{epoch + 1}/{self.hyperparameters['epochs']}] Validation Loss: {val_loss:.4f}") + + # Checkpointing: Save the best model + if val_loss < best_val_loss: + best_val_loss = val_loss + epochs_without_improvement = 0 + torch.save(self.model.state_dict(), checkpoint_path) # noqa: S614 + print(f"PharmaFormer: Saved best model at epoch {epoch + 1}") + else: + epochs_without_improvement += 1 + if epochs_without_improvement >= self.hyperparameters.get("patience", 10): + print(f"PharmaFormer: Early stopping triggered at epoch {epoch + 1}") + break + + # Reload the best model after training + print("PharmaFormer: Reloading the best model") + self.model.load_state_dict( + torch.load(checkpoint_path, map_location=self.DEVICE, weights_only=True) + ) # noqa: S614 + self.model.to(self.DEVICE) + + def predict( + self, + cell_line_ids: np.ndarray, + drug_ids: np.ndarray, + cell_line_input: FeatureDataset, + drug_input: FeatureDataset | None = None, + ) -> np.ndarray: + """ + Predicts the response values for the given cell lines and drugs. + + :param cell_line_ids: list of cell line IDs + :param drug_ids: list of drug IDs + :param cell_line_input: input data associated with the cell line + :param drug_input: input data associated with the drug + :return: predicted response values + :raises ValueError: if drug_input is None or if the model is not initialized + """ + if drug_input is None: + raise ValueError("PharmaFormer model requires drug features.") + if self.model is None: + raise ValueError("PharmaFormer model not initialized.") + + # Apply transformations to gene expression if scalers are available + if self.gene_expression_scaler is not None and self.gene_expression_normalizer is not None: + cell_line_input = cell_line_input.copy() + for cell_line_id in cell_line_ids: + if cell_line_id in cell_line_input.features: + gene_expr = cell_line_input.features[cell_line_id]["gene_expression"] + gene_expr_scaled = self.gene_expression_scaler.transform(gene_expr.reshape(1, -1)) + gene_expr_normalized = self.gene_expression_normalizer.transform(gene_expr_scaled) + cell_line_input.features[cell_line_id]["gene_expression"] = gene_expr_normalized.flatten() + + # Create dataset + predict_dataset = _PharmaFormerDataset( + response=np.zeros(len(cell_line_ids)), + cell_line_ids=cell_line_ids, + drug_ids=drug_ids, + cell_line_features=cell_line_input, + drug_features=drug_input, + ) + + predict_loader = DataLoader( + predict_dataset, batch_size=self.hyperparameters.get("batch_size", 64), shuffle=False + ) + + # Run prediction + self.model.eval() + predictions = [] + with torch.no_grad(): + for gene_inputs, smiles_inputs, _ in predict_loader: + gene_inputs = gene_inputs.to(self.DEVICE) + smiles_inputs = smiles_inputs.to(self.DEVICE) + + outputs = self.model(gene_inputs, smiles_inputs) + if outputs.numel() > 1: + predictions += outputs.squeeze().cpu().tolist() + else: + predictions += [outputs.item()] + + return np.array(predictions) + + def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset: + """ + Load cell line features. + + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: cell line features + """ + return load_and_select_gene_features( + feature_type="gene_expression", + gene_list="landmark_genes_reduced", + data_path=data_path, + dataset_name=dataset_name, + ) + + def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset: + """ + Load drug features (BPE-encoded SMILES). + + :param data_path: path to the data + :param dataset_name: name of the dataset + :returns: drug features + :raises FileNotFoundError: if the BPE SMILES file is not found + """ + bpe_smiles_file = os.path.join(data_path, dataset_name, "drug_bpe_smiles.csv") + if not os.path.exists(bpe_smiles_file): + raise FileNotFoundError( + f"BPE SMILES file not found: {bpe_smiles_file}. " + "Please run the BPE featurizer first: " + "python -m drevalpy.datasets.featurizer.create_bpe_smiles_embeddings " + ) + + bpe_df = pd.read_csv(bpe_smiles_file, dtype={"pubchem_id": str}) + features = {} + for _, row in bpe_df.iterrows(): + drug_id = row["pubchem_id"] + # Extract all feature columns (excluding pubchem_id) + embedding = row.drop("pubchem_id").values.astype(np.float32) + features[drug_id] = {"bpe_smiles": embedding} + + return FeatureDataset(features) + + def save(self, directory: str) -> None: + """ + Save the PharmaFormer model using PyTorch conventions. + + This method stores: + - "pharmaformer_model.pt": PyTorch state_dict of the model + - "hyperparameters.json": All hyperparameters + - "gene_scaler.pkl": Fitted StandardScaler for gene expression + - "gene_normalizer.pkl": Fitted MinMaxScaler for gene expression + + :param directory: Target directory where the model files will be saved + :raises ValueError: If model is not built + """ + import joblib + + os.makedirs(directory, exist_ok=True) + if self.model is None: + raise ValueError("Cannot save model: model is not built.") + + model = cast(CombinedModel, self.model) + + torch.save(model.state_dict(), os.path.join(directory, "pharmaformer_model.pt")) # noqa: S614 + + # Save hyperparameters including gene_input_size + save_hyperparameters = self.hyperparameters.copy() + if self.model is not None: + # Extract gene_input_size from the model + save_hyperparameters["gene_input_size"] = self.model.feature_extractor.gene_fc1.in_features + + with open(os.path.join(directory, "hyperparameters.json"), "w") as f: + json.dump(save_hyperparameters, f) + + if self.gene_expression_scaler is not None: + joblib.dump(self.gene_expression_scaler, os.path.join(directory, "gene_scaler.pkl")) + if self.gene_expression_normalizer is not None: + joblib.dump(self.gene_expression_normalizer, os.path.join(directory, "gene_normalizer.pkl")) + + @classmethod + def load(cls, directory: str) -> "PharmaFormerModel": + """ + Load the PharmaFormer model using PyTorch conventions. + + This method expects the following files in the given directory: + - "pharmaformer_model.pt": PyTorch state_dict of the model + - "hyperparameters.json": Dictionary of hyperparameters + - "gene_scaler.pkl": Fitted StandardScaler (optional) + - "gene_normalizer.pkl": Fitted MinMaxScaler (optional) + + :param directory: Path to the directory containing the model files + :return: An instance of PharmaFormerModel with loaded model + """ + import joblib + + instance = cls() + + with open(os.path.join(directory, "hyperparameters.json")) as f: + instance.hyperparameters = json.load(f) + + # Load scalers if they exist + scaler_path = os.path.join(directory, "gene_scaler.pkl") + normalizer_path = os.path.join(directory, "gene_normalizer.pkl") + if os.path.exists(scaler_path): + instance.gene_expression_scaler = joblib.load(scaler_path) + if os.path.exists(normalizer_path): + instance.gene_expression_normalizer = joblib.load(normalizer_path) + + # Model will be built when needed (requires input dimensions) + # For now, we'll need to rebuild it with the saved hyperparameters + # This requires knowing the gene_input_size, which should be saved in hyperparameters + if "gene_input_size" in instance.hyperparameters: + instance.model = CombinedModel( + gene_input_size=instance.hyperparameters["gene_input_size"], + gene_hidden_size=instance.hyperparameters["gene_hidden_size"], + drug_hidden_size=instance.hyperparameters["drug_hidden_size"], + feature_dim=instance.hyperparameters["feature_dim"], + nhead=instance.hyperparameters["nhead"], + num_layers=instance.hyperparameters.get("num_layers", 3), + dim_feedforward=instance.hyperparameters.get("dim_feedforward", 2048), + dropout=instance.hyperparameters.get("dropout", 0.1), + ).to(instance.DEVICE) + + instance.model.load_state_dict( + torch.load(os.path.join(directory, "pharmaformer_model.pt"), map_location=instance.DEVICE) # noqa: S614 + ) + instance.model.eval() + + return instance diff --git a/drevalpy/models/__init__.py b/drevalpy/models/__init__.py index c7e5d6ec..5ecf2e4f 100644 --- a/drevalpy/models/__init__.py +++ b/drevalpy/models/__init__.py @@ -29,6 +29,7 @@ "SingleDrugProteomicsRandomForest", "DrugGNN", "ChemBERTaNeuralNetwork", + "PharmaFormerModel", ] from .baselines.multi_omics_random_forest import MultiOmicsRandomForest @@ -54,6 +55,7 @@ from .drp_model import DRPModel from .DrugGNN import DrugGNN from .MOLIR.molir import MOLIR +from .PharmaFormer.pharmaformer import PharmaFormerModel from .SimpleNeuralNetwork.multiomics_neural_network import MultiOmicsNeuralNetwork from .SimpleNeuralNetwork.simple_neural_network import ChemBERTaNeuralNetwork, SimpleNeuralNetwork from .SRMF.srmf import SRMF @@ -90,6 +92,7 @@ "ProteomicsElasticNet": ProteomicsElasticNetModel, "DrugGNN": DrugGNN, "ChemBERTaNeuralNetwork": ChemBERTaNeuralNetwork, + "PharmaFormer": PharmaFormerModel, } # MODEL_FACTORY is used in the pipeline! diff --git a/poetry.lock b/poetry.lock index 72534c7c..123f027c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -915,6 +915,7 @@ python-versions = "<3.14,>=3.11" groups = ["main"] files = [ {file = "curve_curator-0.6.0-py3-none-any.whl", hash = "sha256:c4345ff856e6de68826fd80536731989e88d9910ba81b00a8ffb5665e318f7e2"}, + {file = "curve_curator-0.6.0.tar.gz", hash = "sha256:d323dbf900a2c390de7e4ee744a74268d328f917decb277216043ffabccd1a92"}, ] [package.dependencies] @@ -2192,6 +2193,23 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "mock" +version = "5.2.0" +description = "Rolling backport of unittest.mock for all Pythons" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "mock-5.2.0-py3-none-any.whl", hash = "sha256:7ba87f72ca0e915175596069dbbcc7c75af7b5e9b9bc107ad6349ede0819982f"}, + {file = "mock-5.2.0.tar.gz", hash = "sha256:4e460e818629b4b173f32d08bf30d3af8123afbb8e04bb5707a1fd4799e503f0"}, +] + +[package.extras] +build = ["blurb", "twine", "wheel"] +docs = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "more-itertools" version = "10.8.0" @@ -5023,6 +5041,22 @@ files = [ {file = "stevedore-5.6.0.tar.gz", hash = "sha256:f22d15c6ead40c5bbfa9ca54aa7e7b4a07d59b36ae03ed12ced1a54cf0b51945"}, ] +[[package]] +name = "subword-nmt" +version = "0.3.8" +description = "Unsupervised Word Segmentation for Neural Machine Translation and Text Generation" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "subword_nmt-0.3.8-py3-none-any.whl", hash = "sha256:d22526b557752f35ac15e8ea384ea7773e50a51d966b8752d023d16cb87eac36"}, + {file = "subword_nmt-0.3.8.tar.gz", hash = "sha256:3964c66b37712ca1d9fb9a1a6ff7e57c9ab72d838813da3e9a1d4d4997f4fb75"}, +] + +[package.dependencies] +mock = "*" +tqdm = "*" + [[package]] name = "sympy" version = "1.14.0" @@ -6138,4 +6172,4 @@ multiprocessing = ["pydantic", "ray"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "6b4b90e15caa03310435b840d16c90f8fbfa11d992ffdbd94c3fbdbb46d4a3bf" +content-hash = "cb7bf71d59b895c0cfbe611d73c357cf78478175d923b41ae5a3565aaf47437e" diff --git a/pyproject.toml b/pyproject.toml index 328a9a5e..02886951 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ matplotlib = "*" importlib-resources = "*" scikit-posthocs = "*" curve-curator = "*" +subword-nmt = ">=0.3.8" toml = {version = "^0.10.2"} poetry = "^2.0.1" starlette = ">=0.49.1" diff --git a/tests/models/test_global_models.py b/tests/models/test_global_models.py index 00ddce99..69e20b2b 100644 --- a/tests/models/test_global_models.py +++ b/tests/models/test_global_models.py @@ -17,7 +17,15 @@ @pytest.mark.parametrize("test_mode", ["LTO"]) @pytest.mark.parametrize( "model_name", - ["DrugGNN", "ChemBERTaNeuralNetwork", "SRMF", "DIPK", "SimpleNeuralNetwork", "MultiOmicsNeuralNetwork"], + [ + "DrugGNN", + "ChemBERTaNeuralNetwork", + "SRMF", + "DIPK", + "SimpleNeuralNetwork", + "MultiOmicsNeuralNetwork", + "PharmaFormer", + ], ) def test_global_models( sample_dataset: DrugResponseDataset, @@ -68,6 +76,9 @@ def test_global_models( elif model_name in ["SimpleNeuralNetwork", "MultiOmicsNeuralNetwork"]: hpam_combi["units_per_layer"] = [2, 2] hpam_combi["max_epochs"] = 1 + elif model_name == "PharmaFormer": + hpam_combi["epochs"] = 1 + hpam_combi["patience"] = 2 model.build_model(hyperparameters=hpam_combi) with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/test_featurizers.py b/tests/test_featurizers.py index 5c38b613..cf93a723 100644 --- a/tests/test_featurizers.py +++ b/tests/test_featurizers.py @@ -110,3 +110,43 @@ def test_molgnet_featurizer(tmp_path): # verify outputs assert (ds_dir / "DIPK_features/Drugs" / "MolGNet_D1.csv").exists() + + +def test_bpe_smiles_featurizer(tmp_path): + """ + Test BPE SMILES featurizer end-to-end. + + :param tmp_path: Temporary path provided by pytest. + """ + try: + import drevalpy.datasets.featurizer.create_bpe_smiles_embeddings as bpe_feat + except ImportError: + print("subword-nmt package not installed; skipping BPE SMILES featurizer test.") + return + dataset = "testset" + data_dir = tmp_path / dataset + data_dir.mkdir(parents=True) + + # write minimal SMILES CSV + df = pd.DataFrame({"pubchem_id": ["D1"], "canonical_smiles": ["CCO"]}) + (data_dir / "drug_smiles.csv").write_text(df.to_csv(index=False)) + + # run main exactly as the script would + sys.argv = ["prog", dataset, "--data_path", str(tmp_path), "--num-symbols", "100", "--max-length", "128"] + bpe_feat.main() + + # expected output files + out_file = data_dir / "drug_bpe_smiles.csv" + bpe_codes_file = data_dir / "bpe.codes" + assert out_file.exists() + assert bpe_codes_file.exists() + + # verify output format + df_out = pd.read_csv(out_file) + assert "pubchem_id" in df_out.columns + assert df_out.pubchem_id.tolist() == ["D1"] + # Should have 128 feature columns + feature_cols = [col for col in df_out.columns if col.startswith("feature_")] + assert len(feature_cols) == 128 + # Values should be numeric (character ordinals, may be stored as float in CSV) + assert pd.api.types.is_numeric_dtype(df_out[feature_cols[0]]) From 14233fbf5c8482888e682d5da769bc028a5407d5 Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 12:47:11 +0000 Subject: [PATCH 02/11] Remove gene list from pharmaFormer loader --- drevalpy/models/PharmaFormer/pharmaformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drevalpy/models/PharmaFormer/pharmaformer.py b/drevalpy/models/PharmaFormer/pharmaformer.py index 385411f1..f8a75bea 100644 --- a/drevalpy/models/PharmaFormer/pharmaformer.py +++ b/drevalpy/models/PharmaFormer/pharmaformer.py @@ -340,7 +340,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD """ return load_and_select_gene_features( feature_type="gene_expression", - gene_list="landmark_genes_reduced", + gene_list=None, data_path=data_path, dataset_name=dataset_name, ) From 7758ee11180edecb1a42e98c50dd24157d8ef9be Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 13:02:37 +0000 Subject: [PATCH 03/11] Add citation --- drevalpy/models/PharmaFormer/pharmaformer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drevalpy/models/PharmaFormer/pharmaformer.py b/drevalpy/models/PharmaFormer/pharmaformer.py index f8a75bea..a91d415b 100644 --- a/drevalpy/models/PharmaFormer/pharmaformer.py +++ b/drevalpy/models/PharmaFormer/pharmaformer.py @@ -1,4 +1,12 @@ -"""PharmaFormer model for drug response prediction.""" +""" +Contains PharmaFormer, a transformer-based deep learning model for drug response prediction. + +A Transformer-based deep learning model designed to predict clinical drug responses +by integrating gene expression profiles and drug molecular structures. + +Original authors: Zhou et al. (2025, 10.1038/s41698-025-01082-6) +Code adapted from their Github: https://github.com/zhouyuru1205/PharmaFormer +""" import json import os From c565e480f0a2eba35b41c1948150521fffc35b73 Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 13:10:50 +0000 Subject: [PATCH 04/11] Fix BPE tests --- .../featurizer/create_bpe_smiles_embeddings.py | 2 +- tests/test_featurizers.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py index 6a13dc79..0aed5cd5 100644 --- a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py +++ b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py @@ -120,7 +120,7 @@ def main(): """Process drug SMILES and save BPE-encoded embeddings.""" parser = argparse.ArgumentParser(description="Preprocess drug SMILES to BPE-encoded embeddings.") parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.") - parser.add_argument("--data-path", type=str, default="data", help="Path to the data folder") + parser.add_argument("--data_path", type=str, default="data", help="Path to the data folder") parser.add_argument("--num-symbols", type=int, default=10000, help="Number of BPE symbols to learn") parser.add_argument("--max-length", type=int, default=128, help="Maximum length of encoded SMILES") args = parser.parse_args() diff --git a/tests/test_featurizers.py b/tests/test_featurizers.py index cf93a723..0d1f5f67 100644 --- a/tests/test_featurizers.py +++ b/tests/test_featurizers.py @@ -127,8 +127,13 @@ def test_bpe_smiles_featurizer(tmp_path): data_dir = tmp_path / dataset data_dir.mkdir(parents=True) - # write minimal SMILES CSV - df = pd.DataFrame({"pubchem_id": ["D1"], "canonical_smiles": ["CCO"]}) + # write minimal SMILES CSV with multiple SMILES for BPE learning + df = pd.DataFrame( + { + "pubchem_id": ["D1", "D2", "D3", "D4", "D5"], + "canonical_smiles": ["CCO", "CC(=O)O", "c1ccccc1", "CCN(CC)CC", "C1CCC(CC1)O"], + } + ) (data_dir / "drug_smiles.csv").write_text(df.to_csv(index=False)) # run main exactly as the script would @@ -144,7 +149,7 @@ def test_bpe_smiles_featurizer(tmp_path): # verify output format df_out = pd.read_csv(out_file) assert "pubchem_id" in df_out.columns - assert df_out.pubchem_id.tolist() == ["D1"] + assert df_out.pubchem_id.tolist() == ["D1", "D2", "D3", "D4", "D5"] # Should have 128 feature columns feature_cols = [col for col in df_out.columns if col.startswith("feature_")] assert len(feature_cols) == 128 From 360dd2d73b6dd2e6cb4bb8cd1ac5947fa28c146b Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 13:23:42 +0000 Subject: [PATCH 05/11] Use consistent gene set --- drevalpy/models/PharmaFormer/pharmaformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drevalpy/models/PharmaFormer/pharmaformer.py b/drevalpy/models/PharmaFormer/pharmaformer.py index a91d415b..4062a457 100644 --- a/drevalpy/models/PharmaFormer/pharmaformer.py +++ b/drevalpy/models/PharmaFormer/pharmaformer.py @@ -348,7 +348,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD """ return load_and_select_gene_features( feature_type="gene_expression", - gene_list=None, + gene_list="landmark_genes_reduced", data_path=data_path, dataset_name=dataset_name, ) From 627813e72eb7d9cb35b1b52191e8cfab9c16ccfa Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 13:34:38 +0000 Subject: [PATCH 06/11] Fix mypy --- drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py index 0aed5cd5..adc7cb25 100644 --- a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py +++ b/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py @@ -66,7 +66,7 @@ def create_bpe_smiles_embeddings( # Learn BPE codes from SMILES corpus try: with codecs.open(tmp_smiles_file, encoding="utf-8") as f_in: - with codecs.open(bpe_codes_path, "w", encoding="utf-8") as f_out: + with codecs.open(str(bpe_codes_path), "w", encoding="utf-8") as f_out: learn_bpe(f_in, f_out, num_symbols=num_symbols) finally: # Clean up temporary file @@ -76,7 +76,7 @@ def create_bpe_smiles_embeddings( print(f"BPE codes saved to {bpe_codes_path}") # Load BPE encoder - with codecs.open(bpe_codes_path, encoding="utf-8") as f_in: + with codecs.open(str(bpe_codes_path), encoding="utf-8") as f_in: bpe = BPE(f_in) # Encode each SMILES string From 2b6f15482a780b2096c206b72edd0e5e50b8fbc3 Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 13:43:40 +0000 Subject: [PATCH 07/11] Make sure BPE embeddings are created before tests --- tests/conftest.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index d99983fb..4a5ebfa1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,3 +47,42 @@ def cross_study_dataset() -> DrugResponseDataset: drug_response = load_toyv2(path_data) drug_response.remove_nan_responses() return drug_response + + +@pytest.fixture(scope="session", autouse=True) +def ensure_bpe_features() -> None: + """ + Ensure BPE SMILES features are created for TOYv1 and TOYv2 before tests run. + + This fixture runs automatically before any tests to ensure that PharmaFormer + and other models requiring BPE features have the necessary data available. + """ + path_data = str((pathlib.Path("..") / "data").resolve()) + + try: + from drevalpy.datasets.featurizer.create_bpe_smiles_embeddings import ( + create_bpe_smiles_embeddings, + ) + except ImportError: + # If subword-nmt is not installed, skip BPE feature creation + # Tests that require BPE features will fail with a clear error message + return + + # Create BPE features for both TOYv1 and TOYv2 + for dataset_name in ["TOYv1", "TOYv2"]: + bpe_smiles_file = pathlib.Path(path_data) / dataset_name / "drug_bpe_smiles.csv" + + # Only create if it doesn't exist + if not bpe_smiles_file.exists(): + try: + print(f"Creating BPE SMILES features for {dataset_name}...") + create_bpe_smiles_embeddings( + data_path=path_data, + dataset_name=dataset_name, + num_symbols=10000, + max_length=128, + ) + print(f"BPE SMILES features created for {dataset_name}") + except Exception as e: + # Log but don't fail - let individual tests handle missing features + print(f"Warning: Could not create BPE features for {dataset_name}: {e}") From e3d7b32d159e61f20c510d092775e76a17e67fb7 Mon Sep 17 00:00:00 2001 From: nictru Date: Mon, 12 Jan 2026 14:06:58 +0000 Subject: [PATCH 08/11] Next attempt of adding BP encoding before tests --- tests/conftest.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4a5ebfa1..0c26a976 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,9 @@ def ensure_bpe_features() -> None: This fixture runs automatically before any tests to ensure that PharmaFormer and other models requiring BPE features have the necessary data available. """ - path_data = str((pathlib.Path("..") / "data").resolve()) + # Ensure we're in the tests directory (pytest_configure should have done this) + tests_dir = pathlib.Path(__file__).parent.resolve() + path_data = str((tests_dir.parent / "data").resolve()) try: from drevalpy.datasets.featurizer.create_bpe_smiles_embeddings import ( @@ -68,12 +70,27 @@ def ensure_bpe_features() -> None: # Tests that require BPE features will fail with a clear error message return + # Ensure datasets are loaded first (this will download them if needed) + try: + load_toyv1(path_data) + load_toyv2(path_data) + except Exception as e: + # If dataset loading fails, skip BPE creation + print(f"Warning: Could not load datasets for BPE feature creation: {e}") + return + # Create BPE features for both TOYv1 and TOYv2 for dataset_name in ["TOYv1", "TOYv2"]: - bpe_smiles_file = pathlib.Path(path_data) / dataset_name / "drug_bpe_smiles.csv" + dataset_dir = pathlib.Path(path_data) / dataset_name + bpe_smiles_file = dataset_dir / "drug_bpe_smiles.csv" + smiles_file = dataset_dir / "drug_smiles.csv" - # Only create if it doesn't exist + # Only create if it doesn't exist and if drug_smiles.csv exists if not bpe_smiles_file.exists(): + if not smiles_file.exists(): + print(f"Warning: drug_smiles.csv not found for {dataset_name}, skipping BPE creation") + continue + try: print(f"Creating BPE SMILES features for {dataset_name}...") create_bpe_smiles_embeddings( @@ -86,3 +103,6 @@ def ensure_bpe_features() -> None: except Exception as e: # Log but don't fail - let individual tests handle missing features print(f"Warning: Could not create BPE features for {dataset_name}: {e}") + import traceback + + traceback.print_exc() From 1204dcb13e4f527af8a2119fabb77f50a385df36 Mon Sep 17 00:00:00 2001 From: nictru Date: Wed, 14 Jan 2026 09:25:57 +0000 Subject: [PATCH 09/11] Add docs --- docs/drevalpy.models.PharmaFormer.rst | 18 ++++++++++++++++++ docs/drevalpy.models.rst | 1 + 2 files changed, 19 insertions(+) create mode 100644 docs/drevalpy.models.PharmaFormer.rst diff --git a/docs/drevalpy.models.PharmaFormer.rst b/docs/drevalpy.models.PharmaFormer.rst new file mode 100644 index 00000000..2adf093f --- /dev/null +++ b/docs/drevalpy.models.PharmaFormer.rst @@ -0,0 +1,18 @@ +PharmaFormer +============================= + +PharmaFormer Model +---------------------------------- + +.. automodule:: drevalpy.models.PharmaFormer.pharmaformer + :members: + :undoc-members: + :show-inheritance: + +Model utils +---------------------------------- + +.. automodule:: drevalpy.models.PharmaFormer.model_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/drevalpy.models.rst b/docs/drevalpy.models.rst index 55a21d41..7fca1fa2 100644 --- a/docs/drevalpy.models.rst +++ b/docs/drevalpy.models.rst @@ -26,6 +26,7 @@ Implemented models drevalpy.models.DIPK drevalpy.models.MOLIR + drevalpy.models.PharmaFormer drevalpy.models.SRMF drevalpy.models.SimpleNeuralNetwork drevalpy.models.SuperFELTR From 6d05984b397778f88304d150094893850dffb99b Mon Sep 17 00:00:00 2001 From: nictru Date: Wed, 14 Jan 2026 09:26:16 +0000 Subject: [PATCH 10/11] Add warning for featurizer --- ...=> create_pharmaformer_drug_embeddings.py} | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) rename drevalpy/datasets/featurizer/{create_bpe_smiles_embeddings.py => create_pharmaformer_drug_embeddings.py} (77%) diff --git a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py b/drevalpy/datasets/featurizer/create_pharmaformer_drug_embeddings.py similarity index 77% rename from drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py rename to drevalpy/datasets/featurizer/create_pharmaformer_drug_embeddings.py index adc7cb25..3100affa 100644 --- a/drevalpy/datasets/featurizer/create_bpe_smiles_embeddings.py +++ b/drevalpy/datasets/featurizer/create_pharmaformer_drug_embeddings.py @@ -1,4 +1,13 @@ -"""Preprocesses drug SMILES strings into BPE-encoded embeddings.""" +"""Preprocesses drug SMILES strings into BPE-encoded embeddings. + +WARNING: This featurizer produces problematic embeddings and should ONLY be used +with the PharmaFormer model. It replicates the original PharmaFormer implementation +for compatibility, but the embeddings have known issues and should not be used +for any other models. + +Details about the issues are explained in: +https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948 +""" import argparse import codecs @@ -17,7 +26,7 @@ raise ImportError("Please install subword-nmt package for BPE SMILES featurizer: pip install subword-nmt") -def create_bpe_smiles_embeddings( +def create_pharmaformer_drug_embeddings( data_path: str, dataset_name: str, num_symbols: int = 10000, @@ -26,6 +35,15 @@ def create_bpe_smiles_embeddings( """ Create BPE-encoded SMILES embeddings for drugs. + WARNING: This featurizer produces problematic embeddings and should ONLY be used + with the PharmaFormer model. It replicates the original PharmaFormer implementation + for compatibility purposes, but the embeddings have known issues and should NOT + be used for any other models. + + Details about the issues are explained in: + https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948 + + Process: 1. Read drug_smiles.csv 2. Learn BPE codes from all SMILES strings 3. Apply BPE to each SMILES @@ -117,7 +135,16 @@ def create_bpe_smiles_embeddings( def main(): - """Process drug SMILES and save BPE-encoded embeddings.""" + """Process drug SMILES and save BPE-encoded embeddings. + + WARNING: This featurizer produces problematic embeddings and should ONLY be used + with the PharmaFormer model. It replicates the original PharmaFormer implementation + for compatibility purposes, but the embeddings have known issues and should NOT + be used for any other models. + + Details about the issues are explained in: + https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948 + """ parser = argparse.ArgumentParser(description="Preprocess drug SMILES to BPE-encoded embeddings.") parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.") parser.add_argument("--data_path", type=str, default="data", help="Path to the data folder") @@ -125,7 +152,7 @@ def main(): parser.add_argument("--max-length", type=int, default=128, help="Maximum length of encoded SMILES") args = parser.parse_args() - create_bpe_smiles_embeddings( + create_pharmaformer_drug_embeddings( data_path=args.data_path, dataset_name=args.dataset_name, num_symbols=args.num_symbols, From 96118286366cf2af3cc46be45bf4ddc96f59f079 Mon Sep 17 00:00:00 2001 From: nictru Date: Wed, 14 Jan 2026 09:28:17 +0000 Subject: [PATCH 11/11] Update pharmaformer drug featurizer references --- drevalpy/models/PharmaFormer/pharmaformer.py | 2 +- tests/conftest.py | 6 +++--- tests/test_featurizers.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drevalpy/models/PharmaFormer/pharmaformer.py b/drevalpy/models/PharmaFormer/pharmaformer.py index 4062a457..b6eaf647 100644 --- a/drevalpy/models/PharmaFormer/pharmaformer.py +++ b/drevalpy/models/PharmaFormer/pharmaformer.py @@ -367,7 +367,7 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase raise FileNotFoundError( f"BPE SMILES file not found: {bpe_smiles_file}. " "Please run the BPE featurizer first: " - "python -m drevalpy.datasets.featurizer.create_bpe_smiles_embeddings " + "python -m drevalpy.datasets.featurizer.create_pharmaformer_drug_embeddings " ) bpe_df = pd.read_csv(bpe_smiles_file, dtype={"pubchem_id": str}) diff --git a/tests/conftest.py b/tests/conftest.py index 0c26a976..48fa7703 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -62,8 +62,8 @@ def ensure_bpe_features() -> None: path_data = str((tests_dir.parent / "data").resolve()) try: - from drevalpy.datasets.featurizer.create_bpe_smiles_embeddings import ( - create_bpe_smiles_embeddings, + from drevalpy.datasets.featurizer.create_pharmaformer_drug_embeddings import ( + create_pharmaformer_drug_embeddings, ) except ImportError: # If subword-nmt is not installed, skip BPE feature creation @@ -93,7 +93,7 @@ def ensure_bpe_features() -> None: try: print(f"Creating BPE SMILES features for {dataset_name}...") - create_bpe_smiles_embeddings( + create_pharmaformer_drug_embeddings( data_path=path_data, dataset_name=dataset_name, num_symbols=10000, diff --git a/tests/test_featurizers.py b/tests/test_featurizers.py index 0d1f5f67..a2db9b69 100644 --- a/tests/test_featurizers.py +++ b/tests/test_featurizers.py @@ -119,7 +119,7 @@ def test_bpe_smiles_featurizer(tmp_path): :param tmp_path: Temporary path provided by pytest. """ try: - import drevalpy.datasets.featurizer.create_bpe_smiles_embeddings as bpe_feat + import drevalpy.datasets.featurizer.create_pharmaformer_drug_embeddings as bpe_feat except ImportError: print("subword-nmt package not installed; skipping BPE SMILES featurizer test.") return