18 changes: 18 additions & 0 deletions docs/drevalpy.models.PharmaFormer.rst
@@ -0,0 +1,18 @@
PharmaFormer
=============================

PharmaFormer Model
----------------------------------

.. automodule:: drevalpy.models.PharmaFormer.pharmaformer
:members:
:undoc-members:
:show-inheritance:

Model utils
----------------------------------

.. automodule:: drevalpy.models.PharmaFormer.model_utils
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/drevalpy.models.rst
@@ -26,6 +26,7 @@ Implemented models

drevalpy.models.DIPK
drevalpy.models.MOLIR
drevalpy.models.PharmaFormer
drevalpy.models.SRMF
drevalpy.models.SimpleNeuralNetwork
drevalpy.models.SuperFELTR
164 changes: 164 additions & 0 deletions drevalpy/datasets/featurizer/create_pharmaformer_drug_embeddings.py
@@ -0,0 +1,164 @@
"""Preprocesses drug SMILES strings into BPE-encoded embeddings.

WARNING: This featurizer produces problematic embeddings and should ONLY be used
with the PharmaFormer model. It replicates the original PharmaFormer implementation
for compatibility, but the embeddings have known issues and should not be used
for any other models.

Details about the issues are explained in:
https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948
"""

import argparse
import codecs
import os
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

try:
from subword_nmt.apply_bpe import BPE
from subword_nmt.learn_bpe import learn_bpe
except ImportError as err:
raise ImportError("Please install the subword-nmt package for the BPE SMILES featurizer: pip install subword-nmt") from err


def create_pharmaformer_drug_embeddings(
data_path: str,
dataset_name: str,
num_symbols: int = 10000,
max_length: int = 128,
) -> None:
"""
Create BPE-encoded SMILES embeddings for drugs.

WARNING: This featurizer produces problematic embeddings and should ONLY be used
with the PharmaFormer model. It replicates the original PharmaFormer implementation
for compatibility purposes, but the embeddings have known issues and should NOT
be used for any other models.

Details about the issues are explained in:
https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948

Process:
1. Read drug_smiles.csv
2. Learn BPE codes from all SMILES strings
3. Apply BPE to each SMILES
4. Convert to character ordinals
5. Pad/truncate to max_length
6. Save to drug_bpe_smiles.csv

:param data_path: Path to the data folder
:param dataset_name: Name of the dataset to process
:param num_symbols: Number of BPE symbols to learn
:param max_length: Maximum length of encoded SMILES (padding/truncation)
:raises FileNotFoundError: If drug_smiles.csv is not found
:raises Exception: If a drug fails to process
"""
data_dir = Path(data_path).resolve()
dataset_dir = data_dir / dataset_name

smiles_file = dataset_dir / "drug_smiles.csv"
bpe_codes_path = dataset_dir / "bpe.codes"
output_file = dataset_dir / "drug_bpe_smiles.csv"

if not smiles_file.exists():
raise FileNotFoundError(f"{smiles_file} not found.")

# Read SMILES data
smiles_df = pd.read_csv(smiles_file, dtype={"canonical_smiles": str, "pubchem_id": str})
smiles_df = smiles_df.dropna(subset=["canonical_smiles"])

print(f"Learning BPE codes from {len(smiles_df)} SMILES strings...")

# Create temporary file with SMILES strings for BPE learning
# learn_bpe expects one item per line
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as tmp_file:
tmp_smiles_file = tmp_file.name
for smiles in smiles_df["canonical_smiles"]:
tmp_file.write(f"{smiles}\n")

# Learn BPE codes from SMILES corpus
try:
with codecs.open(tmp_smiles_file, encoding="utf-8") as f_in:
with codecs.open(str(bpe_codes_path), "w", encoding="utf-8") as f_out:
learn_bpe(f_in, f_out, num_symbols=num_symbols)
finally:
# Clean up temporary file
if os.path.exists(tmp_smiles_file):
os.remove(tmp_smiles_file)

print(f"BPE codes saved to {bpe_codes_path}")

# Load BPE encoder
with codecs.open(str(bpe_codes_path), encoding="utf-8") as f_in:
bpe = BPE(f_in)

# Encode each SMILES string
embeddings_list = []
drug_ids = []

print(f"Encoding {len(smiles_df)} SMILES strings...")

for row in tqdm(smiles_df.itertuples(index=False), total=len(smiles_df)):
drug_id = row.pubchem_id
smiles = row.canonical_smiles

try:
# Apply BPE
bpe_processed = bpe.process_line(smiles)
# Convert to character ordinals
encoded = [ord(char) for char in bpe_processed]
# Pad/truncate to max_length
if len(encoded) > max_length:
encoded = encoded[:max_length]
else:
encoded = np.pad(encoded, (0, max_length - len(encoded)), "constant").tolist()

embeddings_list.append(encoded)
drug_ids.append(drug_id)
except Exception as e:
print(f"\nFailed to process drug {drug_id} with SMILES: {smiles}")
print(f"Error: {e}")
raise

# Create DataFrame with pubchem_id and encoded features
embeddings_df = pd.DataFrame(embeddings_list)
embeddings_df.columns = [f"feature_{i}" for i in range(max_length)]
embeddings_df.insert(0, "pubchem_id", drug_ids)
embeddings_df.to_csv(output_file, index=False)

print(f"Finished processing. BPE-encoded SMILES saved to {output_file}")


def main() -> None:
"""Process drug SMILES and save BPE-encoded embeddings.

WARNING: This featurizer produces problematic embeddings and should ONLY be used
with the PharmaFormer model. It replicates the original PharmaFormer implementation
for compatibility purposes, but the embeddings have known issues and should NOT
be used for any other models.

Details about the issues are explained in:
https://github.com/daisybio/drevalpy/pull/336#discussion_r2682718948
"""
parser = argparse.ArgumentParser(description="Preprocess drug SMILES to BPE-encoded embeddings.")
parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.")
parser.add_argument("--data_path", type=str, default="data", help="Path to the data folder")
parser.add_argument("--num-symbols", type=int, default=10000, help="Number of BPE symbols to learn")
parser.add_argument("--max-length", type=int, default=128, help="Maximum length of encoded SMILES")
args = parser.parse_args()

create_pharmaformer_drug_embeddings(
data_path=args.data_path,
dataset_name=args.dataset_name,
num_symbols=args.num_symbols,
max_length=args.max_length,
)


if __name__ == "__main__":
main()
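
The featurizer is meant to be run once per dataset before training PharmaFormer. A minimal usage sketch, assuming the standard layout of data/<dataset_name>/drug_smiles.csv with pubchem_id and canonical_smiles columns (the dataset name "GDSC1" is illustrative):

    from drevalpy.datasets.featurizer.create_pharmaformer_drug_embeddings import (
        create_pharmaformer_drug_embeddings,
    )

    create_pharmaformer_drug_embeddings(
        data_path="data",      # folder containing one subfolder per dataset
        dataset_name="GDSC1",  # illustrative dataset name
        num_symbols=10000,     # number of BPE merge operations to learn
        max_length=128,        # encoded SMILES are padded/truncated to this length
    )

    # Equivalent CLI invocation:
    # python create_pharmaformer_drug_embeddings.py GDSC1 --data-path data --num-symbols 10000 --max-length 128

Either entry point writes bpe.codes and drug_bpe_smiles.csv next to the input file.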
5 changes: 5 additions & 0 deletions drevalpy/models/PharmaFormer/__init__.py
@@ -0,0 +1,5 @@
"""PharmaFormer model."""

from .pharmaformer import PharmaFormerModel

__all__ = ["PharmaFormerModel"]
33 changes: 33 additions & 0 deletions drevalpy/models/PharmaFormer/hyperparameters.yaml
@@ -0,0 +1,33 @@
---
PharmaFormer:
gene_hidden_size:
- 2048
- 4096
drug_hidden_size:
- 128
- 256
feature_dim:
- 64
- 128
nhead:
- 4
- 8
num_layers:
- 2
- 3
dim_feedforward:
- 1024
- 2048
dropout:
- 0.1
- 0.2
batch_size:
- 64
- 128
lr:
- 0.00001
- 0.0001
epochs:
- 100
patience:
- 10
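
Each key lists the candidate values for the hyperparameter search. One implicit constraint is worth noting: gene_hidden_size + drug_hidden_size must be divisible by feature_dim, because CombinedModel reshapes the concatenated feature vector into a sequence of feature_dim-sized tokens (every combination in the grid above satisfies this, as does the requirement that nhead divide feature_dim). A sketch of how one sampled combination maps onto the model; the gene_input_size of 19000 is an illustrative gene-expression dimensionality, not part of this config:

    from drevalpy.models.PharmaFormer.model_utils import CombinedModel

    model = CombinedModel(
        gene_input_size=19000,  # illustrative; determined by the expression matrix
        gene_hidden_size=2048,
        drug_hidden_size=128,
        feature_dim=64,         # (2048 + 128) / 64 = 34 transformer tokens
        nhead=4,
        num_layers=2,
        dim_feedforward=1024,
        dropout=0.1,
    )

batch_size, lr, epochs, and patience configure the training loop rather than the network itself.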
140 changes: 140 additions & 0 deletions drevalpy/models/PharmaFormer/model_utils.py
@@ -0,0 +1,140 @@
"""Neural network components for PharmaFormer model."""

import torch
import torch.nn as nn
import torch.nn.functional as functional


class FeatureExtractor(nn.Module):
"""Feature extractor for gene expression and drug SMILES."""

def __init__(self, gene_input_size: int, gene_hidden_size: int, drug_hidden_size: int):
"""
Initialize the feature extractor.

:param gene_input_size: Input size for gene expression features
:param gene_hidden_size: Hidden size for gene expression MLP
:param drug_hidden_size: Hidden size for drug SMILES MLP
"""
super().__init__()
self.gene_fc1 = nn.Linear(gene_input_size, gene_hidden_size)
self.gene_fc2 = nn.Linear(gene_hidden_size, gene_hidden_size)
self.smiles_fc = nn.Linear(128, drug_hidden_size)  # 128 = fixed length of BPE-encoded SMILES from the featurizer

def forward(self, gene_expr: torch.Tensor, smiles: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the feature extractor.

:param gene_expr: Gene expression features [batch_size, gene_input_size]
:param smiles: BPE-encoded SMILES features [batch_size, 128]
:return: Combined features [batch_size, gene_hidden_size + drug_hidden_size]
"""
gene_out = functional.relu(self.gene_fc1(gene_expr))
gene_out = functional.relu(self.gene_fc2(gene_out))
smiles_out = functional.relu(self.smiles_fc(smiles))
combined_features = torch.cat((gene_out, smiles_out), dim=1)
return combined_features


class TransModel(nn.Module):
"""Transformer model for processing combined features."""

def __init__(
self,
feature_dim: int,
nhead: int,
seq_len: int,
dim_feedforward: int = 2048,
dropout: float = 0.1,
num_layers: int = 3,
):
"""
Initialize the transformer model.

:param feature_dim: Dimension of each feature in the sequence
:param nhead: Number of attention heads
:param seq_len: Length of the input sequence
:param dim_feedforward: Dimension of feedforward network
:param dropout: Dropout rate
:param num_layers: Number of transformer encoder layers
"""
super().__init__()
encoder_layer = nn.TransformerEncoderLayer(
d_model=feature_dim,
nhead=nhead,
dim_feedforward=dim_feedforward,
dropout=dropout,
batch_first=True,
)
self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
self.output = nn.Sequential(
nn.Linear(seq_len * feature_dim, 1024),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(1024, 1),
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the transformer model.

:param x: Input tensor [batch_size, seq_len, feature_dim]
:return: Output predictions [batch_size, 1]
"""
x = self.transformer_encoder(x)
x = torch.flatten(x, 1)
return self.output(x)


class CombinedModel(nn.Module):
"""Combined model integrating feature extraction and transformer."""

def __init__(
self,
gene_input_size: int,
gene_hidden_size: int,
drug_hidden_size: int,
feature_dim: int,
nhead: int,
num_layers: int = 3,
dim_feedforward: int = 2048,
dropout: float = 0.1,
):
"""
Initialize the combined model.

:param gene_input_size: Input size for gene expression features
:param gene_hidden_size: Hidden size for gene expression MLP
:param drug_hidden_size: Hidden size for drug SMILES MLP
:param feature_dim: Dimension of each feature in the transformer sequence
:param nhead: Number of attention heads
:param num_layers: Number of transformer encoder layers
:param dim_feedforward: Dimension of feedforward network
:param dropout: Dropout rate
"""
super().__init__()
self.feature_extractor = FeatureExtractor(gene_input_size, gene_hidden_size, drug_hidden_size)
self.feature_dim = feature_dim
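# The concatenated feature vector is reinterpreted as seq_len tokens of size
# feature_dim, so (gene_hidden_size + drug_hidden_size) must be divisible by
# feature_dim; all combinations in hyperparameters.yaml satisfy this.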
self.seq_len = (gene_hidden_size + drug_hidden_size) // feature_dim
self.transformer = TransModel(
feature_dim=feature_dim,
nhead=nhead,
seq_len=self.seq_len,
num_layers=num_layers,
dim_feedforward=dim_feedforward,
dropout=dropout,
)

def forward(self, gene_expr: torch.Tensor, smiles: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the combined model.

:param gene_expr: Gene expression features [batch_size, gene_input_size]
:param smiles: BPE-encoded SMILES features [batch_size, 128]
:return: Output predictions [batch_size, 1]
"""
features = self.feature_extractor(gene_expr, smiles)
batch_size = features.size(0)
features = features.view(batch_size, self.seq_len, self.feature_dim)
output = self.transformer(features)
return output
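
A minimal smoke test of the combined architecture with random inputs; all sizes are illustrative (hidden sizes taken from the hyperparameter grid, gene_input_size chosen arbitrarily):

    import torch

    from drevalpy.models.PharmaFormer.model_utils import CombinedModel

    model = CombinedModel(
        gene_input_size=1000,  # illustrative expression dimensionality
        gene_hidden_size=2048,
        drug_hidden_size=128,
        feature_dim=64,
        nhead=4,
    )
    gene_expr = torch.randn(8, 1000)  # [batch_size, gene_input_size]
    smiles = torch.randn(8, 128)      # stand-in for BPE ordinals cast to float
    out = model(gene_expr, smiles)
    print(out.shape)                  # torch.Size([8, 1])

Internally, the [8, 2176] concatenated features are viewed as [8, 34, 64] before entering the transformer encoder, and the flattened encoder output is mapped to a single response value per sample.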