Skip to content

Commit 4df3315

Browse files
committed
Add BioPandas to facilitate recovery of any missing PDB chain IDs during inference
1 parent f77978b commit 4df3315

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

project/utils/deepinteract_utils.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import difflib
12
import itertools
23
import logging
34
import os
@@ -27,6 +28,7 @@
2728
from Bio.Align import MultipleSeqAlignment
2829
from Bio.Seq import Seq
2930
from Bio.SeqRecord import SeqRecord
31+
from biopandas.pdb import PandasPdb
3032
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
3133

3234
from project.utils.deepinteract_constants import FEAT_COLS, ALLOWABLE_FEATS, D3TO1
@@ -593,9 +595,11 @@ def copy_input_to_raw_dir(input_dataset_dir: str, pdb_filepath: str, pdb_code: s
593595
"""Make a copy of the input PDB file in the newly-created raw directory."""
594596
filename = db.get_pdb_code(pdb_filepath) + f'_{chain_indic}.pdb' \
595597
if chain_indic not in pdb_filepath else db.get_pdb_name(pdb_filepath)
598+
new_filepath = os.path.join(input_dataset_dir, "raw", pdb_code, filename)
596599
input_copy_cmd = f'cp {pdb_filepath} {os.path.join(input_dataset_dir, "raw", pdb_code, filename)}'
597600
input_copy_proc = subprocess.Popen(input_copy_cmd.split(), stdout=subprocess.PIPE, cwd=os.getcwd())
598601
_, _ = input_copy_proc.communicate() # Wait until the input copy cmd is finished
602+
return new_filepath
599603

600604

601605
def make_dataset(input_dataset_dir='datasets/Input/raw', output_dir='datasets/Input/interim', num_cpus=1,
@@ -618,6 +622,41 @@ def make_dataset(input_dataset_dir='datasets/Input/raw', output_dir='datasets/In
618622
pair.all_complex_to_pairs(complexes, source_type, get_pairs, pairs_dir, num_cpus)
619623

620624

625+
def recover_any_missing_chain_ids(interim_dataset_dir: str, new_pdb_filepath: str,
                                  orig_pdb_filepath: str, pdb_code: str, chain_number: int):
    """Restore a missing chain ID on one side of the stored Pair built from an input PDB file.

    The chain ID corresponding to the original PDB file is ascertained using one of two available methods.
    Method 1: Used with datasets such as EVCoupling adopting .atom filename extensions (e.g., 4DI3C.atom)
    Method 2: Used with datasets such as DeepHomo adopting regular .pdb filename extensions (e.g., 2FNUA.pdb)
    If neither method yields a chain ID, the placeholder '_' is written instead.

    :param interim_dataset_dir: Directory containing the 'pairs' sub-directory in which the Pair file resides.
    :param new_pdb_filepath: Path to the copied PDB file whose ATOM records are inspected for chain IDs.
    :param orig_pdb_filepath: Path to the original PDB file; its name is diffed against the new PDB code.
    :param pdb_code: Name of the sub-directory of 'pairs' in which to look for the Pair file.
    :param chain_number: 1 to update the chain of the Pair's df0, 2 to update the chain of its df1.
        Any other value leaves both chains untouched (the Pair is still re-saved).
    :raises FileNotFoundError: If the pair directory is missing or holds no file matching the new PDB code.
    """
    orig_pdb_chain_id = '_'  # Default value for missing chain IDs
    new_pdb_code = db.get_pdb_code(new_pdb_filepath)
    orig_pdb_name = db.get_pdb_name(orig_pdb_filepath)
    atom_df = PandasPdb().read_pdb(new_pdb_filepath).df['ATOM']
    unique_chain_ids = np.unique(atom_df['chain_id'].values)
    # np.unique() returns sorted values, so the first non-blank entry is the chain ID the previous
    # "first/second index" heuristic selected, without indexing past the end of a short array
    non_blank_chain_ids = [chain_id for chain_id in unique_chain_ids if str(chain_id).strip()]
    if non_blank_chain_ids:  # Method 2: Try to use unique chain IDs
        orig_pdb_chain_id = non_blank_chain_ids[0]
    else:  # Method 1: Try to use filename differences
        # No chain IDs were found, so we instead need to look to the original PDB filename to get the orig. chain ID
        for diff in difflib.ndiff(new_pdb_code, orig_pdb_name):
            # Each ndiff() entry is a two-character code (e.g., '+ ') followed by the compared character
            added_chars = diff[1:].strip()
            if diff.startswith('+') and added_chars:
                orig_pdb_chain_id = added_chars[0]
                break
    # Update the existing Pair to contain the newly-recovered chain ID
    pair_dir = os.path.join(interim_dataset_dir, 'pairs', pdb_code)
    # Sort the directory listing so the same Pair file is selected on every run and on every filesystem
    pair_filenames = [os.path.join(pair_dir, filename) for filename in sorted(os.listdir(pair_dir))
                      if new_pdb_code in filename]
    if not pair_filenames:
        raise FileNotFoundError(f'No Pair file containing "{new_pdb_code}" was found in {pair_dir}')
    pair_filepath = pair_filenames[0]
    # Load in the existing Pair
    # NOTE(review): dill.load() executes arbitrary code on load; presumably safe because this file is
    # produced by make_dataset() earlier in the same pipeline -- confirm it never comes from untrusted input
    with open(pair_filepath, 'rb') as f:
        pair = dill.load(f)
    # Update the corresponding chain ID
    if chain_number == 1:
        pair.df0.chain = orig_pdb_chain_id
    elif chain_number == 2:
        pair.df1.chain = orig_pdb_chain_id
    # Save the updated Pair
    with open(pair_filepath, 'wb') as f:
        dill.dump(pair, f)
658+
659+
621660
def generate_psaia_features(psaia_dir='~/Programs/PSAIA_1.0_source/bin/linux/psa',
622661
psaia_config='datasets/builder/psaia_config_file_input.txt',
623662
pdb_dataset='datasets/Input/raw', pkl_dataset='datasets/Input/interim/parsed',
@@ -727,9 +766,13 @@ def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath:
727766
pdb_code = db.get_pdb_group(list(ca.get_complex_pdb_codes([left_pdb_filepath, right_pdb_filepath]))[0])
728767
# Iteratively execute the PDB file feature generation process
729768
create_input_dir_struct(input_dataset_dir, pdb_code)
730-
copy_input_to_raw_dir(input_dataset_dir, left_pdb_filepath, pdb_code, 'l_u')
731-
copy_input_to_raw_dir(input_dataset_dir, right_pdb_filepath, pdb_code, 'r_u')
769+
new_l_u_filepath = copy_input_to_raw_dir(input_dataset_dir, left_pdb_filepath, pdb_code, 'l_u')
770+
new_r_u_filepath = copy_input_to_raw_dir(input_dataset_dir, right_pdb_filepath, pdb_code, 'r_u')
732771
make_dataset(os.path.join(input_dataset_dir, 'raw'), os.path.join(input_dataset_dir, 'interim'))
772+
recover_any_missing_chain_ids(os.path.join(input_dataset_dir, 'interim'),
773+
new_l_u_filepath, left_pdb_filepath, pdb_code, 1)
774+
recover_any_missing_chain_ids(os.path.join(input_dataset_dir, 'interim'),
775+
new_r_u_filepath, right_pdb_filepath, pdb_code, 2)
733776
generate_psaia_features(psaia_dir=psaia_dir,
734777
psaia_config=psaia_config,
735778
pdb_dataset=os.path.join(input_dataset_dir, 'raw'),

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ setuptools==57.4.0
33
atom3-py3==0.1.9.8
44
click==8.0.1
55
easy-parallel-py3==0.1.6.4
6+
biopandas==0.2.9
67
dill==0.3.4
78
tqdm==4.62.0
89
Sphinx==4.0.1

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
setup(
66
name='DeepInteract',
7-
version='1.0.2',
7+
version='1.0.3',
88
description='A geometric deep learning pipeline for predicting protein interface contacts.',
99
author='Alex Morehead',
1010
author_email='[email protected]',
@@ -15,6 +15,7 @@
1515
'atom3-py3==0.1.9.8',
1616
'click==8.0.1',
1717
'easy-parallel-py3==0.1.6.4',
18+
'biopandas==0.2.9',
1819
'dill==0.3.4',
1920
'tqdm==4.62.0',
2021
'Sphinx==4.0.1',

0 commit comments

Comments
 (0)