1
+ import difflib
1
2
import itertools
2
3
import logging
3
4
import os
27
28
from Bio .Align import MultipleSeqAlignment
28
29
from Bio .Seq import Seq
29
30
from Bio .SeqRecord import SeqRecord
31
+ from biopandas .pdb import PandasPdb
30
32
from pytorch_lightning .loggers import TensorBoardLogger , WandbLogger
31
33
32
34
from project .utils .deepinteract_constants import FEAT_COLS , ALLOWABLE_FEATS , D3TO1
@@ -593,9 +595,11 @@ def copy_input_to_raw_dir(input_dataset_dir: str, pdb_filepath: str, pdb_code: s
593
595
"""Make a copy of the input PDB file in the newly-created raw directory."""
594
596
filename = db .get_pdb_code (pdb_filepath ) + f'_{ chain_indic } .pdb' \
595
597
if chain_indic not in pdb_filepath else db .get_pdb_name (pdb_filepath )
598
+ new_filepath = os .path .join (input_dataset_dir , "raw" , pdb_code , filename )
596
599
input_copy_cmd = f'cp { pdb_filepath } { os .path .join (input_dataset_dir , "raw" , pdb_code , filename )} '
597
600
input_copy_proc = subprocess .Popen (input_copy_cmd .split (), stdout = subprocess .PIPE , cwd = os .getcwd ())
598
601
_ , _ = input_copy_proc .communicate () # Wait until the input copy cmd is finished
602
+ return new_filepath
599
603
600
604
601
605
def make_dataset (input_dataset_dir = 'datasets/Input/raw' , output_dir = 'datasets/Input/interim' , num_cpus = 1 ,
@@ -618,6 +622,41 @@ def make_dataset(input_dataset_dir='datasets/Input/raw', output_dir='datasets/In
618
622
pair .all_complex_to_pairs (complexes , source_type , get_pairs , pairs_dir , num_cpus )
619
623
620
624
625
def recover_any_missing_chain_ids(interim_dataset_dir: str, new_pdb_filepath: str,
                                  orig_pdb_filepath: str, pdb_code: str, chain_number: int):
    """Restore a missing chain ID for one chain of an existing on-disk Pair.

    Inspects the ATOM records of the copied PDB file. If they carry no usable
    chain ID, the ID is recovered by diffing the copied file's PDB code against
    the original filename; otherwise the first non-empty unique chain ID found
    in the records is used. The matching Pair pickle under
    ``<interim_dataset_dir>/pairs/<pdb_code>`` is then updated in place.

    Args:
        interim_dataset_dir: Interim dataset directory containing the ``pairs`` tree.
        new_pdb_filepath: Path to the copied (raw-directory) PDB file whose ATOM
            records are inspected.
        orig_pdb_filepath: Path to the user's original PDB file; its filename may
            embed the chain ID (e.g. ``2FNUA.pdb``).
        pdb_code: PDB code naming the subdirectory that groups the Pair files.
        chain_number: ``1`` to update the Pair's first chain (``df0``),
            ``2`` for the second chain (``df1``).
    """
    recovered_chain_id = '_'  # Default value for missing chain IDs
    new_pdb_code = db.get_pdb_code(new_pdb_filepath)
    orig_pdb_name = db.get_pdb_name(orig_pdb_filepath)
    # NOTE: ATOM records are read from the *copied* file (renamed from the
    # misleading `orig_pdb_df`) — presumably it mirrors the original's chains
    atom_df = PandasPdb().read_pdb(new_pdb_filepath).df['ATOM']
    unique_chain_ids = np.unique(atom_df['chain_id'].values)
    # Ascertain the chain ID corresponding to the original PDB file, using one of two methods.
    # Method 1: Used with datasets such as EVCoupling adopting .atom filename extensions (e.g., 4DI3C.atom)
    # Method 2: Used with datasets such as DeepHomo adopting regular .pdb filename extensions (e.g., 2FNUA.pdb)
    if len(unique_chain_ids) == 1 and unique_chain_ids[0].strip() == '':
        # Method 1: no chain IDs in the file, so look to the original PDB
        # filename — the first character added relative to the PDB code is
        # taken as the chain ID
        for diff in difflib.ndiff(new_pdb_code, orig_pdb_name):
            if diff[0] == '+':
                recovered_chain_id = diff[1:].strip()[0]
                break
    else:
        # Method 2: assume the first/second index holds the first non-empty
        # chain ID (e.g., 'A')
        recovered_chain_id = unique_chain_ids[0] if unique_chain_ids[0] != '' else unique_chain_ids[1]
    # Locate the existing Pair pickle that corresponds to this chain's PDB code
    pair_dir = os.path.join(interim_dataset_dir, 'pairs', pdb_code)
    pair_filenames = [os.path.join(pair_dir, filename)
                      for filename in os.listdir(pair_dir) if new_pdb_code in filename]
    # Load in the existing Pair
    with open(pair_filenames[0], 'rb') as f:
        pair = dill.load(f)
    # Update only the chain the caller selected; leave the other untouched
    if chain_number == 1:
        pair.df0.chain = recovered_chain_id
    elif chain_number == 2:
        pair.df1.chain = recovered_chain_id
    # Save the updated Pair back in place
    with open(pair_filenames[0], 'wb') as f:
        dill.dump(pair, f)
658
+
659
+
621
660
def generate_psaia_features (psaia_dir = '~/Programs/PSAIA_1.0_source/bin/linux/psa' ,
622
661
psaia_config = 'datasets/builder/psaia_config_file_input.txt' ,
623
662
pdb_dataset = 'datasets/Input/raw' , pkl_dataset = 'datasets/Input/interim/parsed' ,
@@ -727,9 +766,13 @@ def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath:
727
766
pdb_code = db .get_pdb_group (list (ca .get_complex_pdb_codes ([left_pdb_filepath , right_pdb_filepath ]))[0 ])
728
767
# Iteratively execute the PDB file feature generation process
729
768
create_input_dir_struct (input_dataset_dir , pdb_code )
730
- copy_input_to_raw_dir (input_dataset_dir , left_pdb_filepath , pdb_code , 'l_u' )
731
- copy_input_to_raw_dir (input_dataset_dir , right_pdb_filepath , pdb_code , 'r_u' )
769
+ new_l_u_filepath = copy_input_to_raw_dir (input_dataset_dir , left_pdb_filepath , pdb_code , 'l_u' )
770
+ new_r_u_filepath = copy_input_to_raw_dir (input_dataset_dir , right_pdb_filepath , pdb_code , 'r_u' )
732
771
make_dataset (os .path .join (input_dataset_dir , 'raw' ), os .path .join (input_dataset_dir , 'interim' ))
772
+ recover_any_missing_chain_ids (os .path .join (input_dataset_dir , 'interim' ),
773
+ new_l_u_filepath , left_pdb_filepath , pdb_code , 1 )
774
+ recover_any_missing_chain_ids (os .path .join (input_dataset_dir , 'interim' ),
775
+ new_r_u_filepath , right_pdb_filepath , pdb_code , 2 )
733
776
generate_psaia_features (psaia_dir = psaia_dir ,
734
777
psaia_config = psaia_config ,
735
778
pdb_dataset = os .path .join (input_dataset_dir , 'raw' ),
0 commit comments