inspiremd · jiayeguo · Mar 18, 2019 · Mar 18, 2019 · Mar 26, 2019 · Aug 5, 2019
diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml
@@ -22,6 +22,11 @@ requirements:
     #- pypdb # TODO: Build this for omnia or conda-forge
     - xmltodict
 
+dependencies:
+    # dependencies
+    #- numpy
+    # Base depends
+    - pip
 test:
   requires:
     - pytest

diff --git a/kinomodel/data/docked.mol2 b/kinomodel/data/docked.mol2
diff --git a/kinomodel/data/docking/1m17.pdb b/kinomodel/data/docking/1m17.pdb
diff --git a/kinomodel/data/docking/prepared_receptor.oeb b/kinomodel/data/docking/prepared_receptor.oeb
diff --git a/kinomodel/features/protein.py b/kinomodel/features/protein.py
@@ -103,42 +103,68 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
     import os
     import mdtraj as md
     import numpy as np
+    import sys
+    np.set_printoptions(threshold=sys.maxsize)
 
     pdb_file = None
 
     # A safer way to download files as wget may not exist on systems such MacOS
     # TODO: Since we retrieve the PDB file in multiple pieces of code, let's refactor this into one utility function
     # to avoid code duplication.
     import urllib
+
+    # get toppology info either from fixed pdb or original pdb file (based on input) 
+    # if analyzing a trajectory 
+    if coordfile == 'dcd':
+        traj = md.load(subprocess.check_output('ls *dcd', shell=True).strip(b'\n').decode("utf-8"),top = str(pdbid) + '_minimized.pdb')
+        #traj = md.load(str(pdbid) + '.dcd',top = str(pdbid) + '_fixed_solvated.pdb')
+        topology = md.load(str(pdbid)+'_fixed.pdb').topology
+
+    chain_lst = [] # get a list of chains to identify the chain of interest
+    import string
     with urllib.request.urlopen('http://www.pdb.org/pdb/files/{}.pdb'.format(pdbid)) as response:
         pdb_file = response.read()
-
+        # check whether chains are in reverse order (e.g. B, A)
+        check = pdb_file.decode().split()
+        for i in range(len(check)):
+            if check[i] == 'ATOM':
+                if check[i+4] in list(string.ascii_uppercase):
+                    if check[i+4] not in chain_lst:
+                        chain_lst.append(check[i+4])
+            elif check[i] == 'HETATM':
+                if check[i+4] in list(string.ascii_uppercase):
+                    if 'HET{}'.format(check[i+4]) not in chain_lst:
+                        chain_lst.append('HET{}'.format(check[i+4]))
+            elif check[i] == 'CONECT':
+                break
     with tempfile.TemporaryDirectory() as pdb_directory:
         pdb = os.path.join(pdb_directory,'{}.pdb'.format(pdbid))
         with open(pdb, 'w') as file:
             file.write(pdb_file.decode())
             # load traj before the temp pdb file was removed
             if coordfile == 'pdb':
+                print("loading top from pdb")
                 traj = md.load(pdb)
-            # get topology info from the structure
-            topology = md.load(pdb).topology
-
+                topology = md.load(pdb).topology
+    coord = traj.xyz
     table, bonds = topology.to_dataframe()
     atoms = table.values
-    # translate a letter chain id into a number index (A->0, B->1 etc)
-    # TODO: This may not be robust, since chains aren't always in sequence from A to Z
-    chain_index = ord(str(chainid).lower()) - 97
+    #print(atoms)
 
+    # translate a letter chain id into a number index based on chain order
+    # identify the index of the chain of interest based on it's position in chain list
+    chain_index = chain_lst.index(chainid) 
+
     # get the array of atom indices for the calculation of:
     #       * eight dihedrals (a 8*4 array where each row contains indices of the four atoms for each dihedral)
     #       * five ditances (a 5*2 array where each row contains indices of the two atoms for each dihedral)
-    dih = np.zeros(shape=(8, 4), dtype=int, order='C')
+    dih = np.zeros(shape=(10, 4), dtype=int, order='C')
     dis = np.zeros(shape=(5, 2), dtype=int, order='C')
 
     # name list of the dihedrals and distances
     dih_names = [
-        'aC_rot', 'xDFG_phi', 'xDFG_psi', 'dFG_phi', 'dFG_psi', 'DfG_phi',
-        'DfG_psi', 'DfG_chi'
+        'aC_rot', 'xDFG_phi', 'xDFG_psi', 'dFG_phi', 'dFG_psi', 'dFG_chi1', 'dFG_chi2', 'DfG_phi',
+        'DfG_psi', 'DfG_chi1'
     ]
     dis_names = ['K_E1', 'K_E2', 'DFG_conf1', 'DFG_conf2', 'fret']
 
@@ -148,100 +174,79 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
     by mdtraj but when the atom indices are not continuous there is a problem so a safer way to locate the coordinates
     is through row number (as a fake atom index) in case the atom indices are not continuous.
     '''
-    count = 0  # keep track of row indices as "atom indices"
-    for line in atoms:
-        # for the specified chain
-        if line[5] == chain_index:
-            # TODO: Use MDTraj DSL instead of this cumbersome comparison scheme.
-            # http://mdtraj.org/latest/atom_selection.html
-
-            # dihedral 1: between aC and aE helices
-            dih[0][0] = count if line[3] == numbering[20] and line[
-                1] == 'CA' else dih[0][0]
-            dih[0][1] = count if line[3] == numbering[28] and line[
-                1] == 'CA' else dih[0][1]
-            dih[0][2] = count if line[3] == numbering[60] and line[
-                1] == 'CA' else dih[0][2]
-            dih[0][3] = count if line[3] == numbering[62] and line[
-                1] == 'CA' else dih[0][3]
-            # dihedral 2 & 3: X-DFG Phi & Psi
-            dih[1][0] = count if line[3] == numbering[78] and line[
-                1] == 'C' else dih[1][0]
-            dih[1][1] = count if line[3] == numbering[79] and line[
-                1] == 'N' else dih[1][1]
-            dih[1][2] = count if line[3] == numbering[79] and line[
-                1] == 'CA' else dih[1][2]
-            dih[1][3] = count if line[3] == numbering[79] and line[
-                1] == 'C' else dih[1][3]
-            dih[2][0] = dih[1][1]
-            dih[2][1] = dih[1][2]
-            dih[2][2] = dih[1][3]
-            dih[2][3] = count if line[3] == numbering[80] and line[
-                1] == 'N' else dih[2][3]
-
-            # dihedral 4 & 5: DFG-Asp Phi & Psi
-            dih[3][0] = dih[1][3]
-            dih[3][1] = dih[2][3]
-            dih[3][2] = count if line[3] == numbering[80] and line[
-                1] == 'CA' else dih[3][2]
-            dih[3][3] = count if line[3] == numbering[80] and line[
-                1] == 'C' else dih[3][3]
-            dih[4][0] = dih[3][1]
-            dih[4][1] = dih[3][2]
-            dih[4][2] = dih[3][3]
-            dih[4][3] = count if line[3] == numbering[81] and line[
-                1] == 'N' else dih[4][3]
-
-            # dihedral 6 & 7: DFG-Phe Phi & Psi
-            dih[5][0] = dih[3][3]
-            dih[5][1] = dih[4][3]
-            dih[5][2] = count if line[3] == numbering[81] and line[
-                1] == 'CA' else dih[5][2]
-            dih[5][3] = count if line[3] == numbering[81] and line[
-                1] == 'C' else dih[5][3]
-            dih[6][0] = dih[5][1]
-            dih[6][1] = dih[5][2]
-            dih[6][2] = dih[5][3]
-            dih[6][3] = count if line[3] == numbering[82] and line[
-                1] == 'N' else dih[6][3]
-
-            # dihedral 8: DFG-Phe Chi
-            dih[7][0] = dih[5][1]
-            dih[7][1] = dih[5][2]
-            dih[7][2] = count if line[3] == numbering[81] and line[
-                1] == 'CB' else dih[7][2]
-            dih[7][3] = count if line[3] == numbering[81] and line[
-                1] == 'CG' else dih[7][3]
-
-            # distance 1 & 2: K-E salt bridge
-            dis[0][0] = count if line[3] == numbering[16] and line[
-                1] == 'NZ' else dis[0][0]
-            dis[0][1] = count if line[3] == numbering[23] and line[
-                1] == 'OE1' else dis[0][1]
-            dis[1][0] = count if line[3] == numbering[16] and line[
-                1] == 'NZ' else dis[1][0]
-            dis[1][1] = count if line[3] == numbering[23] and line[
-                1] == 'OE2' else dis[1][1]
-
-            # distance 3 & 4: DFG conformation-related distances
-            dis[2][0] = count if line[3] == numbering[27] and line[
-                1] == 'CA' else dis[2][0]
-            dis[2][1] = count if line[3] == numbering[81] and line[
-                1] == 'CZ' else dis[2][1]
-            dis[3][0] = count if line[3] == numbering[16] and line[
-                1] == 'CA' else dis[3][0]
-            dis[3][1] = dis[2][1]
-
-            # distance 5: FRET distance
-            dis[4][0] = count if line[3] == int(
-                numbering[80] + 10) and line[1] == 'CA' else dis[4][0]
-            dis[4][1] = count if line[3] == int(
-                numbering[80] - 20) and line[1] == 'CA' else dis[4][1]
-
-        if line[5] > chain_index:
-            break
-
-        count += 1
+
+    # dihedral 0: between aC and aE helices
+    dih[0][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[20]))
+    dih[0][1] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[28]))
+    dih[0][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[60]))
+    dih[0][3] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[62]))
+
+    # dihedral 1 & 2: X-DFG Phi & Psi
+    dih[1][0] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[78]))
+    dih[1][1] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[79]))
+    dih[1][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[79]))
+    dih[1][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[79]))
+
+    dih[2][0] = dih[1][1]
+    dih[2][1] = dih[1][2]
+    dih[2][2] = dih[1][3]
+    dih[2][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[80]))
+
+    # dihedral 3 & 4: DFG-Asp Phi & Psi
+    dih[3][0] = dih[1][3]
+    dih[3][1] = dih[2][3]
+    dih[3][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[80]))
+    dih[3][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[80])) 
+
+    dih[4][0] = dih[3][1]
+    dih[4][1] = dih[3][2]
+    dih[4][2] = dih[3][3]
+    dih[4][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[81]))
+
+    # dihedral 5 & 6: DFG-Asp Chi1 & Chi2
+    dih[5][0] = dih[2][3] # DFG-Asp N
+    dih[5][1] = dih[3][2] # DFG-Asp CA
+    dih[5][2] = topology.select("chainid {} and residue {} and name CB".format(chain_index,numbering[80])) # DFG-Asp CB
+    dih[5][3] = topology.select("chainid {} and residue {} and name CG".format(chain_index,numbering[80])) # DFG-Asp CG
+
+    dih[6][0] = dih[5][1]
+    dih[6][1] = dih[5][2]
+    dih[6][2] = dih[5][3]
+    dih[6][3] = topology.select("chainid {} and residue {} and name OD1".format(chain_index,numbering[80])) #DFG-Asp OD1
+
+    # dihedral 7 & 8: DFG-Phe Phi & Psi
+    dih[7][0] = dih[3][3]
+    dih[7][1] = dih[4][3]
+    dih[7][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[81]))              
+    dih[7][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[81]))
+
+    dih[8][0] = dih[7][1]
+    dih[8][1] = dih[7][2]
+    dih[8][2] = dih[7][3]
+    dih[8][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[82]))
+
+    # dihedral 9: DFG-Phe Chi1
+    dih[9][0] = dih[4][3] #DFG-Phe N
+    dih[9][1] = dih[7][2] #DFG-Phe CA
+    dih[9][2] = topology.select("chainid {} and residue {} and name CB".format(chain_index,numbering[81])) # DFG-Phe CB
+    dih[9][3] = topology.select("chainid {} and residue {} and name CG".format(chain_index,numbering[81])) # DFG-Phe CG
+
+    # distance 0 & 1: K-E salt bridge
+    dis[0][0] = topology.select("chainid {} and residue {} and name NZ".format(chain_index,numbering[16]))
+    dis[0][1] = topology.select("chainid {} and residue {} and name OE1".format(chain_index,numbering[23]))
+    dis[1][0] = dis[0][0]
+    dis[1][1] = topology.select("chainid {} and residue {} and name OE2".format(chain_index,numbering[23]))
+
+    # distance 2 & 3: DFG conformation-related distances
+    dis[2][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[27]))
+    dis[2][1] = topology.select("chainid {} and residue {} and name CZ".format(chain_index,numbering[81]))
+    dis[3][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]))
+    dis[3][1] = dis[2][1]
+
+    # distance 4: FRET distance
+    dis[4][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]+120))
+    dis[4][1] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]+61))
+
     # check if there is any missing coordinates; if so, skip dihedral/distance calculation for those residues
     check_flag = 1
     for i in range(len(dih)):
@@ -264,9 +269,7 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
         #    "There is no missing coordinates.  All dihedrals and distances will be computed."
         #)
     # calculate the dihedrals and distances for the user-specifed structure (a static structure or an MD trajectory)
-    if coordfile == 'dcd':
-        traj = md.load(str(pdbid) + '.dcd',top = str(pdbid) + '_fixed_solvated.pdb')
-    dihedrals = md.compute_dihedrals(traj, dih)
+    dihedrals = md.compute_dihedrals(traj, dih)/np.pi*180
     distances = md.compute_distances(traj, dis)
 
     # option to log the results
@@ -281,12 +284,6 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
     '''
 
     # clean up
-    # TODO: This is dangerous! Instead, rely on using the "with tempfile.TemporaryDirectory" context manager idiom to create and clean up temporary directories
-    #import subprocess
-    #rm_file = 'rm ./' + str(pdb) + '.pdb*'
-    #rm_file = 'rm ./' + str(pdb) + '.pdb*'
-    #subprocess.call(rm_file, shell=True)
-
     del traj, dih, dis
 
     return dihedrals, distances
diff --git a/kinomodel/features/query_klifs.py b/kinomodel/features/query_klifs.py
@@ -66,7 +66,7 @@ def query_klifs_database(pdbid, chainid):
             chain_found = True
     if not chain_found:
         raise ValueError("No data found for chainid '{}'."
-                         "Please make sure you provide a capital letter (A, B, C, ...) as a chain ID.".format(chainid))
+                         "Please make sure you provide a capital letter (A, B, C, ...) as a chain ID and the structure has the corresponding chain.".format(chainid))
 
     # Get the numbering of the 85 pocket residues
     cmd = "http://klifs.vu-compmedchem.nl/details.php?structure_id=" + str(struct_id)