Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions devtools/conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ requirements:
#- pypdb # TODO: Build this for omnia or conda-forge
- xmltodict

dependencies:
# dependencies
#- numpy
# Base depends
- pip
test:
requires:
- pytest
Expand Down
4,492 changes: 2,246 additions & 2,246 deletions kinomodel/data/docked.mol2

Large diffs are not rendered by default.

2,928 changes: 2,928 additions & 0 deletions kinomodel/data/docking/1m17.pdb

Large diffs are not rendered by default.

Binary file modified kinomodel/data/docking/prepared_receptor.oeb
Binary file not shown.
223 changes: 110 additions & 113 deletions kinomodel/features/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,42 +103,68 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
import os
import mdtraj as md
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)

pdb_file = None

# A safer way to download files as wget may not exist on systems such MacOS
# TODO: Since we retrieve the PDB file in multiple pieces of code, let's refactor this into one utility function
# to avoid code duplication.
import urllib

# get toppology info either from fixed pdb or original pdb file (based on input)
# if analyzing a trajectory
if coordfile == 'dcd':
traj = md.load(subprocess.check_output('ls *dcd', shell=True).strip(b'\n').decode("utf-8"),top = str(pdbid) + '_minimized.pdb')
#traj = md.load(str(pdbid) + '.dcd',top = str(pdbid) + '_fixed_solvated.pdb')
topology = md.load(str(pdbid)+'_fixed.pdb').topology

chain_lst = [] # get a list of chains to identify the chain of interest
import string
with urllib.request.urlopen('http://www.pdb.org/pdb/files/{}.pdb'.format(pdbid)) as response:
pdb_file = response.read()

# check whether chains are in reverse order (e.g. B, A)
check = pdb_file.decode().split()
for i in range(len(check)):
if check[i] == 'ATOM':
if check[i+4] in list(string.ascii_uppercase):
if check[i+4] not in chain_lst:
chain_lst.append(check[i+4])
elif check[i] == 'HETATM':
if check[i+4] in list(string.ascii_uppercase):
if 'HET{}'.format(check[i+4]) not in chain_lst:
chain_lst.append('HET{}'.format(check[i+4]))
elif check[i] == 'CONECT':
break
with tempfile.TemporaryDirectory() as pdb_directory:
pdb = os.path.join(pdb_directory,'{}.pdb'.format(pdbid))
with open(pdb, 'w') as file:
file.write(pdb_file.decode())
# load traj before the temp pdb file was removed
if coordfile == 'pdb':
print("loading top from pdb")
traj = md.load(pdb)
# get topology info from the structure
topology = md.load(pdb).topology

topology = md.load(pdb).topology
coord = traj.xyz
table, bonds = topology.to_dataframe()
atoms = table.values
# translate a letter chain id into a number index (A->0, B->1 etc)
# TODO: This may not be robust, since chains aren't always in sequence from A to Z
chain_index = ord(str(chainid).lower()) - 97
#print(atoms)

# translate a letter chain id into a number index based on chain order
# identify the index of the chain of interest based on it's position in chain list
chain_index = chain_lst.index(chainid)

# get the array of atom indices for the calculation of:
# * eight dihedrals (a 8*4 array where each row contains indices of the four atoms for each dihedral)
# * five ditances (a 5*2 array where each row contains indices of the two atoms for each dihedral)
dih = np.zeros(shape=(8, 4), dtype=int, order='C')
dih = np.zeros(shape=(10, 4), dtype=int, order='C')
dis = np.zeros(shape=(5, 2), dtype=int, order='C')

# name list of the dihedrals and distances
dih_names = [
'aC_rot', 'xDFG_phi', 'xDFG_psi', 'dFG_phi', 'dFG_psi', 'DfG_phi',
'DfG_psi', 'DfG_chi'
'aC_rot', 'xDFG_phi', 'xDFG_psi', 'dFG_phi', 'dFG_psi', 'dFG_chi1', 'dFG_chi2', 'DfG_phi',
'DfG_psi', 'DfG_chi1'
]
dis_names = ['K_E1', 'K_E2', 'DFG_conf1', 'DFG_conf2', 'fret']

Expand All @@ -148,100 +174,79 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
by mdtraj but when the atom indices are not continuous there is a problem so a safer way to locate the coordinates
is through row number (as a fake atom index) in case the atom indices are not continuous.
'''
count = 0 # keep track of row indices as "atom indices"
for line in atoms:
# for the specified chain
if line[5] == chain_index:
# TODO: Use MDTraj DSL instead of this cumbersome comparison scheme.
# http://mdtraj.org/latest/atom_selection.html

# dihedral 1: between aC and aE helices
dih[0][0] = count if line[3] == numbering[20] and line[
1] == 'CA' else dih[0][0]
dih[0][1] = count if line[3] == numbering[28] and line[
1] == 'CA' else dih[0][1]
dih[0][2] = count if line[3] == numbering[60] and line[
1] == 'CA' else dih[0][2]
dih[0][3] = count if line[3] == numbering[62] and line[
1] == 'CA' else dih[0][3]
# dihedral 2 & 3: X-DFG Phi & Psi
dih[1][0] = count if line[3] == numbering[78] and line[
1] == 'C' else dih[1][0]
dih[1][1] = count if line[3] == numbering[79] and line[
1] == 'N' else dih[1][1]
dih[1][2] = count if line[3] == numbering[79] and line[
1] == 'CA' else dih[1][2]
dih[1][3] = count if line[3] == numbering[79] and line[
1] == 'C' else dih[1][3]
dih[2][0] = dih[1][1]
dih[2][1] = dih[1][2]
dih[2][2] = dih[1][3]
dih[2][3] = count if line[3] == numbering[80] and line[
1] == 'N' else dih[2][3]

# dihedral 4 & 5: DFG-Asp Phi & Psi
dih[3][0] = dih[1][3]
dih[3][1] = dih[2][3]
dih[3][2] = count if line[3] == numbering[80] and line[
1] == 'CA' else dih[3][2]
dih[3][3] = count if line[3] == numbering[80] and line[
1] == 'C' else dih[3][3]
dih[4][0] = dih[3][1]
dih[4][1] = dih[3][2]
dih[4][2] = dih[3][3]
dih[4][3] = count if line[3] == numbering[81] and line[
1] == 'N' else dih[4][3]

# dihedral 6 & 7: DFG-Phe Phi & Psi
dih[5][0] = dih[3][3]
dih[5][1] = dih[4][3]
dih[5][2] = count if line[3] == numbering[81] and line[
1] == 'CA' else dih[5][2]
dih[5][3] = count if line[3] == numbering[81] and line[
1] == 'C' else dih[5][3]
dih[6][0] = dih[5][1]
dih[6][1] = dih[5][2]
dih[6][2] = dih[5][3]
dih[6][3] = count if line[3] == numbering[82] and line[
1] == 'N' else dih[6][3]

# dihedral 8: DFG-Phe Chi
dih[7][0] = dih[5][1]
dih[7][1] = dih[5][2]
dih[7][2] = count if line[3] == numbering[81] and line[
1] == 'CB' else dih[7][2]
dih[7][3] = count if line[3] == numbering[81] and line[
1] == 'CG' else dih[7][3]

# distance 1 & 2: K-E salt bridge
dis[0][0] = count if line[3] == numbering[16] and line[
1] == 'NZ' else dis[0][0]
dis[0][1] = count if line[3] == numbering[23] and line[
1] == 'OE1' else dis[0][1]
dis[1][0] = count if line[3] == numbering[16] and line[
1] == 'NZ' else dis[1][0]
dis[1][1] = count if line[3] == numbering[23] and line[
1] == 'OE2' else dis[1][1]

# distance 3 & 4: DFG conformation-related distances
dis[2][0] = count if line[3] == numbering[27] and line[
1] == 'CA' else dis[2][0]
dis[2][1] = count if line[3] == numbering[81] and line[
1] == 'CZ' else dis[2][1]
dis[3][0] = count if line[3] == numbering[16] and line[
1] == 'CA' else dis[3][0]
dis[3][1] = dis[2][1]

# distance 5: FRET distance
dis[4][0] = count if line[3] == int(
numbering[80] + 10) and line[1] == 'CA' else dis[4][0]
dis[4][1] = count if line[3] == int(
numbering[80] - 20) and line[1] == 'CA' else dis[4][1]

if line[5] > chain_index:
break

count += 1

# dihedral 0: between aC and aE helices
dih[0][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[20]))
dih[0][1] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[28]))
dih[0][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[60]))
dih[0][3] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[62]))

# dihedral 1 & 2: X-DFG Phi & Psi
dih[1][0] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[78]))
dih[1][1] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[79]))
dih[1][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[79]))
dih[1][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[79]))

dih[2][0] = dih[1][1]
dih[2][1] = dih[1][2]
dih[2][2] = dih[1][3]
dih[2][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[80]))

# dihedral 3 & 4: DFG-Asp Phi & Psi
dih[3][0] = dih[1][3]
dih[3][1] = dih[2][3]
dih[3][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[80]))
dih[3][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[80]))

dih[4][0] = dih[3][1]
dih[4][1] = dih[3][2]
dih[4][2] = dih[3][3]
dih[4][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[81]))

# dihedral 5 & 6: DFG-Asp Chi1 & Chi2
dih[5][0] = dih[2][3] # DFG-Asp N
dih[5][1] = dih[3][2] # DFG-Asp CA
dih[5][2] = topology.select("chainid {} and residue {} and name CB".format(chain_index,numbering[80])) # DFG-Asp CB
dih[5][3] = topology.select("chainid {} and residue {} and name CG".format(chain_index,numbering[80])) # DFG-Asp CG

dih[6][0] = dih[5][1]
dih[6][1] = dih[5][2]
dih[6][2] = dih[5][3]
dih[6][3] = topology.select("chainid {} and residue {} and name OD1".format(chain_index,numbering[80])) #DFG-Asp OD1

# dihedral 7 & 8: DFG-Phe Phi & Psi
dih[7][0] = dih[3][3]
dih[7][1] = dih[4][3]
dih[7][2] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[81]))
dih[7][3] = topology.select("chainid {} and residue {} and name C".format(chain_index,numbering[81]))

dih[8][0] = dih[7][1]
dih[8][1] = dih[7][2]
dih[8][2] = dih[7][3]
dih[8][3] = topology.select("chainid {} and residue {} and name N".format(chain_index,numbering[82]))

# dihedral 9: DFG-Phe Chi1
dih[9][0] = dih[4][3] #DFG-Phe N
dih[9][1] = dih[7][2] #DFG-Phe CA
dih[9][2] = topology.select("chainid {} and residue {} and name CB".format(chain_index,numbering[81])) # DFG-Phe CB
dih[9][3] = topology.select("chainid {} and residue {} and name CG".format(chain_index,numbering[81])) # DFG-Phe CG

# distance 0 & 1: K-E salt bridge
dis[0][0] = topology.select("chainid {} and residue {} and name NZ".format(chain_index,numbering[16]))
dis[0][1] = topology.select("chainid {} and residue {} and name OE1".format(chain_index,numbering[23]))
dis[1][0] = dis[0][0]
dis[1][1] = topology.select("chainid {} and residue {} and name OE2".format(chain_index,numbering[23]))

# distance 2 & 3: DFG conformation-related distances
dis[2][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[27]))
dis[2][1] = topology.select("chainid {} and residue {} and name CZ".format(chain_index,numbering[81]))
dis[3][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]))
dis[3][1] = dis[2][1]

# distance 4: FRET distance
dis[4][0] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]+120))
dis[4][1] = topology.select("chainid {} and residue {} and name CA".format(chain_index,numbering[16]+61))

# check if there is any missing coordinates; if so, skip dihedral/distance calculation for those residues
check_flag = 1
for i in range(len(dih)):
Expand All @@ -264,9 +269,7 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
# "There is no missing coordinates. All dihedrals and distances will be computed."
#)
# calculate the dihedrals and distances for the user-specifed structure (a static structure or an MD trajectory)
if coordfile == 'dcd':
traj = md.load(str(pdbid) + '.dcd',top = str(pdbid) + '_fixed_solvated.pdb')
dihedrals = md.compute_dihedrals(traj, dih)
dihedrals = md.compute_dihedrals(traj, dih)/np.pi*180
distances = md.compute_distances(traj, dis)

# option to log the results
Expand All @@ -281,12 +284,6 @@ def compute_simple_protein_features(pdbid, chainid, coordfile, numbering):
'''

# clean up
# TODO: This is dangerous! Instead, rely on using the "with tempfile.TemporaryDirectory" context manager idiom to create and clean up temporary directories
#import subprocess
#rm_file = 'rm ./' + str(pdb) + '.pdb*'
#rm_file = 'rm ./' + str(pdb) + '.pdb*'
#subprocess.call(rm_file, shell=True)

del traj, dih, dis

return dihedrals, distances
2 changes: 1 addition & 1 deletion kinomodel/features/query_klifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def query_klifs_database(pdbid, chainid):
chain_found = True
if not chain_found:
raise ValueError("No data found for chainid '{}'."
"Please make sure you provide a capital letter (A, B, C, ...) as a chain ID.".format(chainid))
"Please make sure you provide a capital letter (A, B, C, ...) as a chain ID and the structure has the corresponding chain.".format(chainid))

# Get the numbering of the 85 pocket residues
cmd = "http://klifs.vu-compmedchem.nl/details.php?structure_id=" + str(struct_id)
Expand Down