forked from BT18D011/Seq2Feature
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequence_based_features2_1.py
116 lines (107 loc) · 4.43 KB
/
sequence_based_features2_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python2.7
# Imports for tabular data (pandas), regex parsing, filesystem access and command-line arguments
import pandas as pd
import re
import os
import argparse
#import glob
#import string
#import urllib
if __name__ == "__main__":
    # Command-line interface. Only the 'AAP' (amino acid properties) choice
    # is handled by this script; the SUB / CON / ALL options that used to be
    # sketched here as commented-out arguments are not wired up.
    parser = argparse.ArgumentParser()
    # Required: without a path there is nothing to open, and argparse gives
    # a usage error rather than a TypeError from open(None).
    parser.add_argument("--input_file", required=True, help="Input file")
    parser.add_argument("--property", help="""Enter 'AAP' for Amino acid properties""")
    args = parser.parse_args()

    # Accumulators for the per-mutation property differences
    # (numerical and normalized tables respectively).
    newDF_49 = pd.DataFrame()
    newDF_491 = pd.DataFrame()

    # Read every line except the first (treated as a header) and close the
    # file handle deterministically instead of leaking it.
    with open(args.input_file, 'r') as input_handle:
        f1 = input_handle.readlines()[1:]
################ Left right pref#################
def user_prop(mutation, AA_dist):
    """Return the wild-type minus mutant value of a user-supplied property.

    mutation -- mutation string whose first character is the wild-type
                residue and whose last character is the mutant residue.
    AA_dist  -- mapping from residue letter to a value accepted by int().

    Raises IndexError for an empty mutation string and KeyError when a
    residue is missing from AA_dist.
    """
    mutant_residue = mutation[-1]
    wild_residue = mutation[0]
    wild_value = int(AA_dist[wild_residue])
    mutant_value = int(AA_dist[mutant_residue])
    return wild_value - mutant_value
def left_pref(seq, mutation):
    """Return the residues flanking a mutated position as a one-row DataFrame.

    seq      -- sequence string; the position in `mutation` is 1-based.
    mutation -- mutation string; all digits in it are concatenated to form
                the position (e.g. 'A12G' -> 12).

    The result always has the columns 'Mutation', 'N_Terminal' and
    'C_Terminal'.  'N_Terminal' is the residue just before the position and
    'C_Terminal' the residue just after it.  When the position is the first
    residue, the last residue, or lies outside the sequence, a neighbour is
    missing and both flank columns hold the empty string.

    Raises ValueError when `mutation` contains no digits.
    """
    digits = ''.join(re.findall(r'\d+', mutation))
    if not digits:
        raise ValueError(
            "mutation %r does not contain a residue position" % (mutation,))
    pos = int(digits)

    if pos <= 1 or pos >= len(seq):
        # No complete pair of neighbours: report both flanks as empty.
        n_terminal = ''
        c_terminal = ''
    else:
        # pos is 1-based, so the mutated residue is seq[pos - 1].
        n_terminal = seq[pos - 2]
        c_terminal = seq[pos]

    return pd.DataFrame(
        {'Mutation': [mutation],
         'N_Terminal': [n_terminal],
         'C_Terminal': [c_terminal]},
        columns=['Mutation', 'N_Terminal', 'C_Terminal'])
# Property tables already read from disk, keyed by CSV path.  The driver
# calls prop_49 / prop_491 once per candidate mutation, so re-reading the
# same file on every call would dominate the run time.
_PROPERTY_TABLES = {}


def _load_property_table(csv_path):
    """Return the property table at csv_path, reading it only on first use."""
    table = _PROPERTY_TABLES.get(csv_path)
    if table is None:
        table = pd.read_csv(csv_path, sep=',')
        _PROPERTY_TABLES[csv_path] = table
    return table


def _property_difference(csv_path, mutation):
    """Return table[wild] - table[mutant] for the table stored at csv_path."""
    table = _load_property_table(csv_path)
    mutant_residue = mutation[-1]
    wild_residue = mutation[0]
    # Subtraction builds a new Series, so the cached table is never modified.
    return table[wild_residue] - table[mutant_residue]


def prop_49(seq, mutation):
    """Return wild-type minus mutant property values from the numerical table.

    seq      -- unused; kept so existing callers keep working.
    mutation -- mutation string; its first character is the wild-type
                residue and its last character the mutant residue.  Both
                must be column names of
                './data/49_properties_numerical_Values.csv'.

    Returns a pandas Series with one entry per row of the table.
    Raises KeyError when either residue is not a column of the table.
    """
    return _property_difference(
        './data/49_properties_numerical_Values.csv', mutation)


def prop_491(seq, mutation):
    """Return wild-type minus mutant property values from the normalized table.

    Same contract as prop_49, but the values come from
    './data/49_properties_normalizedValues.csv'.
    """
    return _property_difference(
        './data/49_properties_normalizedValues.csv', mutation)
# Residues each sequence position is substituted with.
AA = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']


def _all_substitutions(seq):
    """List '<residue><0-based index><new residue>' for every position and AA.

    Each (index, new residue) pair yields a distinct string, so no
    duplicate check is needed while building the list.
    """
    return [(residue + str(index) + aa).upper()
            for index, residue in enumerate(seq)
            for aa in AA]


def _mean_difference_row(seq, mutations, prop_fn):
    """Average prop_fn(seq, mutation) over `mutations` as a one-row DataFrame."""
    differences = pd.DataFrame(
        [prop_fn(seq, mutation) for mutation in mutations])
    return differences.mean(axis=0).to_frame().transpose()


# Create the output directory itself.  os.path.dirname("./out_protein") is
# ".", which always exists, so checking the dirname never creates anything.
path = "./out_protein"
if not os.path.isdir(path):
    os.makedirs(path)

newDF = pd.DataFrame()
newDF_ = pd.DataFrame()

if str(args.property) != "AAP":
    # Report the unsupported choice once instead of once per mutation.
    print("!!!Wrong choice!!!")
    print("Please select at least one property ...!!!")
else:
    # Header row for both output files; read once, not once per sequence.
    file1 = pd.read_csv('./data/prop_49_list.csv').transpose()

    numerical_rows = []
    normalized_rows = []
    for li in f1:
        # First tab-separated field is the sequence; X and Z are dropped.
        seq = li.strip().split('\t')[0].replace('X','').replace('Z','')
        if not seq:
            # Blank line or nothing left after filtering: no mutations to average.
            continue
        mutation_list = _all_substitutions(seq)
        # Each row is built from this sequence's mutations only, so a
        # sequence's averages do not depend on the sequences before it.
        numerical_rows.append(
            _mean_difference_row(seq, mutation_list, prop_49))
        normalized_rows.append(
            _mean_difference_row(seq, mutation_list, prop_491))

    # pd.concat works on old and new pandas; DataFrame.append does not.
    if numerical_rows:
        newDF = pd.concat([file1] + numerical_rows)
        newDF_ = pd.concat([file1] + normalized_rows)

    # NOTE(review): drop_duplicates is kept from the original output step; it
    # also collapses identical data rows (e.g. repeated input sequences).
    newDF.drop_duplicates(keep= 'first').to_csv("./out_protein/avg_prop_AAP.csv",index = False,header = False)
    newDF_.drop_duplicates(keep= 'first').to_csv("./out_protein/normalizedValues_avg_prop_AAP.csv",index = False,header = False)