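### slimitdown.py
### Maps a list of GO terms to the GO slim terms that cover them, using the
### latest go.obo release. A sketch of the expected invocation, assuming the
### input is a headerless CSV whose second column holds GO IDs (as read with
### pandas below); the filename here is only an illustration:
###   python slimitdown.py my_go_terms.csv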
import pandas as pd
import itertools
import os
import sys
from itertools import groupby
import subprocess

def runcmd(cmd, verbose=False, *args, **kwargs):
    ### this function allows us to run command line operations
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        shell=True
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
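
### e.g. runcmd('ls', verbose=True) would print a directory listing
### (illustrative example); below, runcmd is used only to delete and
### re-download go.obo.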
check_file = os.path.isfile('go.obo')
if check_file:
    runcmd('rm go.obo')
    ### this removes the old go.obo file if one already exists
runcmd('wget http://purl.obolibrary.org/obo/go.obo', verbose=True)
### this uses the runcmd function to fetch the latest version of go.obo
inputfile = sys.argv[1]
### save the path to the user's input file
with open('go.obo') as f:
    obo = f.readlines()
z = [list(group) for key, group in groupby(obo, key=lambda x: x == '[Term]\n') if not key]
### this takes the obo file and turns it into a list of line blocks, one per GO term
cleaned = z[1:]
### this removes the header and summary information
format_version = z[0][0]
date_version = z[0][1]
print('obo', format_version)
print('obo', date_version)
### this prints the version of the go.obo file for the user
child = []
parent = []
subset = []
names = []
name_space = []
id_alt = []
temp_names = []
temp_parent = []
temp_name_space = []
temp_subset = []
temp_id_alt = []
par_w_def = []
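### Each block in `cleaned` is a list of lines from one [Term] stanza. The
### parsing below assumes the usual go.obo layout, roughly (illustrative):
###   id: GO:0000001
###   name: <term name>
###   namespace: biological_process
###   alt_id: GO:0000002
###   subset: goslim_generic
###   is_a: GO:0000003 ! <parent term name>
### with GO IDs exactly 10 characters long (hence the a[6:16] slice for is_a).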
for i in cleaned:
    child.append(i[0][4:-1])
    for a in i:
        if 'name:' in a:
            temp_names.append(a[6:-1])
        if 'namespace' in a:
            temp_name_space.append(a[10:-1])
        if 'is_a' in a:
            temp_parent.append(a[6:16])
            par_w_def.append(a[6:-1])
        if 'subset' in a:
            temp_subset.append(a[8:-1])
        if 'alt_id' in a:
            temp_id_alt.append(a[8:-1])
    parent.append(temp_parent)
    name_space.append(temp_name_space)
    subset.append(temp_subset)
    id_alt.append(temp_id_alt)
    names.append(temp_names)
    temp_name_space = []
    temp_id_alt = []
    temp_parent = []
    temp_subset = []
    temp_names = []
### this short section of code creates lists of all go terms (child),
### each of their parents (parent; [] if none),
### whether each term is in a subset (subset),
### each go term's alternate ids (id_alt),
### and each go term's name (names)
all_names = {}
for i in range(len(child)):
    all_names[child[i]] = names[i]
### this creates a dictionary of all the GO terms and their names
all_name_space = {}
for i in range(len(child)):
    all_name_space[child[i]] = name_space[i]
### this creates a dictionary of all the GO terms and their namespaces
myList = [i.split(' ! ') for i in par_w_def]
define_dic = {}
for i in myList:
    define_dic[i[0]] = i[-1]
### this creates a dictionary mapping each parent term's GO ID to its name
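### e.g. an is_a line such as 'is_a: GO:0000003 ! <parent name>' (illustrative)
### ends up as define_dic['GO:0000003'] = '<parent name>'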
child_parent_map = {}
for i in range(len(child)):
    child_parent_map[child[i]] = parent[i]
### this maps each child go term to its list of parents
is_slim = {}
for i in range(len(child)):
    if subset[i] != []:
        is_slim[child[i]] = subset[i]
### this creates a dictionary of child terms that are already in a slim set
is_alt = {}
for i in range(len(child)):
    if id_alt[i] != []:
        is_alt[child[i]] = id_alt[i]
### this creates a dictionary of terms that have alternative ids
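### At this point the main lookup tables look roughly like (illustrative):
###   child_parent_map['GO:0000001'] -> ['GO:0000003', ...]
###   is_slim['GO:0000001']          -> ['goslim_generic', ...]
###   is_alt['GO:0000001']           -> ['GO:0000002', ...]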
user_input = pd.read_csv(inputfile, sep=',', header=None)
user_go = list(user_input[1])
already_slims = []
checker = []
checked_users = []
for i in user_go:
    if i in is_slim:
        already_slims.append(i)
    else:
        if i in child_parent_map:
            checked_users.append(i)
            checker.append(list(child_parent_map[i]))
            ### a copy of the parent list, so the walk below does not mutate child_parent_map
### this determines whether any of the user inputs are already in a slim
### and then gets the parent terms of the remaining children
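### checker[k] holds the parent IDs still to be walked for the user term stored
### at checked_users[k]; the two lists are kept in step by the function below.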
summary_file = []
obselete = []
def get_allslims(checker, summary_file, checked_users, obselete, child_parent_map):
    ### this function loops up the GO term tree until it finds a slim term
    ### that encompasses each input term. It also determines whether any of
    ### the terms are obsolete (terms with no parents at all are flagged).
    count = 0
    for i in checker:
        if i != []:
            for j in i:
                if j in is_slim:
                    summary_file.append([checked_users[count], j])
                    i.remove(j)
                if j not in is_slim:
                    new = child_parent_map[j]
                    i.remove(j)
                    for v in new:
                        i.append(v)
            if i == []:
                checker.remove(i)
                checked_users.remove(checked_users[count])
            count += 1
        else:
            obselete.append([checked_users[count], 'obselete'])
            checked_users.remove(checked_users[count])
            checker.remove(i)
            count += 1
while checker:
    get_allslims(checker, summary_file, checked_users, obselete, child_parent_map)
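### get_allslims mutates checker in place, so it is simply re-run until every
### entry has either been matched to a slim term or flagged as obsolete and
### checker is empty.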
tot = list(is_alt.values())
keys = list(is_alt.keys())
alt_id_fnder = []
look = []
new_go = []
for i in summary_file:
    look.append(i[0])
for f in obselete:
    look.append(f[0])
for zed in already_slims:
    look.append(zed)
for c in user_go:
    if c not in look:
        for i in range(len(tot)):
            for j in tot[i]:
                if c == j:
                    new_go.append([keys[i]])
                    alt_id_fnder.append(c)
### for any user go terms that are still unresolved, look for their alt_ids and store them
while new_go:
    get_allslims(new_go, summary_file, alt_id_fnder, obselete, child_parent_map)
### then try again with the alt ids
summary_file.sort()
unique_summary_file = list(summary_file for summary_file, _ in itertools.groupby(summary_file))
### removing exact duplicates
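### itertools.groupby only collapses runs of adjacent identical pairs, which is
### why summary_file is sorted first so that duplicates sit next to each other.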
slimmed = []
for i in unique_summary_file:
    slimmed.append(i[1])
### this reformats the pairs into a flat list of slim terms
slimmed.extend(already_slims)
### this adds the already-slimmed terms from the user input to the slimmed terms
final_counts = pd.Series(slimmed).value_counts()
finaldf = pd.DataFrame(final_counts)
names = list(finaldf.index)
defs = []
for i in names:
    if i in define_dic:
        defs.append(define_dic[i])
    else:
        defs.append(str(all_names[i]))
finaldf['Names'] = defs
### this counts the number of occurrences of each parent go term and attaches its name
function = []
for i in names:
    function.append(all_name_space[i])
finaldf['Namespace'] = function
unique_summary_file2 = []
for i in unique_summary_file:
    first = i[0]
    second = i[1]
    unique_summary_file2.append([first, str(all_names[first]), second, str(all_names[second])])
# this formats the summary file
for i in already_slims:
    unique_summary_file2.append([i, str(all_names[i]), i, str(all_names[i])])
# this adds the already-slim terms to the summary file
finaldf['Names'] = finaldf['Names'].str.replace(r"^\['|'\]$", "", regex=True)
finaldf.to_csv('frequency_table.tsv', sep="\t", header=['obo_format', format_version, date_version])
sumdf = pd.DataFrame(unique_summary_file2, columns=['User_GO:IDS', 'User_Go_Defs', 'Slim_GO:IDS', 'Slim_Go_Defs'])
sumdf['User_Go_Defs'] = sumdf['User_Go_Defs'].str.replace(r"^\['|'\]$", "", regex=True)
sumdf['Slim_Go_Defs'] = sumdf['Slim_Go_Defs'].str.replace(r"^\['|'\]$", "", regex=True)
sumdf.to_csv('summary_file.tsv', sep="\t")
obseletedf = pd.DataFrame(obselete)
obseletedf.to_csv('obselete.tsv', sep="\t")
### all of these steps write the dataframes out as tsv files
### the str.replace statements strip the square brackets left over from the list-valued names
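### Outputs written to the working directory:
###   frequency_table.tsv - counts of each slim term across the user's input
###   summary_file.tsv    - each user GO ID paired with the slim term it rolled up to
###   obselete.tsv        - user GO IDs whose parent walk ended without reaching a slim term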