-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate_genes.py
86 lines (70 loc) · 2.87 KB
/
annotate_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import sys
import os
import cobra
from tqdm import tqdm
import memote
import helper_functions as hf
from bioservices.kegg import KEGG
import gffpandas.gffpandas as gffpd
'''
Usage: annotate_genes.py <path_input_sbml-file> <path_output_sbml-file> <path-GFF File> <path_memote-report> <name_organism>
Adds annotations from KEGG and SBO Terms to genes.
'''
def main(args):
# console access
if len(args) != 5:
print(main.__doc__)
sys.exit(1)
infile = args[1]
outfile = args[2]
gff_file = args[3]
memote_report = args[4]
organism_name = args[5]
if not os.path.exists(infile):
print("[Error] %s : No such file." % infile)
sys.exit(1)
# Read in files
model = cobra.io.sbml.read_sbml_model(infile)
df = gffpd.read_gff3(gff_file)
df_attr = df.attributes_to_columns()
kegg = KEGG()
# find organism
req = kegg.lookfor_organism(organism_name)[0].split(' ')
entry = req[0] # 'T00661'
org_id = req[1] # 'fma'
sbo_nr = "SBO:0000243"
# Kegg genes extraction
genes = kegg.list(org_id).split("\n")[:-1]
genome_dict = {g.split("\t")[0].replace("fma:", ""): g.split("\t")[1] for g in genes}
# -- Cobra model annotation
for i in tqdm(range(len(model.genes))):
annotations = {"sbo": sbo_nr}
id_sbml = model.genes[i].id
refseq = id_sbml[:-2] + "." + id_sbml[-1]
gff_query = df_attr["Name"] == refseq
if gff_query.any():
matches = df_attr[gff_query]
for j, row in matches.iterrows():
if df_attr.loc[j - 1, "type"] == "gene" and df_attr.loc[j, "type"] == "CDS":
locus_tag = df_attr.loc[j - 1, "old_locus_tag"]
new_locus_tag = df_attr.loc[j - 1, "locus_tag"]
note = df_attr.loc[j, "Note"]
name = df_attr.loc[j, "Name"]
annotations = hf.dict_add_overlap_to_list(annotations, {"kegg.genes": f"{org_id}:{locus_tag}",
"refseq": refseq})
model.genes[i].annotation = \
hf.dict_add_overlap_to_list(model.genes[i].annotation, annotations)
if note is None:
model.genes[i].notes.update({"locus tag:": new_locus_tag})
else:
model.genes[i].notes.update({"locus tag:": new_locus_tag, "NCBI note:": note})
model.genes[i].name = name
# Export model
cobra.io.sbml.write_sbml_model(model, outfile)
# Make memote report
result = memote.test_model(model, results=True, skip=["test_find_metabolites_not_produced_with_open_bounds"])
report = memote.snapshot_report(result[1], config=None, html=True)
with open(memote_report, "w") as handle:
handle.write(report)
if __name__ == '__main__':
main(sys.argv)