-
Notifications
You must be signed in to change notification settings - Fork 29
/
emirge_rename_fasta.py
executable file
·164 lines (135 loc) · 6.12 KB
/
emirge_rename_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/python2
"""
EMIRGE: Expectation-Maximization Iterative Reconstruction of Genes from the Environment
Copyright (C) 2010-2016 Christopher S. Miller ([email protected])
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
https://github.com/csmiller/EMIRGE
for help, type:
emirge_rename_fasta.py --help
re-write the fasta file to stdout with renamed sequences:
>SEQ_ID|ORIG_SEQ_NAME prior_prob prior/length
sort by decending prior probability
bases with no read support are labeled 'N', and terminal N's are trimmed
"""
import sys
import os
from optparse import OptionParser
from Bio import SeqIO, Seq
import copy
from math import log, e
import cPickle
from gzip import GzipFile
import numpy
import re
USAGE = \
"""usage: %prog [options] <iter.DIR> > renamed.fasta
%prog rewrites an emirge fasta file to include proper
sequence names and prior probabilities (abundance estimates) in the
record headers, and sorts the sequences from most to least abundant
iter.DIR is one of the iteration directories created by emirge (for
example: emirge_working_dir/iter.40). If no iter.DIR is given,
%prog assumes that iter.DIR is the current working
directory.
Note that, with default options, bases with no read support are
labeled 'N', and terminal N's are trimmed"""
# from emirge.py: self.DEFAULT_ERROR = 0.05
DEFAULT_ERROR = 0.05
def replace_with_Ns(probN, seq_i, seq, trim_N = True):
"""
IN: probN matrix, seq_i for sequence, seq (BioPython Seq object)
OUT: returns the sequence where bases with no read support are replaced with "N", and terminal N's are trimmed
"""
default_probN = 1.0 - DEFAULT_ERROR
try:
this_probN = probN[seq_i]
except:
print >> sys.stderr, seq_i, len(probN)
raise
indices = numpy.where(numpy.max(this_probN, axis=1) == default_probN)
newseq = numpy.array(str(seq), dtype='c')
newseq[indices] = 'N'
newseq = ''.join(newseq)
if trim_N:
newseq = newseq.strip("N")
return Seq.Seq(newseq)
def rename(wd = os.getcwd(), prob_min = None, record_prefix = '', no_N = False, no_trim_N = True):
"""
wd is an iteration directory
prob_min is minimum prior prob of a sequence to include in output. If None, include all seqs.
"""
if prob_min is None:
prob_min = -1
current_iter = int(wd.split('.')[-1])
prior_file = file(os.path.join(wd, 'priors.iter.%02d.txt'%current_iter))
probN_filename = os.path.join(wd, 'probN.pkl.gz')
probN = cPickle.load(GzipFile(probN_filename))
name2seq_i = {}
name2prior = {}
name2normed_prior = {} # normed by length of sequences
for line in prior_file:
atoms = line.split()
name2seq_i[atoms[1]] = int(atoms[0])
name2prior[atoms[1]] = atoms[2]
sorted_records = []
for record in SeqIO.parse(file(os.path.join(wd, "iter.%02d.cons.fasta"%current_iter)), "fasta"):
name = record.description.split()[0]
record.id = "%s%d|%s"%(record_prefix, name2seq_i[name], name)
if not no_N:
record.seq = replace_with_Ns(probN, name2seq_i[name], record.seq, not no_trim_N)
if len(record.seq) == 0:
continue
record.description = ""
p = float(name2prior[name])
if p == 0:
continue
sorted_records.append((p, record))
# normalize priors by length
sum_lengths = sum(len(record.seq) for prior, record in sorted_records)
normed_priors = [prior/ len(record.seq) for prior, record in sorted_records]
sum_norm = float(sum(normed_priors))
normed_priors = [x/sum_norm for x in normed_priors]
for i, (prior, record) in enumerate(sorted_records):
record.description = "Prior=%06f Length=%d NormPrior=%06f"%(prior, len(record.seq), normed_priors[i])
for prior, record in sorted(sorted_records, lambda x,y: cmp(x[0], y[0]), reverse=True):
if prior < prob_min:
break
sys.stdout.write(record.format('fasta'))
def main(argv = None):
if argv is None:
argv = sys.argv[1:]
parser = OptionParser(USAGE)
parser.add_option('-p', '--prob_min',
type='float',
help='Only include sequences in output with prior probability above PROB_MIN (Default: include all sequences)')
parser.add_option('-r', '--record_prefix',
type='string', default = '',
help='Add the specified prefix to each fasta record title')
parser.add_option('-n', '--no_N',
action="store_true",
help="Don't change bases with no read support to N. Caution: these bases are not supported by reads in the input data, but will usually be from a closely related sequence.")
parser.add_option('-t', '--no_trim_N',
action="store_true",
help="Don't trim off N bases with no read support from ends of sequences. Ignored if --no_N is also passed")
(options, args) = parser.parse_args(argv)
if len(args) == 0:
wd = os.getcwd()
elif len(args) == 1:
wd = os.path.abspath(args[0])
else:
parser.error("Found more than one argument on command line (expects only a single iter.DIR plus any options): %s"%(args))
if not os.path.exists(wd):
parser.error("Directory not found: %s"%(wd))
if re.match("iter.[0-9]+$", os.path.basename(wd)) is None:
parser.error("Directory %s is not a valid emirge iter.DIR (try --help for full usage)"%(wd))
rename(wd, options.prob_min, options.record_prefix, options.no_N, options.no_trim_N)
if __name__ == '__main__':
main()