diff --git a/examples/python-nullomers/README.md b/examples/python-nullomers/README.md new file mode 100644 index 0000000000..849a02f42a --- /dev/null +++ b/examples/python-nullomers/README.md @@ -0,0 +1,20 @@ +# find-nullomers + +Find k-mers that have an abundance of 0 in the input +files. Canonicalizes k-mers so that forward and reverse complement +k-mers are correctly accounted for; this means that 'TTT' will count +as 'AAA', and only 'AAA' will be output if neither 'AAA' nor 'TTT' +exist. + +To test, run: + +``` +./find-nullomers.py -k 3 tst.fa +``` + +and you should see the output + +``` +AAA +CCC +``` diff --git a/examples/python-nullomers/find-nullomers.py b/examples/python-nullomers/find-nullomers.py old mode 100644 new mode 100755 index c73189ccee..96ef44fcbb --- a/examples/python-nullomers/find-nullomers.py +++ b/examples/python-nullomers/find-nullomers.py @@ -1,34 +1,52 @@ #! /usr/bin/env python +""" +Find k-mers that have abundance 0 (fwd and rev comp) in input files. + +Uses exact counting and requires 4**K / 8 bytes, so don't use this +for large k :). + +Usage: + + ./find-nullomers.py -k 11 inp.fasta inp2.fasta inp3.fasta +""" +import sys import khmer import argparse import screed -K = 11 -SIZE = 4**K # important: use exact counting. def main(): p = argparse.ArgumentParser() p.add_argument('contigs', nargs='+') + p.add_argument('-k', '--ksize', type=int, default=11) args = p.parse_args() - assert K % 2 == 1, "K must be odd" + assert args.ksize % 2 == 1, "K must be odd" + assert args.ksize > 1, "K must be larger than 1" + + SIZE=4**args.ksize # use exact counting - print('allocating lots of memory for exact counts: {} bytes'.format(8*SIZE)) - # use Countgraph because we want to use the predictable hash function + print('allocating lots of memory for exact counts: {} bytes'.format(SIZE / 8), file=sys.stderr) + + # use Nodegraph because we want to use the predictable hash function # that only goes up to 4**K. - ct = khmer.Countgraph(K, SIZE, 1) + ct = khmer.Nodegraph(args.ksize, SIZE, 1) for filename in args.contigs: - print('consuming {}'.format(filename)) + print("consuming '{}'".format(filename), file=sys.stderr) ct.consume_seqfile(filename) - print('...done!') + print('...done!', file=sys.stderr) - print('Iterating over all {}-mers'.format(K)) + print('Iterating over all {}-mers'.format(args.ksize), file=sys.stderr) # for large K, this is going to end up producing a massive amount of # output... for i in range(SIZE): - print(ct.reverse_hash(i), ct.get(i)) + # the point of this is to canonicalize k-mers to less of RC. + kmer = ct.reverse_hash(i) + if ct.get(kmer) == 0: + print(kmer) + ct.add(kmer) if __name__ == '__main__': diff --git a/examples/python-nullomers/tst.fa b/examples/python-nullomers/tst.fa new file mode 100644 index 0000000000..e5ef1be002 --- /dev/null +++ b/examples/python-nullomers/tst.fa @@ -0,0 +1,118 @@ +>foo +AAT +>foo +AAC +>foo +AAG +>foo +ATA +>foo +ATT +>foo +ATC +>foo +ATG +>foo +ACA +>foo +ACT +>foo +ACC +>foo +ACG +>foo +AGA +>foo +AGT +>foo +AGC +>foo +AGG +>foo +TAA +>foo +TAT +>foo +TAC +>foo +TAG +>foo +TTA +>foo +TTC +>foo +TTG +>foo +TCA +>foo +TCT +>foo +TCC +>foo +TCG +>foo +TGA +>foo +TGT +>foo +TGC +>foo +TGG +>foo +CAA +>foo +CAT +>foo +CAC +>foo +CAG +>foo +CTA +>foo +CTT +>foo +CTC +>foo +CTG +>foo +CCA +>foo +CCT +>foo +CCG +>foo +CGA +>foo +CGT +>foo +CGC +>foo +CGG +>foo +GAA +>foo +GAT +>foo +GAC +>foo +GAG +>foo +GTA +>foo +GTT +>foo +GTC +>foo +GTG +>foo +GCA +>foo +GCT +>foo +GCC +>foo +GCG +>foo +GGA +>foo +GGC