dib-lab · ctb · Sep 1, 2017 · Sep 1, 2017 · Sep 14, 2017 · Nov 10, 2017
diff --git a/examples/python-nullomers/README.md b/examples/python-nullomers/README.md
@@ -0,0 +1,20 @@
+# find-nullomers
+
+Find k-mers that have an abundance of 0 in the input
+files. Canonicalizes k-mers so that forward and reverse complement
+k-mers are correctly accounted for; this means that 'TTT' will count
+as 'AAA', and only 'AAA' will be output if neither 'AAA' nor 'TTT'
+exist.
+
+To test, run:
+
+```
+./find-nullomers.py -k 3 tst.fa
+```
+
+and you should see the output
+
+```
+AAA
+CCC
+```
diff --git a/examples/python-nullomers/find-nullomers.py b/examples/python-nullomers/find-nullomers.py
@@ -1,34 +1,52 @@
 #! /usr/bin/env python
+"""
+Find k-mers that have abundance 0 (fwd and rev comp) in input files.
+
+Uses exact counting and requires 4**K / 8 bytes, so don't use this
+for large k :).
+
+Usage:
+
+   ./find-nullomers.py -k 11 inp.fasta inp2.fasta inp3.fasta
+"""
+import sys
 import khmer
 import argparse
 import screed
 
-K = 11
-SIZE = 4**K                               # important: use exact counting.
 
 def main():
     p = argparse.ArgumentParser()
     p.add_argument('contigs', nargs='+')
+    p.add_argument('-k', '--ksize', type=int, default=11)
     args = p.parse_args()
 
-    assert K % 2 == 1, "K must be odd"
+    assert args.ksize % 2 == 1, "K must be odd"
+    assert args.ksize > 1, "K must be larger than 1"
+
+    SIZE=4**args.ksize                               # use exact counting
 
-    print('allocating lots of memory for exact counts: {} bytes'.format(8*SIZE))
-    # use Countgraph because we want to use the predictable hash function
+    print('allocating lots of memory for exact counts: {} bytes'.format(SIZE / 8), file=sys.stderr)
+
+    # use Nodegraph because we want to use the predictable hash function
     # that only goes up to 4**K.
-    ct = khmer.Countgraph(K, SIZE, 1)
+    ct = khmer.Nodegraph(args.ksize, SIZE, 1)
 
     for filename in args.contigs:
-        print('consuming {}'.format(filename))
+        print("consuming '{}'".format(filename), file=sys.stderr)
         ct.consume_seqfile(filename)
-    print('...done!')
+    print('...done!', file=sys.stderr)
 
-    print('Iterating over all {}-mers'.format(K))
+    print('Iterating over all {}-mers'.format(args.ksize), file=sys.stderr)
 
     # for large K, this is going to end up producing a massive amount of
     # output...
     for i in range(SIZE):
-        print(ct.reverse_hash(i), ct.get(i))
+        # the point of this is to canonicalize k-mers to less of RC.
+        kmer = ct.reverse_hash(i)
+        if ct.get(kmer) == 0:
+            print(kmer)
+            ct.add(kmer)
 
 
 if __name__ == '__main__':

diff --git a/examples/python-nullomers/tst.fa b/examples/python-nullomers/tst.fa
@@ -0,0 +1,118 @@
+>foo
+AAT
+>foo
+AAC
+>foo
+AAG
+>foo
+ATA
+>foo
+ATT
+>foo
+ATC
+>foo
+ATG
+>foo
+ACA
+>foo
+ACT
+>foo
+ACC
+>foo
+ACG
+>foo
+AGA
+>foo
+AGT
+>foo
+AGC
+>foo
+AGG
+>foo
+TAA
+>foo
+TAT
+>foo
+TAC
+>foo
+TAG
+>foo
+TTA
+>foo
+TTC
+>foo
+TTG
+>foo
+TCA
+>foo
+TCT
+>foo
+TCC
+>foo
+TCG
+>foo
+TGA
+>foo
+TGT
+>foo
+TGC
+>foo
+TGG
+>foo
+CAA
+>foo
+CAT
+>foo
+CAC
+>foo
+CAG
+>foo
+CTA
+>foo
+CTT
+>foo
+CTC
+>foo
+CTG
+>foo
+CCA
+>foo
+CCT
+>foo
+CCG
+>foo
+CGA
+>foo
+CGT
+>foo
+CGC
+>foo
+CGG
+>foo
+GAA
+>foo
+GAT
+>foo
+GAC
+>foo
+GAG
+>foo
+GTA
+>foo
+GTT
+>foo
+GTC
+>foo
+GTG
+>foo
+GCA
+>foo
+GCT
+>foo
+GCC
+>foo
+GCG
+>foo
+GGA
+>foo
+GGC
-Original file line number
+Diff line change
@@ -0,0 +1,118 @@
+    >foo
+    AAT
+    >foo
+    AAC
+    >foo
+    AAG
+    >foo
+    ATA
+    >foo
+    ATT
+    >foo
+    ATC
+    >foo
+    ATG
+    >foo
+    ACA
+    >foo
+    ACT
+    >foo
+    ACC
+    >foo
+    ACG
+    >foo
+    AGA
+    >foo
+    AGT
+    >foo
+    AGC
+    >foo
+    AGG
+    >foo
+    TAA
+    >foo
+    TAT
+    >foo
+    TAC
+    >foo
+    TAG
+    >foo
+    TTA
+    >foo
+    TTC
+    >foo
+    TTG
+    >foo
+    TCA
+    >foo
+    TCT
+    >foo
+    TCC
+    >foo
+    TCG
+    >foo
+    TGA
+    >foo
+    TGT
+    >foo
+    TGC
+    >foo
+    TGG
+    >foo
+    CAA
+    >foo
+    CAT
+    >foo
+    CAC
+    >foo
+    CAG
+    >foo
+    CTA
+    >foo
+    CTT
+    >foo
+    CTC
+    >foo
+    CTG
+    >foo
+    CCA
+    >foo
+    CCT
+    >foo
+    CCG
+    >foo
+    CGA
+    >foo
+    CGT
+    >foo
+    CGC
+    >foo
+    CGG
+    >foo
+    GAA
+    >foo
+    GAT
+    >foo
+    GAC
+    >foo
+    GAG
+    >foo
+    GTA
+    >foo
+    GTT
+    >foo
+    GTC
+    >foo
+    GTG
+    >foo
+    GCA
+    >foo
+    GCT
+    >foo
+    GCC
+    >foo
+    GCG
+    >foo
+    GGA
+    >foo
+    GGC