Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions examples/python-nullomers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# find-nullomers

Find k-mers that have an abundance of 0 in the input
files. Canonicalizes k-mers so that forward and reverse complement
k-mers are correctly accounted for; this means that 'TTT' will count
as 'AAA', and only 'AAA' will be output if neither 'AAA' nor 'TTT'
exist.

To test, run:

```
./find-nullomers.py -k 3 tst.fa
```

and you should see the output

```
AAA
CCC
```
38 changes: 28 additions & 10 deletions examples/python-nullomers/find-nullomers.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,34 +1,52 @@
#! /usr/bin/env python
"""
Find k-mers that have abundance 0 (fwd and rev comp) in input files.

Uses exact counting and requires 4**K / 8 bytes, so don't use this
for large k :).

Usage:

./find-nullomers.py -k 11 inp.fasta inp2.fasta inp3.fasta
"""
import sys
import khmer
import argparse
import screed

K = 11
SIZE = 4**K # important: use exact counting.

def main():
p = argparse.ArgumentParser()
p.add_argument('contigs', nargs='+')
p.add_argument('-k', '--ksize', type=int, default=11)
args = p.parse_args()

assert K % 2 == 1, "K must be odd"
assert args.ksize % 2 == 1, "K must be odd"
assert args.ksize > 1, "K must be larger than 1"

SIZE=4**args.ksize # use exact counting

print('allocating lots of memory for exact counts: {} bytes'.format(8*SIZE))
# use Countgraph because we want to use the predictable hash function
print('allocating lots of memory for exact counts: {} bytes'.format(SIZE / 8), file=sys.stderr)

# use Nodegraph because we want to use the predictable hash function
# that only goes up to 4**K.
ct = khmer.Countgraph(K, SIZE, 1)
ct = khmer.Nodegraph(args.ksize, SIZE, 1)

for filename in args.contigs:
print('consuming {}'.format(filename))
print("consuming '{}'".format(filename), file=sys.stderr)
ct.consume_seqfile(filename)
print('...done!')
print('...done!', file=sys.stderr)

print('Iterating over all {}-mers'.format(K))
print('Iterating over all {}-mers'.format(args.ksize), file=sys.stderr)

# for large K, this is going to end up producing a massive amount of
# output...
for i in range(SIZE):
print(ct.reverse_hash(i), ct.get(i))
# the point of this is to canonicalize k-mers to less of RC.
kmer = ct.reverse_hash(i)
if ct.get(kmer) == 0:
print(kmer)
ct.add(kmer)


if __name__ == '__main__':
Expand Down
118 changes: 118 additions & 0 deletions examples/python-nullomers/tst.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
>foo
AAT
>foo
AAC
>foo
AAG
>foo
ATA
>foo
ATT
>foo
ATC
>foo
ATG
>foo
ACA
>foo
ACT
>foo
ACC
>foo
ACG
>foo
AGA
>foo
AGT
>foo
AGC
>foo
AGG
>foo
TAA
>foo
TAT
>foo
TAC
>foo
TAG
>foo
TTA
>foo
TTC
>foo
TTG
>foo
TCA
>foo
TCT
>foo
TCC
>foo
TCG
>foo
TGA
>foo
TGT
>foo
TGC
>foo
TGG
>foo
CAA
>foo
CAT
>foo
CAC
>foo
CAG
>foo
CTA
>foo
CTT
>foo
CTC
>foo
CTG
>foo
CCA
>foo
CCT
>foo
CCG
>foo
CGA
>foo
CGT
>foo
CGC
>foo
CGG
>foo
GAA
>foo
GAT
>foo
GAC
>foo
GAG
>foo
GTA
>foo
GTT
>foo
GTC
>foo
GTG
>foo
GCA
>foo
GCT
>foo
GCC
>foo
GCG
>foo
GGA
>foo
GGC