From b906b74235848699eea2c3cf96d1662adf8fa054 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 18 Jun 2018 15:46:51 -0700 Subject: [PATCH] Experiment composing counttables onto a nodetable given a max abundance criterion --- include/oxli/hashtable.hh | 18 +++++++++++++++++ include/oxli/storage.hh | 41 +++++++++++++++++++++++++++++++++++---- khmer/_oxli/graphs.pxd | 2 ++ khmer/_oxli/graphs.pyx | 6 ++++++ tests/test_counttable.py | 30 ++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 4 deletions(-) diff --git a/include/oxli/hashtable.hh b/include/oxli/hashtable.hh index 9e72c9dad3..fa2d71f3c7 100644 --- a/include/oxli/hashtable.hh +++ b/include/oxli/hashtable.hh @@ -624,6 +624,24 @@ class Nodetable : public oxli::MurmurHashtable public: explicit Nodetable(WordLength ksize, std::vector sizes) : MurmurHashtable(ksize, new BitStorage(sizes)) { } ; + + void compose_init(Counttable& other, BoundedCounterType max) + { + if (get_tablesizes() != other.get_tablesizes()) { + std::cerr << "Table size mismatch\n"; + return; + } + store->compose_init(other.get_raw_tables(), max); + } + + void compose_update(Counttable& other, BoundedCounterType max) + { + if (get_tablesizes() != other.get_tablesizes()) { + std::cerr << "Table size mismatch\n"; + return; + } + store->compose_update(other.get_raw_tables(), max); + } }; } diff --git a/include/oxli/storage.hh b/include/oxli/storage.hh index 33fb6f7f73..fd08fa6a37 100644 --- a/include/oxli/storage.hh +++ b/include/oxli/storage.hh @@ -58,6 +58,7 @@ class Storage protected: bool _supports_bigcount; bool _use_bigcount; + Byte ** _counts; public: Storage() : _supports_bigcount(false), _use_bigcount(false) { } ; @@ -75,6 +76,42 @@ public: void set_use_bigcount(bool b); bool get_use_bigcount(); + + inline void compose_init(Byte **othertable, BoundedCounterType max) + { + size_t _n_tables = n_tables(); + std::vector _tablesizes = get_tablesizes(); + + for (size_t row = 0; row < _n_tables; row++) { + for (size_t bin = 0; bin < _tablesizes[row]; bin++) { + BoundedCounterType count = othertable[row][bin]; + if (count <= max) { + unsigned char bit = bin % 8; + _counts[row][bin] |= 1UL << bit; + } + } + } + } + + inline void compose_update(Byte **othertable, BoundedCounterType max) + { + size_t _n_tables = n_tables(); + std::vector _tablesizes = get_tablesizes(); + + for (size_t row = 0; row < _n_tables; row++) { + for (size_t bin = 0; bin < _tablesizes[row]; bin++) { + BoundedCounterType count = othertable[row][bin]; + unsigned char bit = bin % 8; + unsigned char isset = (_counts[row][bin] >> bit) & 1U; + if (count <= max && isset) { + _counts[row][bin] |= 1UL << bit; + } + else { + _counts[row][bin] &= ~(1UL << bit); + } + } + } + } }; @@ -99,7 +136,6 @@ protected: size_t _n_tables; uint64_t _occupied_bins; uint64_t _n_unique_kmers; - Byte ** _counts; public: BitStorage(std::vector& tablesizes) : @@ -252,7 +288,6 @@ protected: uint64_t _n_unique_kmers; std::array mutexes; static constexpr uint8_t _max_count{15}; - Byte ** _counts; // Compute index into the table, this retrieves the correct byte // which you then need to select the correct nibble from @@ -496,8 +531,6 @@ protected: uint64_t _n_unique_kmers; uint64_t _occupied_bins; - Byte ** _counts; - // initialize counts with empty hashtables. void _allocate_counters() { diff --git a/khmer/_oxli/graphs.pxd b/khmer/_oxli/graphs.pxd index 52d15991af..90cf92c91d 100644 --- a/khmer/_oxli/graphs.pxd +++ b/khmer/_oxli/graphs.pxd @@ -120,6 +120,8 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil: cdef cppclass CpNodetable "oxli::Nodetable" (CpMurmurHashtable): CpNodetable(WordLength, vector[uint64_t]) + void compose_init(CpCounttable&, BoundedCounterType) + void compose_update(CpCounttable&, BoundedCounterType) cdef cppclass CpQFCounttable "oxli::QFCounttable" (CpHashtable): CpQFCounttable(WordLength, uint64_t) except +oxli_raise_py_error diff --git a/khmer/_oxli/graphs.pyx b/khmer/_oxli/graphs.pyx index 62881c9166..d901b4eec2 100644 --- a/khmer/_oxli/graphs.pyx +++ b/khmer/_oxli/graphs.pyx @@ -459,6 +459,12 @@ cdef class Nodetable(Hashtable): self._nt_this = make_shared[CpNodetable](k, _primes) self._ht_this = self._nt_this + def compose_init(self, Counttable ct, int max): + deref(self._nt_this).compose_init(deref(ct._ct_this), max) + + def compose_update(self, Counttable ct, int max): + deref(self._nt_this).compose_update(deref(ct._ct_this), max) + cdef class Hashgraph(Hashtable): diff --git a/tests/test_counttable.py b/tests/test_counttable.py index 0b8b33deaf..f1e22726a1 100755 --- a/tests/test_counttable.py +++ b/tests/test_counttable.py @@ -199,3 +199,33 @@ def test_init_with_primes(sketchtype): primes = khmer.get_n_primes_near_x(4, random.randint(1000, 2000)) sketch = sketchtype(31, 1, 1, primes=primes) assert sketch.hashsizes() == primes + + +def test_compose(): + counts1 = khmer.Counttable(21, 1e4, 4) + counts2 = khmer.Counttable(21, 1e4, 4) + + counts1.add('GATTACAGATTACAGATTACA') + counts2.add('GATTACAGATTACAGATTACA') + + counts1.add('TAGATCTGCTTGAAACAAGTG') + for _ in range(5): + counts2.add('TAGATCTGCTTGAAACAAGTG') + + for _ in range(5): + counts1.add('AAGTGGATTTGAGAAAAAAGT') + counts2.add('AAGTGGATTTGAGAAAAAAGT') + + for _ in range(5): + counts1.add('GGGGGGGGGGGGGGGGGGGGG') + counts2.add('GGGGGGGGGGGGGGGGGGGGG') + + kmers = khmer.Nodetable(21, 1e4, 4) + kmers.compose_init(counts1, 3) + kmers.compose_update(counts2, 3) + + assert kmers.get('GATTACAGATTACAGATTACA') == 1 + assert kmers.get('TAGATCTGCTTGAAACAAGTG') == 0 + assert kmers.get('AAGTGGATTTGAGAAAAAAGT') == 0 + assert kmers.get('GGGGGGGGGGGGGGGGGGGGG') == 0 + assert kmers.get('AAAAAAAAAAAAAAAAAAAAA') == 1