dib-lab · standage · Sep 13, 2017 · Sep 13, 2017 · Sep 13, 2017 · standage
diff --git a/khmer/_oxli/graphs.pxd b/khmer/_oxli/graphs.pxd
@@ -139,15 +139,15 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
 
         void consume_seqfile_and_tag[SeqIO](const string &,
                                    unsigned int,
-                                   unsigned long long) 
+                                   unsigned long long)
 
         # Ugly workaround. For some reason, Cython doesn't like *just this*
         # templated overload -- it chooses whichever was defined last, breaking
         # resolution for either strings of FastxParserPtr. So, we rename it on
         # the Cython side and give it a real name substitution for code gen.
         void consume_seqfile_and_tag_readparser "consume_seqfile_and_tag" [SeqIO](shared_ptr[CpReadParser[SeqIO]],
                                    unsigned int,
-                                   unsigned long long) 
+                                   unsigned long long)
 
         void consume_sequence_and_tag(const string &,
                                       unsigned long long &)
@@ -160,7 +160,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
                                        unsigned int &,
                                        unsigned long long &) except +oxli_raise_py_error
 
-        uintptr_t trim_on_stoptags(string) 
+        uintptr_t trim_on_stoptags(string)
 
         unsigned int traverse_from_kmer(CpKmer,
                                         uint32_t,
@@ -177,7 +177,7 @@ cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
         void load_stop_tags(string, bool) except +oxli_raise_py_error
         void extract_unique_paths(string, uint32_t, float, vector[string])
         void calc_connected_graph_size(CpKmer, uint64_t&, KmerSet&,
-                                       const uint64_t, bool) 
+                                       const uint64_t, bool)
         uint32_t kmer_degree(HashIntoType, HashIntoType)
         uint32_t kmer_degree(const char *)
         void find_high_degree_nodes(const char *, set[HashIntoType] &) const

diff --git a/khmer/_oxli/graphs.pyx b/khmer/_oxli/graphs.pyx
@@ -128,7 +128,7 @@ cdef class Hashtable:
             return deref(self._ht_this).get_count(<HashIntoType> kmer)
         else:
             self._kmer_type_error(kmer)
-            
+
 
     def ksize(self):
         """k-mer size"""
@@ -211,18 +211,6 @@ cdef class Hashtable:
                                                                    max_count))
         return posns
 
-    def consume_seqfile_with_reads_parser(self, read_parser):
-        """Count all k-mers from read_parser."""
-        cdef unsigned long long n_consumed = 0
-        cdef unsigned int total_reads = 0
-
-        cdef CPyReadParser_Object* parser = <CPyReadParser_Object*>read_parser
-
-        deref(self._ht_this).consume_seqfile[CpFastxReader](parser.parser,
-                                                           total_reads,
-                                                           n_consumed)
-        return total_reads, n_consumed
-
     def consume_seqfile(self, file_name):
         """Count all k-mers from file_name."""
         cdef unsigned long long n_consumed = 0
@@ -273,6 +261,53 @@ cdef class Hashtable:
                                                                      n_consumed)
         return total_reads, n_consumed
 
+    def consume_seqfile_with_parser(self, object read_parser):
+        """Count all k-mers from read_parser."""
+        cdef unsigned long long n_consumed = 0
+        cdef unsigned int total_reads = 0
+        cdef CPyReadParser_Object* parser = <CPyReadParser_Object*>read_parser
+
+        deref(self._ht_this).consume_seqfile[CpFastxReader](
+            parser.parser, total_reads, n_consumed
+        )
+        return total_reads, n_consumed
+
+    def consume_seqfile_with_mask_with_parser(self, object read_parser,
+                                              Hashtable mask, int threshold=0):
+        cdef unsigned long long n_consumed = 0
+        cdef unsigned int total_reads = 0
+        cdef CPyReadParser_Object* parser = <CPyReadParser_Object*>read_parser
+        cdef CpHashtable * cmask = mask._ht_this.get()
+        deref(self._ht_this).consume_seqfile_with_mask[CpFastxReader](
+            parser.parser, cmask, threshold, total_reads, n_consumed
+        )
+        return total_reads, n_consumed
+
+    def consume_seqfile_banding_with_parser(self, object read_parser, num_bands,
+                                            band):
+        """Count k-mers from read_parser with banding (num_bands, band)."""
+        cdef unsigned long long n_consumed = 0
+        cdef unsigned int total_reads = 0
+        cdef CPyReadParser_Object* parser = <CPyReadParser_Object*>read_parser
+        deref(self._ht_this).consume_seqfile_banding[CpFastxReader](
+            parser.parser, num_bands, band, total_reads, n_consumed
+        )
+        return total_reads, n_consumed
+
+    def consume_seqfile_banding_with_mask_with_parser(self, object read_parser,
+                                                      num_bands, band,
+                                                      Hashtable mask,
+                                                      int threshold=0):
+        cdef unsigned long long n_consumed = 0
+        cdef unsigned int total_reads = 0
+        cdef CPyReadParser_Object* parser = <CPyReadParser_Object*>read_parser
+        cdef CpHashtable * cmask = mask._ht_this.get()
+        deref(self._ht_this).consume_seqfile_banding_with_mask[CpFastxReader](
+            parser.parser, num_bands, band, cmask, threshold, total_reads,
+            n_consumed
+        )
+        return total_reads, n_consumed
+
     def abundance_distribution(self, file_name, Hashtable tracking):
         """Calculate the k-mer abundance distribution over reads in file_name."""
         cdef FastxParserPtr parser = get_parser[CpFastxReader](_bstring(file_name))
@@ -284,7 +319,7 @@ cdef class Hashtable:
             abunds.append(x[i])
         return abunds
 
-    def abundance_distribution_with_reads_parser(self, object read_parser, Hashtable tracking):
+    def abundance_distribution_with_parser(self, object read_parser, Hashtable tracking):
         """Calculate the k-mer abundance distribution over reads."""
 
         cdef CpHashtable * cptracking = tracking._ht_this.get()
@@ -486,12 +521,12 @@ cdef class Hashgraph(Hashtable):
         list; used in graph contraction.'''
         cdef HashSet hdns = HashSet(self.ksize())
         _sequence = self._valid_sequence(sequence)
-        deref(self._hg_this).find_high_degree_nodes(_sequence, 
+        deref(self._hg_this).find_high_degree_nodes(_sequence,
                                                     hdns.hs)
         return hdns
 
 
-    def traverse_linear_path(self, object kmer, HashSet hdns, 
+    def traverse_linear_path(self, object kmer, HashSet hdns,
                              Nodegraph stop_filter=None):
         '''Traverse the path through the graph starting with the given
         k-mer and avoiding high-degree nodes, finding (and returning)
@@ -539,7 +574,7 @@ cdef class Hashgraph(Hashtable):
         cdef HashSet hs = HashSet(self.ksize())
         deref(self._hg_this).get_tags_for_sequence(_sequence, hs.hs)
         return hs
-            
+
     def find_all_tags_list(self, object kmer):
         '''Find all tags within range of the given k-mer, return as list'''
         cdef CpKmer _kmer = self._build_kmer(kmer)
@@ -548,7 +583,7 @@ cdef class Hashgraph(Hashtable):
         cdef shared_ptr[CpHashgraph] this = self._hg_this
 
         with nogil:
-            deref(deref(self._hg_this).partition).find_all_tags(_kmer, deref(tags), 
+            deref(deref(self._hg_this).partition).find_all_tags(_kmer, deref(tags),
                                                                 deref(this).all_tags)
 
         return result
@@ -564,16 +599,16 @@ cdef class Hashgraph(Hashtable):
                                                                    total_reads,
                                                                    n_consumed)
         return total_reads, n_consumed
-    
+
     def print_tagset(self, str filename):
         '''Print out all of the tags.'''
         deref(self._hg_this).print_tagset(_bstring(filename))
-    
+
     def add_tag(self, object kmer):
         '''Add a k-mer to the tagset.'''
         cdef HashIntoType _kmer = self.sanitize_hash_kmer(kmer)
         deref(self._hg_this).add_tag(_kmer)
-    
+
     def get_tagset(self):
         '''Get all tagged k-mers as DNA strings.'''
         cdef HashIntoType st
@@ -591,29 +626,29 @@ cdef class Hashgraph(Hashtable):
     def load_tagset(self, str filename, clear_tags=True):
         '''Load tags from a file.'''
         deref(self._hg_this).load_tagset(_bstring(filename), clear_tags)
-        
+
     def save_tagset(self, str filename):
         '''Save tags to a file.'''
         deref(self._hg_this).save_tagset(_bstring(filename))
-    
+
     @property
     def n_tags(self):
         '''Return the count of all tags.'''
         return deref(self._hg_this).n_tags()
-    
+
     def divide_tags_into_subsets(self, int subset_size=0):
         '''Divide tags equally up into subsets of given size.'''
         cdef set[HashIntoType] divvy
         deref(self._hg_this).divide_tags_into_subsets(subset_size, divvy)
         cdef HashSet hs = HashSet(self.ksize())
         hs.hs = divvy
         return hs
-    
+
     @property
     def tag_density(self):
         '''Get the tagging density.'''
         return deref(self._hg_this)._get_tag_density()
-    
+
     @tag_density.setter
     def tag_density(self, int density):
         '''Set the tagging density.'''
@@ -630,7 +665,7 @@ cdef class Hashgraph(Hashtable):
         cdef HashIntoType end = self.sanitize_hash_kmer(end_kmer)
         cdef bool cbreak = break_on_stoptags
         cdef bool cstop = stop_big_traversals
-        
+
         with nogil:
             deref(subset_ptr).do_partition(start, end, cbreak, cstop)
 
@@ -650,15 +685,15 @@ cdef class Hashgraph(Hashtable):
 
         return ppi
 
-    
+
     def assign_partition_id(self, PrePartitionInfo ppi):
         '''Assign a partition ID to a given tag.'''
         cdef cp_pre_partition_info * cppi = ppi._this.get()
         cdef PartitionID pi
         pi = deref(deref(self._hg_this).partition).assign_partition_id(deref(cppi).kmer,
                                                                        deref(cppi).tagged_kmers)
         return pi
-    
+
     def output_partitions(self, str filename, str output, bool
                                 output_unassigned=False):
         '''Write out sequences in given filename to another file, annotating '''
@@ -668,20 +703,20 @@ cdef class Hashgraph(Hashtable):
                                                     _bstring(output),
                                                     output_unassigned)
         return n_partitions
-    
+
     def load_partitionmap(self, str filename):
         '''Load a partitionmap for the master subset.'''
         deref(deref(self._hg_this).partition).load_partitionmap(_bstring(filename))
 
     def save_partitionmap(self, str filename):
         '''Save a partitionmap for the master subset.'''
         deref(deref(self._hg_this).partition).save_partitionmap(_bstring(filename))
-    
+
     def _validate_partitionmap(self):
         '''Run internal validation checks.'''
         deref(deref(self._hg_this).partition)._validate_pmap()
-    
-    def consume_seqfile_and_tag_with_reads_parser(self, object read_parser):
+
+    def consume_seqfile_and_tag_with_parser(self, object read_parser):
         '''Count all k-mers using the given reads parser'''
         cdef unsigned long long n_consumed = 0
         cdef unsigned int total_reads = 0
@@ -693,7 +728,7 @@ cdef class Hashgraph(Hashtable):
                                                             total_reads,
                                                             n_consumed)
         return total_reads, n_consumed
-    
+
     def consume_partitioned_fasta(self, filename):
         '''Count all k-mers in a given file'''
         cdef unsigned long long n_consumed = 0
@@ -703,19 +738,19 @@ cdef class Hashgraph(Hashtable):
                                                                      total_reads,
                                                                      n_consumed)
         return total_reads, n_consumed
-    
+
     def merge_subset(self, SubsetPartition subset):
         '''Merge the given subset into this one.'''
         deref(deref(self._hg_this).partition).merge(subset._this.get())
 
     def merge_subset_from_disk(self, str filename):
         '''Merge the given subset (filename) into this one.'''
         deref(deref(self._hg_this).partition).merge_from_disk(_bstring(filename))
-    
+
     def count_partitions(self):
         '''Count the number of partitions in the master partitionmap.'''
         return self.partition.count_partitions()
-    
+
     def set_partition_id(self, object kmer, PartitionID pid):
         '''Set the partition ID for this tag.'''
         cdef string start = self.sanitize_kmer(kmer)
@@ -729,7 +764,7 @@ cdef class Hashgraph(Hashtable):
         '''Get the partition ID of this tag.'''
         cdef string _kmer = self.sanitize_kmer(kmer)
         return deref(deref(self._hg_this).partition).get_partition_id(_kmer)
-    
+
     def repartition_largest_partition(self, Countgraph counts not None,
                                             unsigned int distance,
                                             unsigned int threshold,
@@ -754,15 +789,15 @@ cdef class Hashgraph(Hashtable):
     def load_stop_tags(self, object filename, clear_tags=False):
         '''Load the set of stop tags.'''
         deref(self._hg_this).load_stop_tags(_bstring(filename), clear_tags)
-        
+
     def save_stop_tags(self, object filename):
         '''Save the set of stop tags.'''
         deref(self._hg_this).save_stop_tags(_bstring(filename))
 
     def print_stop_tags(self, filename):
         '''Print out the set of stop tags.'''
         deref(self._hg_this).print_stop_tags(_bstring(filename))
-    
+
     def trim_on_stoptags(self, str sequence):
         '''Trim the reads on the given stop tags.'''
         cdef size_t trim_at
@@ -776,7 +811,7 @@ cdef class Hashgraph(Hashtable):
         '''Add this k-mer as a stop tag.'''
         cdef HashIntoType _kmer = self.sanitize_hash_kmer(kmer)
         deref(self._hg_this).add_stop_tag(_kmer)
-    
+
     def get_stop_tags(self):
         '''Return a DNA list of all of the stop tags.'''
         cdef HashIntoType st

diff --git a/oxli/functions.py b/oxli/functions.py
@@ -50,9 +50,9 @@ def build_graph(ifilenames, graph, num_threads=1, tags=False):
     - tags: should there be tags
     """
     if tags:
-        eat = graph.consume_seqfile_and_tag_with_reads_parser
+        eat = graph.consume_seqfile_and_tag_with_parser
     else:
-        eat = graph.consume_seqfile_with_reads_parser
+        eat = graph.consume_seqfile_with_parser
 
     for _, ifile in enumerate(ifilenames):
         rparser = khmer.ReadParser(ifile)

diff --git a/sandbox/count-kmers-single.py b/sandbox/count-kmers-single.py
@@ -103,7 +103,7 @@ def main():
     for _ in range(args.threads):
         thread = \
             threading.Thread(
-                target=countgraph.consume_seqfile_with_reads_parser,
+                target=countgraph.consume_seqfile_with_parser,
                 args=(rparser, )
             )
         threads.append(thread)

diff --git a/sandbox/optimal_args_hashbits.py b/sandbox/optimal_args_hashbits.py
@@ -81,7 +81,7 @@ def main():
           file=sys.stderr)
 
     htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
-    target_method = htable.consume_seqfile_with_reads_parser
+    target_method = htable.consume_seqfile_with_parser
 
     for _, filename in enumerate(filenames):
         rparser = khmer.ReadParser(filename)

diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
@@ -148,7 +148,7 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
     for _ in range(args.threads):
         thread = \
             threading.Thread(
-                target=countgraph.consume_seqfile_with_reads_parser,
+                target=countgraph.consume_seqfile_with_parser,
                 args=(rparser, )
             )
         threads.append(thread)
@@ -163,7 +163,7 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
     abundance_lists = []
 
     def __do_abundance_dist__(read_parser):
-        abundances = countgraph.abundance_distribution_with_reads_parser(
+        abundances = countgraph.abundance_distribution_with_parser(
             read_parser, tracking)
         abundance_lists.append(abundances)
 

diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
@@ -141,7 +141,7 @@ def main():
     for _ in range(args.threads):
         cur_thread = \
             threading.Thread(
-                target=graph.consume_seqfile_with_reads_parser,
+                target=graph.consume_seqfile_with_parser,
                 args=(rparser, )
             )
         threads.append(cur_thread)