From c6a5e8b0eabb5705b2dafbc2f612a2b7693de12d Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 31 Aug 2017 09:19:31 -0700 Subject: [PATCH 1/3] First step toward banding in the CLI --- khmer/khmer_args.py | 9 +++++++-- scripts/abundance-dist-single.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index c072984f4b..43c229419a 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -52,6 +52,8 @@ import screed import khmer from khmer import extract_countgraph_info +from khmer import Nodegraph, Countgraph, SmallCountgraph +from khmer import Nodetable, Counttable, SmallCounttable from khmer import __version__ from .utils import print_error from .khmer_logger import log_info, log_warn, configure_logging @@ -562,14 +564,17 @@ def create_countgraph(args, ksize=None, multiplier=1.0, fp_rate=0.1): print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n") sys.exit(1) + dotable = hasattr(args, 'hash_function') and args.hash_function == 'murmur' if args.small_count: tablesize = calculate_graphsize(args, 'smallcountgraph', multiplier=multiplier) - return khmer.SmallCountgraph(ksize, tablesize, args.n_tables) + constructor = SmallCounttable if dotable else SmallCountgraph + return constructor(ksize, tablesize, args.n_tables) else: tablesize = calculate_graphsize(args, 'countgraph', multiplier=multiplier) - cg = khmer.Countgraph(ksize, tablesize, args.n_tables) + constructor = Counttable if dotable else Countgraph + cg = constructor(ksize, tablesize, args.n_tables) if hasattr(args, 'bigcount'): cg.set_use_bigcount(args.bigcount) return cg diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 56278cbfa1..88b0aa95c5 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -85,6 +85,13 @@ def get_parser(): 'output histogram file. The columns are: (1) k-mer ' 'abundance, (2) k-mer count, (3) cumulative count, ' '(4) fraction of total distinct k-mers.') + parser.add_argument('-H', '--hash-function', choices=['2bit', 'murmur'], + default='2bit', help='Indicate the hash function to ' + 'be used; "2bit" is faster, is reversible, and ' + 'supports subsequent graph operations, but is limited ' + 'to k <= 32; "murmur" supports arbitrarily large ' + 'values of k and is compatible with k-mer banding, ' + 'but is slower and does not support graph operations') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', help='Do not output zero-count bins') @@ -99,6 +106,15 @@ def get_parser(): "filename.") parser.add_argument('-f', '--force', default=False, action='store_true', help='Override sanity checks') + parser.add_argument('--banding', type=int, nargs=2, default=False, + metavar=('N', 'B'), help='process k-mers in "banding" ' + 'mode; specify two integers: a number of bands "N", ' + 'and a band index "B" such that B is between 1 and N ' + 'inclusive; as a result, only 1/N k-mers will be ' + 'processed, resulting in a roughly N-fold reduction ' + 'in memory consumption; for example, "--banding 50 9" ' + 'will split the k-mer space into 50 bands and only ' + 'process k-mers in band 9') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') return parser @@ -107,6 +123,9 @@ def get_parser(): def main(): # pylint: disable=too-many-locals,too-many-branches args = sanitize_help(get_parser()).parse_args() graph_type = 'smallcountgraph' if args.small_count else 'countgraph' + if args.banding and args.hash_function != 'murmur': + message = 'can only process in "banding" mode with "murmur" hash' + raise ValueError(message) configure_logging(args.quiet) report_on_config(args, graph_type) From 17b4db25779b91bd96011ed8e2ca125eba0e0241 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 31 Aug 2017 09:29:30 -0700 Subject: [PATCH 2/3] Add consume_seqfile call for banding mode --- scripts/abundance-dist-single.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 88b0aa95c5..490a48ff07 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -165,8 +165,15 @@ def main(): # pylint: disable=too-many-locals,too-many-branches log_info('consuming input, round 1 -- {input}', input=args.input_sequence_filename) for _ in range(args.threads): - thread = \ - threading.Thread( + if args.banding: + numbands = args.banding[0] + bandindex = args.banding[1] - 1 # CLI is 1-based, API is 0-based + thread = threading.Thread( + target=countgraph.consume_seqfile_banding_with_reads_parser, + args=(rparser, numbands, bandindex, ) + ) + else: + thread = threading.Thread( target=countgraph.consume_seqfile_with_reads_parser, args=(rparser, ) ) From ed6d02694feee1c3ff36e73ee05a05643b3827dc Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 13 Nov 2017 14:21:02 -0800 Subject: [PATCH 3/3] Support for CyclicCounttable --- khmer/khmer_args.py | 9 +++++++-- scripts/abundance-dist-single.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index bcf3f9beca..c71d9c52f8 100755 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -51,7 +51,7 @@ import khmer from khmer import extract_countgraph_info from khmer import Nodegraph, Countgraph, SmallCountgraph -from khmer import Nodetable, Counttable, SmallCounttable +from khmer import Nodetable, Counttable, CyclicCounttable, SmallCounttable from khmer import __version__ from .utils import print_error from .khmer_logger import log_info, log_warn, configure_logging @@ -563,6 +563,7 @@ def create_countgraph(args, ksize=None, multiplier=1.0, fp_rate=0.1): sys.exit(1) dotable = hasattr(args, 'hash_function') and args.hash_function == 'murmur' + docyclic = hasattr(args, 'hash_function') and args.hash_function == 'cyclic' if args.small_count: tablesize = calculate_graphsize(args, 'smallcountgraph', multiplier=multiplier) @@ -571,7 +572,11 @@ def create_countgraph(args, ksize=None, multiplier=1.0, fp_rate=0.1): else: tablesize = calculate_graphsize(args, 'countgraph', multiplier=multiplier) - constructor = Counttable if dotable else Countgraph + constructor = Countgraph + if dotable: + constructor = Counttable + elif docyclic: + constructor = CyclicCounttable cg = constructor(ksize, tablesize, args.n_tables) if hasattr(args, 'bigcount'): cg.set_use_bigcount(args.bigcount) diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index dd6c76d26b..34ba715eb0 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -84,13 +84,15 @@ def get_parser(): 'output histogram file. The columns are: (1) k-mer ' 'abundance, (2) k-mer count, (3) cumulative count, ' '(4) fraction of total distinct k-mers.') - parser.add_argument('-H', '--hash-function', choices=['2bit', 'murmur'], - default='2bit', help='Indicate the hash function to ' - 'be used; "2bit" is faster, is reversible, and ' - 'supports subsequent graph operations, but is limited ' - 'to k <= 32; "murmur" supports arbitrarily large ' - 'values of k and is compatible with k-mer banding, ' - 'but is slower and does not support graph operations') + parser.add_argument('-H', '--hash-function', choices=['2bit', 'murmur', + 'cyclic'], default='2bit', help='Indicate the hash ' + 'function to be used; "2bit" is faster, is reversible,' + ' and supports subsequent graph operations, but is ' + 'limited to k <= 32; "murmur" supports arbitrarily ' + 'large values of k and is compatible with k-mer ' + 'banding, but is slower and does not support graph ' + 'operations; "cyclic" is fast and supports banding, ' + 'but does not support graph operations') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', help='Do not output zero-count bins') @@ -122,7 +124,7 @@ def get_parser(): def main(): # pylint: disable=too-many-locals,too-many-branches args = sanitize_help(get_parser()).parse_args() graph_type = 'smallcountgraph' if args.small_count else 'countgraph' - if args.banding and args.hash_function != 'murmur': + if args.banding and args.hash_function == '2bit': message = 'can only process in "banding" mode with "murmur" hash' raise ValueError(message)