From 3bf833fb9b71a7c13ab2c3b20899fff81bf3667a Mon Sep 17 00:00:00 2001 From: skchronicles Date: Tue, 25 Oct 2022 19:08:14 -0400 Subject: [PATCH] Adding new option for shared resources, i.e. FastQ Screen and Kraken2 DBs --- rna-seek | 105 +++++++++++++++-- workflow/rules/build.smk | 247 +++++++++++++++++++++++++++++++++++---- 2 files changed, 316 insertions(+), 36 deletions(-) diff --git a/rna-seek b/rna-seek index 00a55ad..523e02f 100755 --- a/rna-seek +++ b/rna-seek @@ -21,7 +21,7 @@ import argparse # potential python3 3rd party package, added in python/3.5 # Pipeline Metadata and globals __author__ = 'Skyler Kuhn' -__version__ = 'v1.6.0' +__version__ = 'v1.7.0' __email__ = 'kuhnsa@nih.gov' __home__ = os.path.dirname(os.path.abspath(__file__)) _name = os.path.basename(sys.argv[0]) @@ -613,12 +613,17 @@ def setup(sub_args, ifiles, repo_path, output_path): config['options']['star_2_pass_basic'] = sub_args.star_2_pass_basic config['options']['small_rna'] = sub_args.small_rna config['options']['tmp_dir'] = sub_args.tmp_dir + config['options']['shared_resources'] = sub_args.shared_resources # Get latest git commit hash git_hash = get_repo_git_commit_hash(repo_path) config['project']['git_commit_hash'] = git_hash - + if sub_args.shared_resources: + # Update paths to shared resources directory + config['bin']['rnaseq']['tool_parameters']['FASTQ_SCREEN_CONFIG'] = os.path.join(sub_args.shared_resources, "fastq_screen_db", "fastq_screen_p1.conf") + config['bin']['rnaseq']['tool_parameters']['FASTQ_SCREEN_CONFIG2'] = os.path.join(sub_args.shared_resources, "fastq_screen_db", "fastq_screen_p2.conf") + config['bin']['rnaseq']['tool_parameters']['KRAKENBACDB'] = os.path.join(sub_args.shared_resources, "20180907_standard_kraken2") # Save config to output directory print("\nGenerating config file in '{}'... ".format(os.path.join(output_path, 'config.json')), end = "") @@ -981,6 +986,7 @@ def _configure(sub_args, filename, git_repo): fh.write('BUILD_HOME: "{}"\n'.format(git_repo)) fh.write('SMALL_GENOME: "{}"\n'.format(sub_args.small_genome)) fh.write('TMP_DIR: "{}"\n'.format(sub_args.tmp_dir)) + fh.write('SHARED_RESOURCES: "{}"\n'.format(sub_args.shared_resources)) fh.write('READLENGTHS:\n') read_lengths = ['50', '75', '100', '125', '150'] for rl in read_lengths: @@ -1032,28 +1038,51 @@ def build(sub_args): @param sub_args : Parsed arguments for unlock sub-command """ - # Get PATH to RNA-seek git repository for copying over pipeline resources + # Get PATH to RNA-seek git repository + # for copying over pipeline resources git_repo = os.path.dirname(os.path.abspath(__file__)) # Build Output directory output_path = os.path.abspath(sub_args.output) - # Configure build output directory: initialize, copy resources, generate config file - additional_bind_paths = configure_build(sub_args = sub_args, git_repo = git_repo, output_path = output_path) + # Configure build output directory, + # initialize, copy resources, and + # generate config file + additional_bind_paths = configure_build( + sub_args = sub_args, + git_repo = git_repo, + output_path = output_path + ) + + # Add any additional bindpaths + if sub_args.shared_resources: + # Check if shared resource path + # needs to be added to bindlist + if not sub_args.shared_resources in additional_bind_paths: + additional_bind_paths.append(sub_args.shared_resources) + additional_bind_paths = ','.join(additional_bind_paths) # Dryrun pipeline if sub_args.dry_run: - dryrun_output = dryrun(outdir = output_path, config = os.path.join('config', 'build.yml'), snakefile = os.path.join('workflow', 'rules', 'build.smk')) - # python3 returns byte-string representation - print("\nDry-running RNA-seek Reference building pipeline:\n{}".format(dryrun_output.decode("utf-8"))) - sys.exit(0) + dryrun_output = dryrun( + outdir = output_path, + config = os.path.join('config', 'build.yml'), + snakefile = os.path.join('workflow', 'rules', 'build.smk') + ) + print( + "\nDry-running RNA-seek Reference building pipeline:\n{}".format( + dryrun_output.decode("utf-8") + ) + ) + sys.exit(0) + # Run RNA-seek reference building pipeline masterjob = orchestrate( mode = 'slurm', outdir = output_path, - additional_bind_paths = [], + additional_bind_paths = additional_bind_paths, alt_cache = sub_args.singularity_cache, submission_script = 'builder', masterjob = 'pl:build', @@ -1216,6 +1245,7 @@ def parsed_arguments(name, description): $ {0} run [--help] \\ [--small-rna] [--star-2-pass-basic] \\ [--dry-run] [--mode {{slurm, local}}] \\ + [--shared-resources SHARED_RESOURCES] \\ [--singularity-cache SINGULARITY_CACHE] \\ [--sif-cache SIF_CACHE] \\ [--tmp-dir TMP_DIR] \\ @@ -1312,6 +1342,23 @@ def parsed_arguments(name, description): recommended running the pipeline in this mode as it will be significantly faster. Example: --mode slurm + + --shared-resources Local path to shared resources. The pipeline uses a set + of shared reference files that can be re-used across ref- + erence genomes. These currently include reference files + for kraken and FQScreen. These reference files can be + downloaded with the build sub command's --shared-resources + option. These files only need to be downloaded once. If + you are running the pipeline on Biowulf, you do NOT need + to download these reference files! They already exist on + the filesystem in a location that anyone can acceess. If + you are running the pipeline on another cluster or target + system, you will need to download the shared resources + with the build sub command, and you will need to provide + this option to the run sub command every time. Please + provide the same path that was provided to the build sub + command's --shared-resources option. + Example: --shared-resources /data/shared/rna-seek --singularity-cache SINGULARITY_CACHE Overrides the $SINGULARITY_CACHEDIR variable. Images @@ -1480,6 +1527,17 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # Path to previously downloaded shared + # reference files, see build option for + # more information + subparser_run.add_argument( + '--shared-resources', + type = lambda option: permissions(parser, os.path.abspath(os.path.expanduser(option)), os.R_OK), + required = False, + default = None, + help = argparse.SUPPRESS + ) + # Singularity cache directory, # default uses output directory subparser_run.add_argument( @@ -1535,8 +1593,8 @@ def parsed_arguments(name, description): {1}{2}Synopsis:{4} $ {0} build [--help] \\ - [--dry-run] [--small-genome] \\ - [--singularity-cache SINGULARITY_CACHE] \\ + [--shared-resources SHARED_RESOURCES] [--small-genome] \\ + [--dry-run] [--singularity-cache SINGULARITY_CACHE] \\ [--sif-cache SIF_CACHE] [--tmp-dir TMP_DIR] \\ --ref-fa REF_FA \\ --ref-name REF_NAME \\ @@ -1588,6 +1646,20 @@ def parsed_arguments(name, description): Example: --output /data/$USER/refs/GRCh38_41 {1}{2}Build options:{4} + --shared-resources Path to download shared resources. The pipeline uses a + set of shared reference files that can be re-used across + reference genomes. These currently include reference files + for kraken and FQScreen. With that being said, these files + can be downloaded once in a shared or common location. If + you are running the pipeline on Biowulf, you do NOT need + to download these reference files. They already exist in + an accessible location on the filesystem. If you're setting + up the pipeline on a new cluster or target system, you will + need to provide this option at least one time. The path + provided to this option can be provided to the rna-seek + run sub command via the --shared-resources option. + Example: --shared-resources /data/shared/rna-seek + --small-genome Builds a small genome index. For small genomes, it is recommeded running STAR with --genomeSAindexNbases value scaled down. This option runs the build pipeline in a @@ -1740,6 +1812,15 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # Path to download shared refs + subparser_build.add_argument( + '--shared-resources', + type = lambda option: os.path.abspath(os.path.expanduser(option)), + required = False, + default = None, + help = argparse.SUPPRESS + ) + # Small Genome build option for STAR subparser_build.add_argument( '--small-genome', diff --git a/workflow/rules/build.smk b/workflow/rules/build.smk index 5f1778a..b0a73bb 100644 --- a/workflow/rules/build.smk +++ b/workflow/rules/build.smk @@ -1,5 +1,67 @@ from os.path import join, basename +# Helper Functions +def allocated(resource, rule, lookup, default="__default__"): + """Pulls resource information for a given rule. If a rule does not have any information + for a given resource type, then it will pull from the default. Information is pulled from + definitions in the cluster.json (which is used a job submission). This ensures that any + resources used at runtime mirror the resources that were allocated. + :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) + :param rule : rule to lookup its information + :param lookup : Lookup containing allocation information (i.e. cluster.json) + :param default : default information to use if rule information cannot be found + :return allocation : + allocation information for a given resource type for a given rule + """ + + try: + # Try to get allocation information + # for a given rule + allocation = lookup[rule][resource] + except KeyError: + # Use default allocation information + allocation = lookup[default][resource] + + return allocation + + +def provided(samplelist, condition): + """ + Determines if optional rules should run. If an empty list is provided to rule all, + snakemake will not try to generate that set of target files. If a given condition + is not met (i.e. False) then it will not try to run that rule. + """ + if not str_bool(condition): + # If condition is False, + # returns an empty list + # to prevent rule from + # running + samplelist = [] + return samplelist + + +def str_bool(s): + """Converts a string to boolean. It is dangerous to try to + typecast a string into a boolean value using the built-in + `bool()` function. This function avoids any issues that can + arise when using `bool()`. + Example: + boolean('True') returns True + boolean('False') returns False + boolean('asdas') raises TypeError + """ + if not s: + return False + val = s.lower() + if val in ['true', '1', 'y', 'yes']: + return True + elif val in ['false', '0', 'n', 'no', '', 'none']: + return False + else: + # Provided a string or path + return s + + # Global Workflow variables configfile:join("config","build.yml") GENOME=config["GENOME"].strip().replace(' ', '') @@ -26,29 +88,10 @@ try: except KeyError: SMALL_GENOME="False" - -def allocated(resource, rule, lookup, default="__default__"): - """Pulls resource information for a given rule. If a rule does not have any information - for a given resource type, then it will pull from the default. Information is pulled from - definitions in the cluster.json (which is used a job submission). This ensures that any - resources used at runtime mirror the resources that were allocated. - :param resource : resource type to look in cluster.json (i.e. threads, mem, time, gres) - :param rule : rule to lookup its information - :param lookup : Lookup containing allocation information (i.e. cluster.json) - :param default : default information to use if rule information cannot be found - :return allocation : - allocation information for a given resource type for a given rule - """ - - try: - # Try to get allocation information - # for a given rule - allocation = lookup[rule][resource] - except KeyError: - # Use default allocation information - allocation = lookup[default][resource] - - return allocation +try: + SHARED_PATH=config["SHARED_RESOURCES"] +except KeyError: + SHARED_PATH="None" rule all: @@ -65,7 +108,31 @@ rule all: "karyoplot_gene_coordinates.txt", "qualimap_info.txt", "karyobeds/karyobed.bed", - "transcripts.protein_coding_only.bed12" + "transcripts.protein_coding_only.bed12", + # FastQ Screen P1 Reference files, + # conditional runs with --shared-resources option + provided(expand(join(SHARED_PATH, "fastq_screen_db", "hg19", "hg19.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "mm9", "mm9.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Virus", "virus.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "hg19", "hg19.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "mm9", "mm9.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Virus", "virus.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Bacteria", "bacteria.{ext}.bt2l"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Fungi", "fungi.{ext}.bt2l"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Bacteria", "bacteria.rev.{ext}.bt2l"), ext=["1", "2"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "Fungi", "fungi.rev.{ext}.bt2l"), ext=["1", "2"]), SHARED_PATH), + # FastQ Screen P2 Reference files, + # conditional runs with --shared-resources option + provided(expand(join(SHARED_PATH, "fastq_screen_db", "UniVec_vectors", "UniVec_vectors.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "rRNA", "rRNA.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "UniVec_vectors", "UniVec_vectors.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH), + provided(expand(join(SHARED_PATH, "fastq_screen_db", "rRNA", "rRNA.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH), + # FastQ Screeen confs, + # conditional runs with --shared-resources option + provided(expand(join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p{ext}.conf"), ext=["1", "2"]), SHARED_PATH), + # Kraken2 Database, + # conditional runs with --shared-resources option + provided(expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]), SHARED_PATH), rule rsem: @@ -187,6 +254,7 @@ rule star_rl: --outTmpDir ${{tmp}}/tmp_{wildcards.readlength} """ + if SMALL_GENOME == "True": # Build a index that is optimized for # small genomes. For small genomes, the @@ -302,6 +370,7 @@ else: --outTmpDir ${{tmp}}/tmp_genome """ + rule rRNA_list: """ Builds ribosomal RNA list reference file for picard CollectRnaMetrics to @@ -453,6 +522,135 @@ rule qualimapinfo: """ +rule fqscreen_db1: + """ + Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle. + Currently, there are fastq screen indices for the following organisms: hg19, mm9, + bateria, fungi, virus, univec vector sequences, and rRNA. + + @Input: + Genomic FASTA file + Annotation file in GTF format + @Output: + Downloaded bowtie2 indices for hg19, mm9, virus, univec, and rRNA + """ + input: + fa=REFFA, + gtf=GTFFILE, + output: + fwd=expand( + join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.{ext}.bt2"), + ext=["1", "2", "3", "4"] + ), + rev=expand( + join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.rev.{ext}.bt2"), + ext=["1", "2"] + ), + params: + rname='bl:fqscreen_db1', + tarname=join("fastq_screen_db.{genome}.tar"), + tarfile=join(SHARED_PATH, "fastq_screen_db.{genome}.tar"), + outdir=SHARED_PATH, + container: config['images']['build_rnaseq'] + shell: """ + wget https://hpc.nih.gov/~OpenOmics/common/{params.tarname} -O {params.tarfile} + tar vxf {params.tarfile} -C {params.outdir} && rm {params.tarfile} + """ + + +rule fqscreen_db2: + """ + Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle. + Currently, there are fastq screen indices for the following organisms: hg19, mm9, + bateria, fungi, virus, univec vector sequences, and rRNA. + + @Input: + Genomic FASTA file + Annotation file in GTF format + @Output: + Downloaded bowtie2 indices for bateria, fungi + """ + input: + fa=REFFA, + gtf=GTFFILE, + output: + fwd=expand( + join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.{ext}.bt2l"), + ext=["1", "2", "3", "4"] + ), + rev=expand( + join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.rev.{ext}.bt2l"), + ext=["1", "2"] + ), + params: + rname='bl:fqscreen_db2', + tarname=join("fastq_screen_db.{genome}.tar"), + tarfile=join(SHARED_PATH, "fastq_screen_db.{genome}.tar"), + outdir=SHARED_PATH, + container: config['images']['build_rnaseq'] + shell: """ + wget https://hpc.nih.gov/~OpenOmics/common/{params.tarname} -O {params.tarfile} + tar vxf {params.tarfile} -C {params.outdir} && rm {params.tarfile} + """ + + +rule fqscreen_conf: + """ + Downloads fastq screen confs from the OpenOmics public resource bundle. + @Input: + Genomic FASTA file + Annotation file in GTF format + @Output: + Downloaded bowtie2 indices for bateria, fungi + """ + input: + fa=REFFA, + gtf=GTFFILE, + output: + conf1=join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p1.conf"), + conf2=join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p2.conf"), + params: + rname='bl:fqscreen_conf', + conf1="fastq_screen_p1.conf", + conf2="fastq_screen_p2.conf", + new=join(SHARED_PATH).rstrip('/'), + outdir=join(SHARED_PATH, "fastq_screen_db") + container: config['images']['build_rnaseq'] + shell: """ + mkdir -p "{params.outdir}" + wget https://hpc.nih.gov/~OpenOmics/common/{params.conf1} -O {output.conf1} + wget https://hpc.nih.gov/~OpenOmics/common/{params.conf2} -O {output.conf2} + sed -i 's@/data/OpenOmics/references/common@{params.new}@g' {output.conf1} + sed -i 's@/data/OpenOmics/references/common@{params.new}@g' {output.conf2} + """ + + +rule kraken2_db: + """ + Downloads kraken2 database from the OpenOmics public resource bundle. + @Input: + Genomic FASTA file + Annotation file in GTF format + @Output: + Downloaded kraken2 db for contamination screening + """ + input: + fa=REFFA, + gtf=GTFFILE, + output: + expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]), + params: + rname='bl:kraken2_db', + outfh=join(SHARED_PATH, "20180907_standard_kraken2.tar"), + tarfile="20180907_standard_kraken2.tar", + outdir=SHARED_PATH, + container: config['images']['build_rnaseq'] + shell: """ + wget https://hpc.nih.gov/~OpenOmics/common/{params.tarfile} -O {params.outfh} + tar vxf {params.outfh} -C {params.outdir} && rm {params.outfh} + """ + + rule jsonmaker: """ Builds reference genome reference JSON file. This is a config file for the @@ -527,3 +725,4 @@ rule jsonmaker: with open(output.json, 'w') as fp: json.dump(refdict, fp, indent=4) +