From 3bf833fb9b71a7c13ab2c3b20899fff81bf3667a Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Tue, 25 Oct 2022 19:08:14 -0400
Subject: [PATCH] Adding new option for shared resources, i.e. FastQ Screen and
 Kraken2 DBs

---
 rna-seek                 | 105 +++++++++++++++--
 workflow/rules/build.smk | 247 +++++++++++++++++++++++++++++++++++----
 2 files changed, 316 insertions(+), 36 deletions(-)

diff --git a/rna-seek b/rna-seek
index 00a55ad..523e02f 100755
--- a/rna-seek
+++ b/rna-seek
@@ -21,7 +21,7 @@ import argparse  # potential python3 3rd party package, added in python/3.5
 
 # Pipeline Metadata and globals 
 __author__   = 'Skyler Kuhn'
-__version__  = 'v1.6.0'
+__version__  = 'v1.7.0'
 __email__    = 'kuhnsa@nih.gov'
 __home__     =  os.path.dirname(os.path.abspath(__file__))
 _name        = os.path.basename(sys.argv[0])
@@ -613,12 +613,17 @@ def setup(sub_args, ifiles, repo_path, output_path):
     config['options']['star_2_pass_basic'] = sub_args.star_2_pass_basic
     config['options']['small_rna'] = sub_args.small_rna
     config['options']['tmp_dir'] = sub_args.tmp_dir
+    config['options']['shared_resources'] = sub_args.shared_resources
 
     # Get latest git commit hash
     git_hash = get_repo_git_commit_hash(repo_path)
     config['project']['git_commit_hash'] = git_hash
 
-
+    if sub_args.shared_resources:
+        # Update paths to shared resources directory
+        config['bin']['rnaseq']['tool_parameters']['FASTQ_SCREEN_CONFIG'] = os.path.join(sub_args.shared_resources, "fastq_screen_db", "fastq_screen_p1.conf")
+        config['bin']['rnaseq']['tool_parameters']['FASTQ_SCREEN_CONFIG2'] = os.path.join(sub_args.shared_resources, "fastq_screen_db", "fastq_screen_p2.conf")
+        config['bin']['rnaseq']['tool_parameters']['KRAKENBACDB'] = os.path.join(sub_args.shared_resources, "20180907_standard_kraken2")
 
     # Save config to output directory
     print("\nGenerating config file in '{}'... ".format(os.path.join(output_path, 'config.json')), end = "")
@@ -981,6 +986,7 @@ def _configure(sub_args, filename, git_repo):
         fh.write('BUILD_HOME: "{}"\n'.format(git_repo))
         fh.write('SMALL_GENOME: "{}"\n'.format(sub_args.small_genome))
         fh.write('TMP_DIR: "{}"\n'.format(sub_args.tmp_dir))
+        fh.write('SHARED_RESOURCES: "{}"\n'.format(sub_args.shared_resources))
         fh.write('READLENGTHS:\n')
         read_lengths = ['50', '75', '100', '125', '150']
         for rl in read_lengths:
@@ -1032,28 +1038,51 @@ def build(sub_args):
     @param sub_args <parser.parse_args() object>:
         Parsed arguments for unlock sub-command
     """
-    # Get PATH to RNA-seek git repository for copying over pipeline resources
+    # Get PATH to RNA-seek git repository 
+    # for copying over pipeline resources
     git_repo = os.path.dirname(os.path.abspath(__file__))
 
     # Build Output directory
     output_path = os.path.abspath(sub_args.output)
 
-    # Configure build output directory: initialize, copy resources, generate config file
-    additional_bind_paths = configure_build(sub_args = sub_args, git_repo = git_repo, output_path = output_path)
+    # Configure build output directory,
+    # initialize, copy resources, and 
+    # generate config file
+    additional_bind_paths = configure_build(
+        sub_args = sub_args, 
+        git_repo = git_repo, 
+        output_path = output_path
+    )
+    
+    # Add any additional bindpaths  
+    if sub_args.shared_resources:
+        # Check if shared resource path
+        # needs to be added to bindlist 
+        if not sub_args.shared_resources in additional_bind_paths:
+            additional_bind_paths.append(sub_args.shared_resources)
+    
     additional_bind_paths =  ','.join(additional_bind_paths)
 
     # Dryrun pipeline
     if sub_args.dry_run:
-        dryrun_output = dryrun(outdir = output_path, config = os.path.join('config', 'build.yml'), snakefile = os.path.join('workflow', 'rules', 'build.smk'))
-        # python3 returns byte-string representation
-        print("\nDry-running RNA-seek Reference building pipeline:\n{}".format(dryrun_output.decode("utf-8")))
-        sys.exit(0)
+        dryrun_output = dryrun(
+            outdir = output_path, 
+            config = os.path.join('config', 'build.yml'), 
+            snakefile = os.path.join('workflow', 'rules', 'build.smk')
+        )
 
+        print(
+            "\nDry-running RNA-seek Reference building pipeline:\n{}".format(
+                dryrun_output.decode("utf-8")
+            )
+        )
+        sys.exit(0)
+    
     # Run RNA-seek reference building pipeline
     masterjob = orchestrate(
         mode = 'slurm', 
         outdir = output_path,
-        additional_bind_paths = [], 
+        additional_bind_paths = additional_bind_paths, 
         alt_cache = sub_args.singularity_cache,
         submission_script = 'builder', 
         masterjob = 'pl:build',
@@ -1216,6 +1245,7 @@ def parsed_arguments(name, description):
           $ {0} run [--help] \\
                               [--small-rna] [--star-2-pass-basic] \\
                               [--dry-run] [--mode {{slurm, local}}] \\
+                              [--shared-resources SHARED_RESOURCES] \\
                               [--singularity-cache SINGULARITY_CACHE] \\
                               [--sif-cache SIF_CACHE] \\
                               [--tmp-dir TMP_DIR] \\
@@ -1312,6 +1342,23 @@ def parsed_arguments(name, description):
                                 recommended running the pipeline in this mode as it 
                                 will be significantly faster. 
                                   Example: --mode slurm
+
+          --shared-resources    Local path to shared resources. The pipeline uses a set
+                                of shared reference files that can be re-used across ref-
+                                erence genomes. These currently include reference files
+                                for kraken and FQScreen. These reference files can be 
+                                downloaded with the build sub command's --shared-resources 
+                                option. These files only need to be downloaded once. If 
+                                you are running the pipeline on Biowulf, you do NOT need 
+                                to download these reference files! They already exist on 
+                                the filesystem in a location that anyone can acceess. If 
+                                you are running the pipeline on another cluster or target 
+                                system, you will need to download the shared resources 
+                                with the build sub command, and you will need to provide 
+                                this option to the run sub command every time. Please
+                                provide the same path that was provided to the build sub
+                                command's --shared-resources option.
+                                  Example: --shared-resources /data/shared/rna-seek
     
           --singularity-cache SINGULARITY_CACHE
                                 Overrides the $SINGULARITY_CACHEDIR variable. Images
@@ -1480,6 +1527,17 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    # Path to previously downloaded shared 
+    # reference files, see build option for
+    # more information
+    subparser_run.add_argument(
+        '--shared-resources',
+        type = lambda option: permissions(parser, os.path.abspath(os.path.expanduser(option)), os.R_OK),
+        required = False,
+        default = None,
+        help = argparse.SUPPRESS
+    )
+
     # Singularity cache directory, 
     # default uses output directory
     subparser_run.add_argument(
@@ -1535,8 +1593,8 @@ def parsed_arguments(name, description):
 
         {1}{2}Synopsis:{4}
           $ {0} build [--help] \\
-                                [--dry-run] [--small-genome] \\
-                                [--singularity-cache SINGULARITY_CACHE] \\
+                                [--shared-resources SHARED_RESOURCES] [--small-genome] \\
+                                [--dry-run] [--singularity-cache SINGULARITY_CACHE] \\
                                 [--sif-cache SIF_CACHE] [--tmp-dir TMP_DIR] \\
                                 --ref-fa REF_FA \\
                                 --ref-name REF_NAME \\
@@ -1588,6 +1646,20 @@ def parsed_arguments(name, description):
                                 Example: --output /data/$USER/refs/GRCh38_41
         
         {1}{2}Build options:{4}
+          --shared-resources  Path to download shared resources. The pipeline uses a
+                              set of shared reference files that can be re-used across
+                              reference genomes. These currently include reference files
+                              for kraken and FQScreen. With that being said, these files
+                              can be downloaded once in a shared or common location. If 
+                              you are running the pipeline on Biowulf, you do NOT need 
+                              to download these reference files. They already exist in 
+                              an accessible location on the filesystem. If you're setting
+                              up the pipeline on a new cluster or target system, you will
+                              need to provide this option at least one time. The path 
+                              provided to this option can be provided to the rna-seek
+                              run sub command via the --shared-resources option.
+                                Example: --shared-resources /data/shared/rna-seek
+
           --small-genome      Builds a small genome index. For small genomes, it is
                               recommeded running STAR with --genomeSAindexNbases value
                               scaled down. This option runs the build pipeline in a 
@@ -1740,6 +1812,15 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
     
+    # Path to download shared refs
+    subparser_build.add_argument(
+        '--shared-resources',
+        type = lambda option: os.path.abspath(os.path.expanduser(option)),
+        required = False,
+        default = None,
+        help = argparse.SUPPRESS
+    )
+
     # Small Genome build option for STAR
     subparser_build.add_argument(
         '--small-genome',
diff --git a/workflow/rules/build.smk b/workflow/rules/build.smk
index 5f1778a..b0a73bb 100644
--- a/workflow/rules/build.smk
+++ b/workflow/rules/build.smk
@@ -1,5 +1,67 @@
 from os.path import join, basename
 
+# Helper Functions
+def allocated(resource, rule, lookup, default="__default__"):
+    """Pulls resource information for a given rule. If a rule does not have any information 
+    for a given resource type, then it will pull from the default. Information is pulled from
+    definitions in the cluster.json (which is used a job submission). This ensures that any 
+    resources used at runtime mirror the resources that were allocated.
+    :param resource <str>: resource type to look in cluster.json (i.e. threads, mem, time, gres)
+    :param rule <str>: rule to lookup its information
+    :param lookup <dict>: Lookup containing allocation information (i.e. cluster.json)
+    :param default <str>: default information to use if rule information cannot be found
+    :return allocation <str>: 
+        allocation information for a given resource type for a given rule
+    """
+
+    try: 
+        # Try to get allocation information
+        # for a given rule
+        allocation = lookup[rule][resource]
+    except KeyError:
+        # Use default allocation information
+        allocation = lookup[default][resource]
+    
+    return allocation
+
+
+def provided(samplelist, condition):
+    """
+    Determines if optional rules should run. If an empty list is provided to rule all,
+    snakemake will not try to generate that set of target files. If a given condition
+    is not met (i.e. False) then it will not try to run that rule.
+    """   
+    if not str_bool(condition):
+        # If condition is False, 
+        # returns an empty list 
+        # to prevent rule from 
+        # running
+        samplelist = []
+    return samplelist
+
+
+def str_bool(s):
+    """Converts a string to boolean. It is dangerous to try to
+    typecast a string into a boolean value using the built-in 
+    `bool()` function. This function avoids any issues that can
+    arise when using `bool()`. 
+    Example:
+      boolean('True') returns True
+      boolean('False') returns False
+      boolean('asdas') raises TypeError
+    """
+    if not s:
+        return False
+    val = s.lower()
+    if val in ['true', '1', 'y', 'yes']:
+        return True
+    elif val in ['false', '0', 'n', 'no', '', 'none']:
+        return False
+    else:
+        # Provided a string or path
+        return s
+
+
 # Global Workflow variables
 configfile:join("config","build.yml")
 GENOME=config["GENOME"].strip().replace(' ', '')
@@ -26,29 +88,10 @@ try:
 except KeyError:
     SMALL_GENOME="False"
 
-
-def allocated(resource, rule, lookup, default="__default__"):
-    """Pulls resource information for a given rule. If a rule does not have any information 
-    for a given resource type, then it will pull from the default. Information is pulled from
-    definitions in the cluster.json (which is used a job submission). This ensures that any 
-    resources used at runtime mirror the resources that were allocated.
-    :param resource <str>: resource type to look in cluster.json (i.e. threads, mem, time, gres)
-    :param rule <str>: rule to lookup its information
-    :param lookup <dict>: Lookup containing allocation information (i.e. cluster.json)
-    :param default <str>: default information to use if rule information cannot be found
-    :return allocation <str>: 
-        allocation information for a given resource type for a given rule
-    """
-
-    try: 
-        # Try to get allocation information
-        # for a given rule
-        allocation = lookup[rule][resource]
-    except KeyError:
-        # Use default allocation information
-        allocation = lookup[default][resource]
-    
-    return allocation
+try:
+    SHARED_PATH=config["SHARED_RESOURCES"]
+except KeyError:
+    SHARED_PATH="None"
 
 
 rule all:
@@ -65,7 +108,31 @@ rule all:
         "karyoplot_gene_coordinates.txt",
         "qualimap_info.txt",
         "karyobeds/karyobed.bed",
-        "transcripts.protein_coding_only.bed12"
+        "transcripts.protein_coding_only.bed12",
+        # FastQ Screen P1 Reference files,
+        # conditional runs with --shared-resources option
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "hg19", "hg19.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "mm9", "mm9.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Virus", "virus.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "hg19", "hg19.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "mm9", "mm9.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Virus", "virus.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Bacteria", "bacteria.{ext}.bt2l"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Fungi", "fungi.{ext}.bt2l"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Bacteria", "bacteria.rev.{ext}.bt2l"), ext=["1", "2"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "Fungi", "fungi.rev.{ext}.bt2l"), ext=["1", "2"]), SHARED_PATH),
+        # FastQ Screen P2 Reference files,
+        # conditional runs with --shared-resources option
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "UniVec_vectors", "UniVec_vectors.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "rRNA", "rRNA.{ext}.bt2"), ext=["1", "2", "3", "4"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "UniVec_vectors", "UniVec_vectors.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH),
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "rRNA", "rRNA.rev.{ext}.bt2"), ext=["1", "2"]), SHARED_PATH),
+        # FastQ Screeen confs,
+        # conditional runs with --shared-resources option
+        provided(expand(join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p{ext}.conf"), ext=["1", "2"]), SHARED_PATH),
+        # Kraken2 Database,
+        # conditional runs with --shared-resources option
+        provided(expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]), SHARED_PATH),
 
 
 rule rsem:
@@ -187,6 +254,7 @@ rule star_rl:
         --outTmpDir ${{tmp}}/tmp_{wildcards.readlength}
     """
 
+
 if SMALL_GENOME == "True":
     # Build a index that is optimized for 
     # small genomes. For small genomes, the 
@@ -302,6 +370,7 @@ else:
             --outTmpDir ${{tmp}}/tmp_genome
         """
 
+
 rule rRNA_list:
     """
     Builds ribosomal RNA list reference file for picard CollectRnaMetrics to
@@ -453,6 +522,135 @@ rule qualimapinfo:
     """
 
 
+rule fqscreen_db1:
+    """
+    Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle.
+    Currently, there are fastq screen indices for the following organisms: hg19, mm9, 
+    bateria, fungi, virus, univec vector sequences, and rRNA.
+
+    @Input:
+        Genomic FASTA file
+        Annotation file in GTF format
+    @Output:
+        Downloaded bowtie2 indices for hg19, mm9, virus, univec, and rRNA
+    """
+    input:
+        fa=REFFA,
+        gtf=GTFFILE,
+    output:
+        fwd=expand(
+            join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.{ext}.bt2"),
+            ext=["1", "2", "3", "4"]
+        ),
+        rev=expand(
+            join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.rev.{ext}.bt2"),
+            ext=["1", "2"]
+        ),
+    params:
+        rname='bl:fqscreen_db1',
+        tarname=join("fastq_screen_db.{genome}.tar"),
+        tarfile=join(SHARED_PATH, "fastq_screen_db.{genome}.tar"),
+        outdir=SHARED_PATH,
+    container: config['images']['build_rnaseq']
+    shell: """
+    wget https://hpc.nih.gov/~OpenOmics/common/{params.tarname} -O {params.tarfile}
+    tar vxf {params.tarfile} -C {params.outdir} && rm {params.tarfile}
+    """
+
+
+rule fqscreen_db2:
+    """
+    Downloads fastq screen bowtie2 databases from the OpenOmics public resource bundle.
+    Currently, there are fastq screen indices for the following organisms: hg19, mm9, 
+    bateria, fungi, virus, univec vector sequences, and rRNA.
+
+    @Input:
+        Genomic FASTA file
+        Annotation file in GTF format
+    @Output:
+        Downloaded bowtie2 indices for bateria, fungi
+    """
+    input:
+        fa=REFFA,
+        gtf=GTFFILE,
+    output:
+        fwd=expand(
+            join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.{ext}.bt2l"),
+            ext=["1", "2", "3", "4"]
+        ),
+        rev=expand(
+            join(SHARED_PATH, "fastq_screen_db", "{{genome}}", "{{ref}}.rev.{ext}.bt2l"),
+            ext=["1", "2"]
+        ),
+    params:
+        rname='bl:fqscreen_db2',
+        tarname=join("fastq_screen_db.{genome}.tar"),
+        tarfile=join(SHARED_PATH, "fastq_screen_db.{genome}.tar"),
+        outdir=SHARED_PATH,
+    container: config['images']['build_rnaseq']
+    shell: """
+    wget https://hpc.nih.gov/~OpenOmics/common/{params.tarname} -O {params.tarfile}
+    tar vxf {params.tarfile} -C {params.outdir} && rm {params.tarfile}
+    """
+
+
+rule fqscreen_conf:
+    """
+    Downloads fastq screen confs from the OpenOmics public resource bundle.
+    @Input:
+        Genomic FASTA file
+        Annotation file in GTF format
+    @Output:
+        Downloaded bowtie2 indices for bateria, fungi
+    """
+    input:
+        fa=REFFA,
+        gtf=GTFFILE,
+    output:
+        conf1=join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p1.conf"),
+        conf2=join(SHARED_PATH, "fastq_screen_db", "fastq_screen_p2.conf"),
+    params:
+        rname='bl:fqscreen_conf',
+        conf1="fastq_screen_p1.conf",
+        conf2="fastq_screen_p2.conf",
+        new=join(SHARED_PATH).rstrip('/'),
+        outdir=join(SHARED_PATH, "fastq_screen_db")
+    container: config['images']['build_rnaseq']
+    shell: """
+    mkdir -p "{params.outdir}"
+    wget https://hpc.nih.gov/~OpenOmics/common/{params.conf1} -O {output.conf1}
+    wget https://hpc.nih.gov/~OpenOmics/common/{params.conf2} -O {output.conf2}
+    sed -i 's@/data/OpenOmics/references/common@{params.new}@g' {output.conf1}
+    sed -i 's@/data/OpenOmics/references/common@{params.new}@g' {output.conf2}
+    """
+
+
+rule kraken2_db:
+    """
+    Downloads kraken2 database from the OpenOmics public resource bundle.
+    @Input:
+        Genomic FASTA file
+        Annotation file in GTF format
+    @Output:
+        Downloaded kraken2 db for contamination screening
+    """
+    input:
+        fa=REFFA,
+        gtf=GTFFILE,
+    output:
+        expand(join(SHARED_PATH, "20180907_standard_kraken2", "{ref}.k2d"), ref=["hash", "opts", "taxo"]),
+    params:
+        rname='bl:kraken2_db',
+        outfh=join(SHARED_PATH, "20180907_standard_kraken2.tar"),
+        tarfile="20180907_standard_kraken2.tar",
+        outdir=SHARED_PATH,
+    container: config['images']['build_rnaseq']
+    shell: """
+    wget https://hpc.nih.gov/~OpenOmics/common/{params.tarfile} -O {params.outfh}
+    tar vxf {params.outfh} -C {params.outdir} && rm {params.outfh}
+    """
+
+
 rule jsonmaker:
     """
     Builds reference genome reference JSON file. This is a config file for the
@@ -527,3 +725,4 @@ rule jsonmaker:
 
         with open(output.json, 'w') as fp:
             json.dump(refdict, fp, indent=4)
+