-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSnakefile
474 lines (447 loc) · 19.3 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
# Workflow-wide settings (marker, nfamilies, file paths, filters, model, etc.)
# are read from this config file; every rule below pulls values from `config`.
configfile: "config/config.yaml"

# TODO refactor the snakefile so that all dependencies are managed by a containerized mamba.
# TODO make it so that parallelization is dynamic, probably using checkpoints.
# container: "docker://condaforge/mambaforge:23.1.0-1"
# Pseudo-rule declaring the final workflow target: the grafted total tree.
rule all:
    input: "results/grafted.tre"
# Creates and populates a SQLite database with filtered sequence records.
# Uses BOLD dump TSV as defined in config file
rule create_database:
    input: config["file_names"]["bold_tsv"]
    # Sentinel file only; the real product is the database under params.db.
    output: 'results/databases/create_database.ok'
    params:
        # Database path embeds the configured marker name (e.g. COI-5P).
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        # NOTE(review): marker and minlength are declared here but not passed to
        # the script below — confirm whether create_database.py should receive
        # them (e.g. for length filtering) or whether they are vestigial.
        marker=config["marker"],
        minlength=config["minlength"],
        log_level=config['log_level']
    conda: "envs/create_database.yml"
    log: "logs/create_database/create_database.log"
    benchmark:
        "benchmarks/create_database.benchmark.txt"
    shell:
        """
        python workflow/scripts/create_database.py \
            -v {params.log_level} \
            -o {params.db} \
            -i {input} 2> {log}
        touch {output}
        """
# Enriches the database with mappings to OpenToL. Because this operates on
# the same database file, the output is a 0-byte file `map_opentol.ok` to
# indicate that the task was run.
rule map_opentol:
    input: 'results/databases/create_database.ok'
    output: 'results/databases/map_opentol.ok'
    params:
        # Same database file produced by create_database; mutated in place.
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        marker=config['marker'],
        log_level=config['log_level']
    conda: "envs/map_opentol.yml"
    log: "logs/map_opentol/map_opentol.log"
    benchmark:
        "benchmarks/map_opentol.benchmark.txt"
    shell:
        """
        python workflow/scripts/map_opentol.py \
            -d {params.db} \
            -m {params.marker} \
            -v {params.log_level} \
            -o {output} 2> {log}
        """
# Creates and populates the OpenToL SQLite database. Uses the location of
# the OpenToL tree as per the config file. Merges the resulting database
# with the BOLD database (which gains a table `node`). SQLite can, at time
# of writing, not apply foreign key constraints retroactively. This is a
# shame because taxon.opentol_id implicitly references node.id.
rule megatree_loader:
    input:
        tree = config["file_names"]["open_tre"],
        mapping_ok = rules.map_opentol.output
    output: 'results/databases/megatree_loader.ok'
    conda: "envs/megatree_loader.yml"
    log: "logs/megatree_loader/megatree_loader.log"
    benchmark:
        "benchmarks/megatree_loader.benchmark.txt"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        # Intermediate scratch files, removed by the `rm` at the end of the
        # shell block. Note: these must NOT be wrapped in temp() — that flag
        # only has meaning for entries under `output:`; in `params:` it is a
        # silent no-op and misleads readers into expecting automatic cleanup.
        tempdb="results/databases/opentree_nodes.db",
        tempsql="results/databases/node.sql"
    shell:
        """
        megatree-loader -i {input.tree} -d {params.tempdb} -v 2> {log}
        sqlite3 {params.tempdb} ".dump node" > {params.tempsql} 2>> {log}
        sqlite3 {params.db} < {params.tempsql} 2>> {log}
        touch {output} 2>> {log}
        rm {params.tempsql} {params.tempdb}
        """
# TODO either this is updated dynamically or scattergather is abandoned for dynamic()
# Fixes the number of scatter/gather chunks used for family-level
# parallelization; every scatter.split(...) / gather.split(...) below expands
# to this many {scatteritem} instances.
scattergather:
    split = config["nfamilies"]
# Exports unaligned BIN sequences for each family within the total set. This task is parallelized as many times as
# specified by config["nfamilies"]. During this task, the longest raw sequence within each BIN is selected.
rule family_fasta:
    input: rules.map_opentol.output
    output:
        # One unaligned FASTA per scatter chunk, plus a sentinel marking completion.
        fastas=scatter.split("results/fasta/family/{scatteritem}/unaligned.fa"),
        status=f'{config["file_names"]["fasta_dir"]}/family_fasta.ok'
    params:
        log_level=config['log_level'],
        fasta_dir=config["file_names"]["fasta_dir"],
        # Taxonomic level and name used to subset the database (e.g. a class or order).
        filter_level=config["fasta_filter"]["filter_level"],
        filter_name=config["fasta_filter"]["filter_name"],
        marker=config["marker"],
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        chunks=config["nfamilies"],
    conda: "envs/family_fasta.yml"
    log: "logs/family_fasta/family_fasta.log"
    benchmark:
        "benchmarks/family_fasta.benchmark.txt"
    shell:
        """
        python workflow/scripts/family_fasta.py \
            -d {params.db} \
            -f {params.fasta_dir} \
            -l {params.filter_level} \
            -n {params.filter_name} \
            -c {params.chunks} \
            -m {params.marker} \
            -v {params.log_level} 2> {log}
        touch {output.status}
        """
# Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
# for locating outgroups that are i) close to the ingroup but not inside it, ii) occur in the OpenToL. The latter will
# hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
# identically.
rule makeblastdb:
    input: rules.family_fasta.output.status
    # .nsq is one of the volume files makeblastdb writes; used here as a proxy
    # for the whole database.
    output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
    params:
        chunks=config["nfamilies"],
        fasta_dir=config["file_names"]["fasta_dir"],
        # The per-family FASTAs are concatenated into this file before indexing.
        concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
        blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
    conda: "envs/blast.yml"
    log: "logs/makeblastdb/makeblastdb.log"
    benchmark:
        "benchmarks/makeblastdb.benchmark.txt"
    shell:
        """
        sh workflow/scripts/makeblastdb.sh \
            {params.blastdb} {params.fasta_dir} {params.chunks} {params.concatenated} 2> {log}
        """
# Gets the nearest outgroups by blasting. The number of outgroups is defined in config["outgroups"]. The outgroups are
# selected by querying every sequence in the ingroup for the top 10 hits, then taking the most frequently occurring
# hits across all queries.
rule get_outgroups:
    input:
        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
        makeblastdb = rules.makeblastdb.output
    output: "results/fasta/family/{scatteritem}/outgroups.fa",
    params:
        # How many outgroup sequences to select per family chunk.
        outgroups = config["outgroups"],
        blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
    conda: "envs/blast.yml"
    log: "logs/get_outgroups/get_outgroups-{scatteritem}.log"
    benchmark:
        "benchmarks/get_outgroups/get_outgroups-{scatteritem}.benchmark.txt"
    shell:
        """
        sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
        """
# For each family chunk, fetches an OpenToL constraint tree covering the
# ingroup plus its selected outgroups. Uses the induced-subtree web service
# endpoint from OpenToL, which appears to achieve better coverage than the
# previous implementation.
rule family_constraint:
    input:
        unaligned="results/fasta/family/{scatteritem}/unaligned.fa",
        outgroups="results/fasta/family/{scatteritem}/outgroups.fa"
    output:
        "results/fasta/family/{scatteritem}/constraint.tre"
    params:
        db=f"results/databases/BOLD_{config['marker']}_barcodes.db",
        log_level=config["log_level"]
    conda: "envs/family_constraint.yml"
    log: "logs/family_constraint/family_constraint-{scatteritem}.log"
    benchmark:
        "benchmarks/family_constraint/family_constraint-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/family_constraint.py \
            -i {input.unaligned} \
            -g {input.outgroups} \
            -d {params.db} \
            -o {output} \
            -v {params.log_level} 2> {log}
        """
# Aligns each family chunk against the profile HMM, also correcting sequences
# for possible reverse-complement issues. So far zero revcom sequences have
# been observed in BOLD, so that correction may turn out to be unnecessary.
rule msa_hmm:
    input:
        ingroup="results/fasta/family/{scatteritem}/unaligned.fa",
        outgroup="results/fasta/family/{scatteritem}/outgroups.fa"
    output:
        "results/fasta/family/{scatteritem}/aligned.fa"
    params:
        db=f"results/databases/BOLD_{config['marker']}_barcodes.db",
        hmm_file=config["file_names"]["hmm"],
        log_level=config["log_level"]
    conda: "envs/msa_hmm.yml"
    log: "logs/msa_hmm/msa_hmm-{scatteritem}.log"
    benchmark:
        "benchmarks/msa_hmm/msa_hmm-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/msa_hmm.py \
            -i {input.ingroup} \
            -g {input.outgroup} \
            -o {output} \
            -m {params.hmm_file} \
            -v {params.log_level} \
            -d {params.db} 2> {log}
        """
# Prepares the aligned ingroup sequences and constraint tree for analysis by raxml-ng. This step also includes outgroup
# selection. A number of changes are needed here. Firstly, there are cases where the selected outgroup sequences yield
# results where the ingroup is not monophyletic with respect to the outgroups. This means that outgroup selection needs
# to be refined by considering all pairwise distances within the ingroup, and any outgroup must have a greater distance
# to the nearest ingroup taxon than the latter has to all other members of the ingroup. Secondly, the constraint tree
# should be generated here so that it also includes the outgroups, which means that an additional outgroup selection
# criterion should be that the candidate outgroups are in the OpenToL.
rule prep_raxml:
    input:
        alignment="results/fasta/family/{scatteritem}/aligned.fa",
        tree="results/fasta/family/{scatteritem}/constraint.tre"
    output:
        # Constraint tree with tip labels remapped to alignment sequence IDs.
        tree="results/fasta/family/{scatteritem}/remapped.tre"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level=config['log_level']
    conda: "envs/prep_raxml.yml"
    log: "logs/prep_raxml/prep_raxml-{scatteritem}.log"
    benchmark:
        "benchmarks/prep_raxml/prep_raxml-{scatteritem}.benchmark.txt"
    shell:
        """
        python workflow/scripts/prep_raxml.py \
            -t {input.tree} \
            -o {output.tree} \
            -a {input.alignment} \
            -v {params.log_level} \
            -d {params.db} 2> {log}
        """
# Runs raxml as a constrained tree search. Possibly this should instead take a two-step approach, where unplaced
# sequences are first placed, and then the branch lengths are estimated. This requires that there actually is a
# constraint tree. Currently, this rule deals with the fact that some of the constraint trees are 0-byte files due
# to incomplete OpenToL coverage by trapping raxml-ng errors and then re-running without the constraint.
rule run_raxml:
    input:
        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
        tree = "results/fasta/family/{scatteritem}/remapped.tre"
    output:
        # raxml-ng derives this name from the input alignment; an empty file is
        # produced instead when the alignment itself is empty (see shell below).
        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree"
    params:
        model = config['model'],
        num_outgroups= config['outgroups']
    log: "logs/run_raxml/run_raxml-{scatteritem}.log"
    benchmark:
        "benchmarks/run_raxml/run_raxml-{scatteritem}.benchmark.txt"
    conda: "envs/raxml.yml"
    # The outgroup list $OG is built from the last N FASTA headers, relying on
    # get_outgroups.sh / msa_hmm.py appending outgroups after the ingroup.
    # NOTE(review): `tr '\n' ','` leaves a trailing comma on $OG — confirm
    # raxml-ng's --outgroup parser tolerates it.
    # NOTE(review): `set -e` is enabled only in the constrained branch; Snakemake
    # normally runs shell blocks under strict mode already — verify intent.
    shell:
        """
        if [ -s {input.alignment} ]; then
            OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',')
            if [ -s {input.tree} ]; then
                set -e
                raxml-ng \
                    --redo \
                    --msa {input.alignment} \
                    --outgroup $OG \
                    --model {params.model} \
                    --tree-constraint {input.tree} \
                    --search1 > {log} 2>&1 \
                    || \
                    raxml-ng \
                    --redo \
                    --msa {input.alignment} \
                    --outgroup $OG \
                    --model {params.model} \
                    --search1 >> {log} 2>&1
            else
                raxml-ng --msa {input.alignment} --outgroup $OG --model {params.model} --search --redo > {log} 2>&1
            fi
        else
            touch {output.tree}
        fi
        """
# Reroots the raxml output in and then prunes the outgroups. Rooting is done by finding the smallest clade subtended
# by the root in the constraint tree, looking for the smallest bipartition in the result tree that includes those same
# clade members and then rooting on that bipartition branch. Subsequently, the outgroup taxa are removed.
rule reroot_raxml_output:
    input:
        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree",
        constraint = "results/fasta/family/{scatteritem}/remapped.tre",
        alignment = "results/fasta/family/{scatteritem}/aligned.fa"
    output:
        outtree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
    params:
        log_level = config['log_level'],
        num_outgroups = config['outgroups']
    log: "logs/reroot_raxml_output/reroot_raxml_output-{scatteritem}.log"
    benchmark:
        "benchmarks/reroot_raxml_output/reroot_raxml_output-{scatteritem}.benchmark.txt"
    conda: "envs/reroot_backbone.yml"
    # $OG is rebuilt here exactly as in run_raxml (last N FASTA headers);
    # empty alignments propagate as empty output files, mirroring run_raxml.
    shell:
        """
        if [ -s {input.alignment} ]; then
            OG=$(grep '>' {input.alignment} | tail -{params.num_outgroups} | sed -e 's/>//' | tr '\n' ',')
            python workflow/scripts/reroot_backbone.py \
                -i {input.tree} \
                -c {input.constraint} \
                -o {output.outtree} \
                -g $OG \
                -v {params.log_level} 2> {log}
        else
            touch {output.outtree}
        fi
        """
# In principle, exemplar choosing can be performed using one of three ways:
# 1. the tallest tips on either side of the root are chosen
# 2. the shallowest tips on either side
# 3. the ones closest to the median root-to-tip path length
# Empirically, the first option yields rescaled branch lengths that are closest
# to those optimized freely on a total tree.
rule choose_exemplars:
    input:
        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
        tree= "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
    output: "results/fasta/family/{scatteritem}/exemplars.fa"
    params:
        log_level = config['log_level'],
        # NOTE(review): 'median' corresponds to option 3, yet the comment above
        # reports option 1 ('tallest') as empirically best — confirm which
        # strategy is actually intended here.
        strategy = 'median'
    log: "logs/choose_exemplars/choose_exemplars-{scatteritem}.log"
    benchmark:
        "benchmarks/choose_exemplars/choose_exemplars-{scatteritem}.benchmark.txt"
    conda: "envs/choose_exemplars.yaml"
    shell:
        """
        if [ -s {input.alignment} ]; then
            python workflow/scripts/choose_exemplars.py \
                -v {params.log_level} \
                -t {input.tree} \
                -i {input.alignment} \
                -s {params.strategy} \
                -o {output} 2> {log}
        else
            touch {output}
        fi
        """
# When aligning the family-level subsets, different families may have different indel
# patterns. So, although they are all orientated against the model in the same way,
# there can be some frame shifting. The least bad option is to unalign and realign
# the exemplar sequences on the fly. Sed does the unaligning, hmmalign emits Stockholm
# data on the command line, seqmagick turns it into FASTA.
rule prep_raxml_backbone:
    input:
        # All per-family exemplar FASTAs (gathered across scatter chunks) and
        # the sentinel indicating the OpenToL database merge completed.
        fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa"),
        opentol='results/databases/megatree_loader.ok'
    output:
        fasta="results/fasta/raxml-ready.fa",
        tree="results/fasta/raxml-ready.tre",
        extinct="results/fasta/extinct_pids.txt"
    params:
        db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
        log_level = config['log_level'],
        hmm_file=config['file_names']['hmm']
    log: "logs/prep_raxml_backbone/prep_raxml_backbone.log"
    benchmark:
        "benchmarks/prep_raxml_backbone.benchmark.txt"
    conda: "envs/prep_raxml_backbone.yml"
    shell:
        """
        # Generate constraint tree and list of extinct PIDs
        python workflow/scripts/backbone_constraint.py \
            -d {params.db} \
            -v {params.log_level} \
            -i '{input.fastas}' \
            -e {output.extinct} \
            -o {output.tree} 2> {log}
        # Clean the concatenated FASTA by removing gaps (dashes)
        sed '/^>/! s/-//g' {input.fastas} > results/fasta/unaligned.fa
        # Align with hmmalign and output in Stockholm format
        hmmalign --trim --dna --informat FASTA --outformat Stockholm {params.hmm_file} results/fasta/unaligned.fa > results/fasta/aligned.sto
        # Convert the Stockholm alignment to a non-interleaved FASTA format for RAxML
        seqmagick convert results/fasta/aligned.sto {output.fasta}
        # Remove any extinct PIDs. Guarded with if/fi rather than `[ -e ... ] && cmd`
        # because a failed test as the last statement would yield a non-zero exit
        # status and fail the rule under bash strict mode (set -euo pipefail).
        if [ -e {output.extinct} ]; then
            seqmagick mogrify --exclude-from-file {output.extinct} {output.fasta}
        fi
        """
# Infers the backbone topology under ML with raxml-ng, constrained so that each
# pair of exemplars stays monophyletic. No outgroups are specified: this step
# may in principle span the entire taxonomic width of BOLD, so the tree is
# rooted later using the constraint instead.
rule run_raxml_backbone:
    input:
        alignment="results/fasta/raxml-ready.fa",
        tree="results/fasta/raxml-ready.tre"
    output:
        tree="results/fasta/raxml-ready.fa.raxml.bestTree"
    params:
        model=config["model"]
    conda: "envs/raxml.yml"
    log: "logs/run_raxml_backbone/run_raxml_backbone.log"
    benchmark:
        "benchmarks/run_raxml_backbone.benchmark.txt"
    shell:
        """
        raxml-ng \
            --redo \
            --msa {input.alignment} \
            --model {params.model} \
            --tree-constraint {input.tree} \
            --search > {log} 2>&1
        """
# Roots the backbone via the constraint tree: find the basal split in the
# constraint, locate the equivalent bipartition in the inferred tree, and root
# there. TODO: the root currently lands on the midpoint of that bipartition
# branch, which is not ideal, but with no outgroups available it is unclear
# what better option exists.
rule reroot_backbone:
    input:
        backbone="results/fasta/raxml-ready.fa.raxml.bestTree",
        constraint="results/fasta/raxml-ready.tre"
    output:
        tree="results/fasta/raxml-ready.fa.raxml.bestTree.rooted"
    params:
        log_level=config["log_level"]
    conda: "envs/reroot_backbone.yml"
    log: "logs/reroot_backbone/reroot_backbone.log"
    benchmark:
        "benchmarks/reroot_backbone.benchmark.txt"
    shell:
        """
        python workflow/scripts/reroot_backbone.py \
            -i {input.backbone} \
            -c {input.constraint} \
            -o {output.tree} \
            -v {params.log_level} 2> {log}
        """
# Grafts the individual family clades onto the backbone. This is done by taking the
# backbone tree and replacing the tips that are in the clade with the clade tree.
rule graft_clades:
    input:
        backbone = "results/fasta/raxml-ready.fa.raxml.bestTree.rooted",
        # Directory holding the per-family subtrees produced by the scatter rules.
        clades = config['file_names']['fasta_dir'],
        extinct = "results/fasta/extinct_pids.txt"
    output:
        # The final workflow product, consumed by `rule all`.
        tree = "results/grafted.tre"
    params:
        log_level = config['log_level'],
        nfamilies = config["nfamilies"]
    log: "logs/graft_clades/graft_clades.log"
    benchmark:
        "benchmarks/graft_clades.benchmark.txt"
    conda: "envs/graft_clades.yml"
    shell:
        """
        python workflow/scripts/graft_clades.py \
            -t {input.backbone} \
            -f {input.clades} \
            -e {input.extinct} \
            -o {output.tree} \
            -n {params.nfamilies} \
            -v {params.log_level} 2> {log}
        """