Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow provided assemblies #147

Merged
merged 5 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions binchicken/binchicken.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,23 @@ def evaluate_bins(aviary_outputs, checkm_version, min_completeness, max_contamin
else:
return {"-".join([c, str(i)]): os.path.join(recovered_bins[c], b + ".fna") for c in coassembly_bins for i, b in enumerate(coassembly_bins[c])}

def check_prior_assemblies(prior_assemblies, input_samples, prior_dir):
mismatched_samples = (
prior_assemblies
.join(pl.DataFrame({"sample": input_samples}), on="sample", how="full", suffix="_input")
)

missing_assemblies = mismatched_samples.filter(pl.col("sample").is_null())
extra_assemblies = mismatched_samples.filter(pl.col("sample_input").is_null())
if missing_assemblies.height > 0:
missing_assemblies = " ".join(missing_assemblies.sort("sample_input").get_column("sample_input").to_list())
raise ValueError(f"Samples missing assemblies in prior assemblies: {missing_assemblies}")
elif extra_assemblies.height > 0:
extra_assemblies = " ".join(extra_assemblies.sort("sample").get_column("sample").to_list())
raise ValueError(f"Extra assemblies not matching any samples in prior assemblies: {extra_assemblies}")

return {row[0]: os.path.join(os.path.dirname(prior_dir), row[1]) for row in prior_assemblies.iter_rows()}

def coassemble(args):
logging.info("Loading sample info")
if args.forward_list:
Expand Down Expand Up @@ -472,6 +489,16 @@ def coassemble(args):
if not args.precluster_size:
args.precluster_size = args.max_recovery_samples * 5

if args.prior_assemblies:
if args.single_assembly:
logging.info("Preparing prior assemblies")
prior_assemblies = pl.read_csv(args.prior_assemblies, separator="\t")

input_samples = args.coassembly_samples if args.coassembly_samples else forward_reads.keys()
prior_assemblies = check_prior_assemblies(prior_assemblies, input_samples, args.prior_assemblies)
else:
prior_assemblies = args.prior_assemblies

try:
build_status = args.build
except AttributeError:
Expand Down Expand Up @@ -510,6 +537,7 @@ def coassemble(args):
"aviary_speed": args.aviary_speed,
"assembly_strategy": args.assembly_strategy,
"run_aviary": args.run_aviary,
"prior_assemblies": prior_assemblies if args.prior_assemblies else {},
"cluster_submission": args.cluster_submission,
"aviary_gtdbtk": args.aviary_gtdbtk_db,
"aviary_checkm2": args.aviary_checkm2_db,
Expand Down Expand Up @@ -720,6 +748,20 @@ def update(args):
args.forward = [f for f in args.forward if f in sra_samples]
args.reverse = args.forward

if args.prior_assemblies:
if args.coassemble_elusive_clusters:
if not args.coassemblies:
elusive_clusters = pl.read_csv(os.path.abspath(args.coassemble_elusive_clusters), separator="\t")

logging.info("Preparing prior assemblies")
prior_assemblies = pl.read_csv(args.prior_assemblies, separator="\t")
input_samples = args.coassemblies if args.coassemblies else elusive_clusters.get_column("coassembly").to_list()

prior_assemblies = check_prior_assemblies(prior_assemblies, input_samples, args.prior_assemblies)
args.prior_assemblies = prior_assemblies
else:
raise ValueError("Prior assemblies require elusive clusters")

copy_input(
os.path.abspath(args.coassemble_unbinned),
os.path.join(args.output, "coassemble", "appraise", "unbinned.otu_table.tsv")
Expand Down Expand Up @@ -1267,6 +1309,7 @@ def add_evaluation_options(argument_group):

def add_aviary_options(argument_group):
argument_group.add_argument("--run-aviary", action="store_true", help="Run Aviary commands for all identified coassemblies (unless specific coassemblies are chosen with --coassemblies) [default: do not]")
argument_group.add_argument("--prior-assemblies", help="Prior assemblies to use for Aviary recovery. tsv file with header sample [tab] assembly for single-sample or coassembly [tab] assembly for update. [default: generate assemblies through Aviary assemble]")
argument_group.add_argument("--cluster-submission", action="store_true", help="Flag that cluster submission will occur through `--snakemake-profile`. This sets the local threads of Aviary recover to 1, allowing parallel job submission [default: do not]")
default_aviary_speed = FAST_AVIARY_MODE
argument_group.add_argument("--aviary-speed", help=f"Run Aviary recover in 'fast' or 'comprehensive' mode. Fast mode skips slow binners and refinement steps. [default: {default_aviary_speed}]",
Expand Down Expand Up @@ -1533,6 +1576,8 @@ def coassemble_argument_verification(args, iterate=False):
raise Exception("Run Aviary (--run-aviary) fast mode requires path to CheckM2 databases to be provided (--aviary-checkm2-db or CHECKM2DB)")
if args.aviary_speed != FAST_AVIARY_MODE and not (args.aviary_gtdbtk_db and args.aviary_checkm2_db):
raise Exception("Run Aviary (--run-aviary) comprehensive mode requires paths to GTDB-Tk and CheckM2 databases to be provided (--aviary-gtdbtk-db or GTDBTK_DATA_PATH and --aviary-checkm2-db or CHECKM2DB)")
if args.prior_assemblies and not args.single_assembly:
raise Exception("Prior assemblies requires `update` or `single`-assembly modes")
if args.cluster_submission and not args.snakemake_profile:
logging.warning("The arg `--cluster-submission` is only a flag and cannot activate cluster submission alone. Please see `--snakemake-profile` for cluster submission.")

Expand Down
1 change: 1 addition & 0 deletions binchicken/config/template_coassemble.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ unmapping_max_alignment: 1
aviary_speed: "fast"
assembly_strategy: "dynamic"
run_aviary: false
prior_assemblies: {}
cluster_submission: false
aviary_gtdbtk: ""
aviary_checkm2: ""
Expand Down
15 changes: 15 additions & 0 deletions binchicken/workflow/coassemble.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#############
ruleorder: no_genomes > query_processing > update_appraise > singlem_appraise
ruleorder: mock_download_sra > download_sra
ruleorder: prior_assemble > aviary_assemble

import os
import polars as pl
Expand Down Expand Up @@ -766,6 +767,20 @@ def get_assemble_assembler(wildcards, attempt):
elif config["assembly_strategy"] == MEGAHIT_ASSEMBLY:
return "--use-megahit"

rule prior_assemble:
input:
elusive_clusters = output_dir + "/target/elusive_clusters.tsv",
output:
dir = directory(output_dir + "/coassemble/{coassembly}/assemble") if config["prior_assemblies"] else [],
assembly = output_dir + "/coassemble/{coassembly}/assemble/assembly/final_contigs.fasta" if config["prior_assemblies"] else [],
params:
prior_assembly = lambda wildcards: config["prior_assemblies"][wildcards.coassembly],
threads: 1
localrule: True
shell:
"mkdir -p {output.dir} && "
"cp {params.prior_assembly} {output.assembly}"

rule aviary_assemble:
input:
output_dir + "/mapping/done" if config["assemble_unmapped"] else output_dir + "/qc/done" if config["run_qc"] else [],
Expand Down
Loading
Loading