Skip to content

Commit

Permalink
remove outgroups
Browse files Browse the repository at this point in the history
  • Loading branch information
dominika.kresa committed Jan 6, 2025
1 parent cafa572 commit 269583f
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 50 deletions.
108 changes: 58 additions & 50 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,57 @@ checkpoint family_fasta:
-v {params.log_level} 2> {log}
"""



# Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
# for locating outgroups that are i) close to the ingroup but not inside it, ii) occur in the OpenToL. The latter will
# hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
# identically.
rule makeblastdb:
input: rules.family_fasta.output.fasta_tsv
output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
params:
fasta_dir=config["file_names"]["fasta_dir"],
concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
conda: "envs/blast.yml"
log: "logs/makeblastdb/makeblastdb.log"
benchmark:
"benchmarks/makeblastdb.benchmark.txt"
shell:
"""
sh workflow/scripts/makeblastdb.sh \
{params.blastdb} {params.fasta_dir} {params.concatenated} 2> {log}
"""


# Gets the nearest outgroups by blasting. The number of outgroups is defined in config["outgroups"]. The outgroups are
# selected by querying every sequence in the ingroup for the top 10 hits, then taking the most frequently occurring
# hits across all queries.
rule get_outgroups:
input:
unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
makeblastdb = rules.makeblastdb.output
output: "results/fasta/taxon/{taxon}/outgroups.fa",
params:
outgroups = config["outgroups"],
blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
conda: "envs/blast.yml"
log: "logs/get_outgroups/get_outgroups-{taxon}.log"
benchmark:
"benchmarks/get_outgroups/get_outgroups-{taxon}.benchmark.txt"
shell:
"""
sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
"""


# === RULE TO COUNT SEQUENCES ===
rule count_sequences:
"""Count the number of sequences in each family.fasta file."""
input: "results/fasta/taxon/{taxon}/unaligned.fa"
input:
"results/fasta/taxon/{taxon}/unaligned.fa",
"results/fasta/taxon/{taxon}/outgroups.fa"
output: "results/fasta/taxon/{taxon}/sequences_count.txt"
run:
count = sum(1 for line in open(input[0]) if line.startswith(">"))
Expand Down Expand Up @@ -191,49 +237,6 @@ def get_failed_taxa(wildcards):
failed_taxa = [taxon for taxon in get_all_taxon(wildcards) if taxon not in passed_taxa]
return failed_taxa



# Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
# for locating outgroups that are i) close to the ingroup but not inside it, ii) occur in the OpenToL. The latter will
# hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
# identically.
rule makeblastdb:
input: rules.family_fasta.output.fasta_tsv
output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
params:
fasta_dir=config["file_names"]["fasta_dir"],
concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
conda: "envs/blast.yml"
log: "logs/makeblastdb/makeblastdb.log"
benchmark:
"benchmarks/makeblastdb.benchmark.txt"
shell:
"""
sh workflow/scripts/makeblastdb.sh \
{params.blastdb} {params.fasta_dir} {params.concatenated} 2> {log}
"""


# Gets the nearest outgroups by blasting. The number of outgroups is defined in config["outgroups"]. The outgroups are
# selected by querying every sequence in the ingroup for the top 10 hits, then taking the most frequently occurring
# hits across all queries.
rule get_outgroups:
input:
unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
makeblastdb = rules.makeblastdb.output
output: "results/fasta/taxon/{taxon}/outgroups.fa",
params:
outgroups = config["outgroups"],
blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
conda: "envs/blast.yml"
log: "logs/get_outgroups/get_outgroups-{taxon}.log"
benchmark:
"benchmarks/get_outgroups/get_outgroups-{taxon}.benchmark.txt"
shell:
"""
sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
"""


# Exports OpenToL newick file for each unaligned BIN sequence file. This implementation uses the induced subtree web
Expand Down Expand Up @@ -414,12 +417,13 @@ rule choose_exemplars:
input:
alignment=lambda wildcards: f"results/fasta/taxon/{wildcards.taxon}/aligned.fa" if wildcards.taxon in get_passed_taxa(wildcards) else [],
tree=lambda wildcards: f"results/fasta/taxon/{wildcards.taxon}/aligned.fa.raxml.bestTree.rooted" if wildcards.taxon in get_passed_taxa(wildcards) else [],
failed_file=lambda wildcards: f"results/fasta/taxon/{wildcards.taxon}/aligned.fa" if wildcards.taxon in get_failed_taxa(wildcards) else []
failed_file=lambda wildcards: f"results/fasta/taxon/{wildcards.taxon}/aligned.fa" if wildcards.taxon in get_failed_taxa(wildcards) else [],
failed_outgroup=lambda wildcards: f"results/fasta/taxon/{wildcards.taxon}/outgroups.fa" if wildcards.taxon in get_failed_taxa(wildcards) else []
output:
"results/fasta/taxon/{taxon}/exemplars.fa"
params:
log_level = config['log_level'],
strategy = 'median'
log_level=config['log_level'],
strategy='median'
log:
"logs/choose_exemplars/choose_exemplars-{taxon}.log"
benchmark:
Expand All @@ -432,12 +436,15 @@ rule choose_exemplars:
echo "Alignment file: {input.alignment}" >> {log}
echo "Tree file: {input.tree}" >> {log}
echo "Failed file: {input.failed_file}" >> {log}
echo "Outgroup file: {input.failed_outgroup}" >> {log}
# Check if the failed file exists and is not empty
if [ -n "{input.failed_file}" ] && [ -s {input.failed_file} ]; then
echo "Failed file exists, skipping the choose_exemplars.py script." >> {log}
# Copy the failed file to the output
cp {input.failed_file} {output}
echo "Failed file exists, processing with process_failed_file.py." >> {log}
python workflow/scripts/process_failed_file.py \
--failed {input.failed_file} \
--outgroup {input.failed_outgroup} \
--output {output} 2>> {log}
# If the failed file doesn't exist and the tree file is available, run the choose_exemplars.py script
elif [ -s {input.tree} ] && [ -s {input.alignment} ]; then
python workflow/scripts/choose_exemplars.py \
Expand All @@ -455,6 +462,7 @@ rule choose_exemplars:




# When aligning the family-level subsets, different families may have different indel
# patterns. So, although they are all orientated against the model in the same way,
# there can be some frame shifting. The least bad option is to unalign and realign
Expand Down
25 changes: 25 additions & 0 deletions workflow/scripts/process_failed_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import argparse
from Bio import SeqIO

def remove_outgroup_records(failed_file, outgroup_file, output_file):
# Read outgroup sequences
outgroup_records = set()
if outgroup_file:
with open(outgroup_file, "r") as outgroup:
for record in SeqIO.parse(outgroup, "fasta"):
outgroup_records.add(record.id)

# Filter failed file records
with open(failed_file, "r") as failed, open(output_file, "w") as output:
for record in SeqIO.parse(failed, "fasta"):
if record.id not in outgroup_records:
SeqIO.write(record, output, "fasta")

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process failed file by removing outgroup records.")
parser.add_argument("--failed", required=True, help="Path to the failed file (FASTA format).")
parser.add_argument("--outgroup", required=False, help="Path to the outgroup file (FASTA format).")
parser.add_argument("--output", required=True, help="Path to the output file (FASTA format).")
args = parser.parse_args()

remove_outgroup_records(args.failed, args.outgroup, args.output)

0 comments on commit 269583f

Please sign in to comment.