Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
pecholleyn committed Jan 7, 2025
1 parent 8ba55e7 commit 55c4a74
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 18 deletions.
18 changes: 8 additions & 10 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ def get_all_taxon(wildcards):
t for t in os.listdir(taxon_dir)
if os.path.isdir(os.path.join(taxon_dir, t))
and os.stat(os.path.join(taxon_dir, t, "unaligned.fa")).st_size != 0 # ignore empty files
and "{" not in t ] # ignore one file named "Erebidae-Plecoptera{Genus_moth}", it breaks the wildcard
]
return taxon


def get_prep_raxml_backbone_input(wildcards):
return expand(
"results/fasta/taxon/{{taxon}}/exemplars.fa".format(config['datatype']),
taxon=get_all_taxon(wildcards) # TODO remove the limit
taxon=get_all_taxon(wildcards)
)

rule all:
Expand Down Expand Up @@ -197,11 +198,13 @@ rule count_sequences:
with open(output[0], "w") as f:
f.write(f"{count}\n")

def get_list_taxa_file_input(wildcards):
return expand("results/fasta/taxon/{taxon}/sequences_count.txt", taxon=get_all_taxon(wildcards))

# === CHECKPOINT TO GENERATE LISTS ===
checkpoint list_taxa_files:
"""Generate a list of taxa that passed the condition."""
input: expand("results/fasta/taxon/{taxon}/sequences_count.txt", taxon=get_all_taxon)
input: get_list_taxa_file_input
output:
passed="results/passed_taxa.txt"
run:
Expand All @@ -225,7 +228,7 @@ def get_passed_taxa(wildcards):
"""Get the list of taxa that passed the condition from the checkpoint output."""
checkpoint_output = checkpoints.list_taxa_files.get().output.passed
with open(checkpoint_output, "r") as passed_f:
passed_taxa = [line.strip() for line in passed_f.readlines()]
passed_taxa = {line.strip() for line in passed_f.readlines()}
return passed_taxa

# === FUNCTION TO GET TAXA THAT FAILED THE CONDITION ===
Expand All @@ -234,7 +237,7 @@ def get_failed_taxa(wildcards):
checkpoint_output = checkpoints.list_taxa_files.get().output.passed
with open(checkpoint_output, "r") as passed_f:
passed_taxa = {line.strip() for line in passed_f.readlines()}
failed_taxa = [taxon for taxon in get_all_taxon(wildcards) if taxon not in passed_taxa]
failed_taxa = {taxon for taxon in get_all_taxon(wildcards) if taxon not in passed_taxa}
return failed_taxa


Expand Down Expand Up @@ -460,9 +463,6 @@ rule choose_exemplars:
"""





# When aligning the family-level subsets, different families may have different indel
# patterns. So, although they are all orientated against the model in the same way,
# there can be some frame shifting. The least bad option is to unalign and realign
Expand Down Expand Up @@ -569,7 +569,6 @@ rule graft_clades:
tree = "results/grafted.tre"
params:
log_level = config['log_level'],
nfamilies = config["nfamilies"]
log: "logs/graft_clades/graft_clades.log"
benchmark:
"benchmarks/graft_clades.benchmark.txt"
Expand All @@ -581,6 +580,5 @@ rule graft_clades:
-f {input.clades} \
-e {input.extinct} \
-o {output.tree} \
-n {params.nfamilies} \
-v {params.log_level} 2> {log}
"""
14 changes: 7 additions & 7 deletions workflow/scripts/graft_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,18 @@ def read_tree(filename, rooting='default-rooted', schema='newick'):
parser.add_argument('-f', '--folder', required=True, help='Location of folder with subtree folders')
parser.add_argument('-e', '--extinct', required=True, help='File with extinct PIDs to skip')
parser.add_argument('-o', '--out', required=True, help="Output grafted newick")
parser.add_argument('-n', '--nfamilies', required=True, help='Number of families')
parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)')
args = parser.parse_args()

# Configure logger
logger = util.get_formatted_logger('graft_clades', args.verbosity)

# Read the extinct PIDs
extinct = []
extinct = set()
with open(args.extinct, 'r') as file:
for line in file:
clean_line = line.strip()
extinct.append(clean_line)
extinct.add(clean_line)
logger.info(f"extinct: {extinct}")
# Read the backbone tree as a dendropy object, calculate distances to root, and get its leaves
backbone = read_tree(args.tree)
Expand All @@ -74,15 +73,16 @@ def read_tree(filename, rooting='default-rooted', schema='newick'):

# Iterate over folders
base_folder = os.path.abspath(args.folder)
for i in range(1, int(args.nfamilies) + 1):
logger.info(f'Processing subtree {i}')
for taxon in os.listdir(base_folder):
logger.info(f'Processing subtree {taxon}')
subfolder = os.path.join(base_folder, taxon)

# Peprocess the focal family tree
subfolder = f'{i}-of-{args.nfamilies}'
# Preprocess the focal family tree
subtree_file = os.path.join(base_folder, subfolder, 'aligned.fa.raxml.bestTree.rooted')
try:
subtree = read_tree(subtree_file)
except:
logger.info(f'warning: subtree for taxon "{taxon}" could not be read')
continue
subtree.calc_node_root_distances()

Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/makeblastdb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ TMP=$3
# doing this sequentially to avoid race conditions in BLAST indexing
i=0
for TAXON in $(ls -d $FASTADIR/taxon/*); do
i+=1
i=$((i + 1))
INFILE=${TAXON}/unaligned.fa

# only keep records with ott IDs, reformat the headers to retain the process ID, write to $TMP
Expand Down

0 comments on commit 55c4a74

Please sign in to comment.