From c3cd0a3b829c14bbf53db2edabd733de980c8ce2 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 28 Jan 2025 10:31:15 +0100 Subject: [PATCH 1/2] feat: use nightly nextclade tree - [x] switch the Nextclade dataset to directory format (which allows replacing dataset files) - [x] replace reference tree in the dataset with the nightly tree from https://nextstrain.org/staging/nextclade/sars-cov-2 This allows us to bypass laggy Nextclade dataset updates and always use the latest data. Which may or may not be what we want. This aims to be a workaround until the dataset updates are sorted out. Potential problems: - nightly trees are not systematically reviewed and can contain bugs - do any other parts of the dataset need to be updated along with the tree? (such as pathogen.json) - do any other repos need to be updated to use the nightly tree? (e.g. ncov-ingest) i.e. is there an assumption that the exact same dataset is used in 2 or more places? --- workflow/snakemake_rules/main_workflow.smk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 8834897bf..9d4727d38 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -455,14 +455,20 @@ rule prepare_nextclade: Downloading reference files for nextclade (used for alignment and qc). 
""" output: - nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", + nextclade_dataset = "data/sars-cov-2-nextclade-defaults", params: name = config["nextclade_dataset"], conda: config["conda_environment"] shell: r""" nextclade --version - nextclade dataset get --name {params.name} --output-zip {output.nextclade_dataset} + nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset} + + # override tree.json with nightly tree + curl -fsSL \ + -o {output.nextclade_dataset}/tree.json \ + -H "Accept: application/vnd.nextstrain.dataset.main+json;q=1, application/json;q=0.9, text/plain;q=0.8, */*;q=0.1" \ + "https://nextstrain.org/staging/nextclade/sars-cov-2" """ rule build_align: @@ -473,7 +479,7 @@ rule build_align: """ input: sequences = rules.combine_samples.output.sequences, - nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip", + nextclade_dataset = "data/sars-cov-2-nextclade-defaults", output: alignment = "results/{build_name}/aligned.fasta", nextclade_qc = 'results/{build_name}/nextclade_qc.tsv', From 00ce5813a86c7d1548173818d6fd341710063666 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 28 Jan 2025 10:43:16 +0100 Subject: [PATCH 2/2] fix: flag output directory with directory() Resolves error: ``` ImproperOutputException in rule prepare_nextclade in file /home/runner/work/ncov/ncov/workflow/snakemake_rules/main_workflow.smk, line 452: Outputs of incorrect type (directories when expecting files or vice versa). Output directories must be flagged with directory(). 
for rule prepare_nextclade: output: data/sars-cov-2-nextclade-defaults affected files: data/sars-cov-2-nextclade-defaults ``` https://github.com/nextstrain/ncov/actions/runs/13007476969/job/36277462988?pr=1170#step:8:136 --- workflow/snakemake_rules/main_workflow.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 9d4727d38..25db4f843 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -455,7 +455,7 @@ rule prepare_nextclade: Downloading reference files for nextclade (used for alignment and qc). """ output: - nextclade_dataset = "data/sars-cov-2-nextclade-defaults", + nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"), params: name = config["nextclade_dataset"], conda: config["conda_environment"]