From c3cd0a3b829c14bbf53db2edabd733de980c8ce2 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Tue, 28 Jan 2025 10:31:15 +0100
Subject: [PATCH 1/2] feat: use nightly nextclade tree

- [x] switch Nextclade dataset to directory format (which allows replacing dataset files)
- [x] replace the reference tree in the dataset with the nightly tree from https://nextstrain.org/staging/nextclade/sars-cov-2

This allows us to bypass laggy Nextclade dataset updates and always use the latest data, which may or may not be what we want.

This aims to be a workaround until the dataset updates are sorted out.

Potential problems:
- nightly trees are not systematically reviewed and can contain bugs
- do any other parts of the dataset need to be updated along with the tree? (such as pathogen.json)
- do any other repos need to be updated to use the nightly tree? (e.g. ncov-ingest) i.e. is there an assumption that the exact same dataset is used in 2 or more places?
---
 workflow/snakemake_rules/main_workflow.smk | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 8834897bf..9d4727d38 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -455,14 +455,20 @@ rule prepare_nextclade:
         Downloading reference files for nextclade (used for alignment and qc).
         """
     output:
-        nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip",
+        nextclade_dataset = "data/sars-cov-2-nextclade-defaults",
     params:
         name = config["nextclade_dataset"],
     conda: config["conda_environment"]
     shell:
         r"""
         nextclade --version
-        nextclade dataset get --name {params.name} --output-zip {output.nextclade_dataset}
+        nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset}
+
+        # override tree.json with nightly tree
+        curl -fsSL \
+            -o {output.nextclade_dataset}/tree.json \
+            -H "Accept: application/vnd.nextstrain.dataset.main+json;q=1, application/json;q=0.9, text/plain;q=0.8, */*;q=0.1" \
+            "https://nextstrain.org/staging/nextclade/sars-cov-2"
         """
 
 rule build_align:
@@ -473,7 +479,7 @@ rule build_align:
         """
     input:
         sequences = rules.combine_samples.output.sequences,
-        nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip",
+        nextclade_dataset = "data/sars-cov-2-nextclade-defaults",
     output:
         alignment = "results/{build_name}/aligned.fasta",
         nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',

From 00ce5813a86c7d1548173818d6fd341710063666 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Tue, 28 Jan 2025 10:43:16 +0100
Subject: [PATCH 2/2] fix: flag output directory with directory()

Resolves error:

```
ImproperOutputException in rule prepare_nextclade in file /home/runner/work/ncov/ncov/workflow/snakemake_rules/main_workflow.smk, line 452:
Outputs of incorrect type (directories when expecting files or vice versa). Output directories must be flagged with directory(). for rule prepare_nextclade:
    output: data/sars-cov-2-nextclade-defaults
    affected files:
        data/sars-cov-2-nextclade-defaults
```

https://github.com/nextstrain/ncov/actions/runs/13007476969/job/36277462988?pr=1170#step:8:136
---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 9d4727d38..25db4f843 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -455,7 +455,7 @@ rule prepare_nextclade:
         Downloading reference files for nextclade (used for alignment and qc).
         """
     output:
-        nextclade_dataset = "data/sars-cov-2-nextclade-defaults",
+        nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"),
     params:
         name = config["nextclade_dataset"],
     conda: config["conda_environment"]