From 30dc8a87abc60e4a8fe48859b3d8b793d15559fd Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Thu, 23 Jan 2025 10:27:33 -0800
Subject: [PATCH] Improve documentation of the MLR lineage fitness analysis

This mostly updates the description.md files to include a sentence describing the MLR fitness coloring.
---
 defaults/auspice_config.json                  |  5 ++++
 .../nextstrain_description.md                 |  2 +-
 .../nextstrain_description.md                 |  2 +-
 .../nextstrain-open/nextstrain_description.md |  2 +-
 scripts/fetch_mlr_lineage_fitness.py          | 24 ++++++++++++-------
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/defaults/auspice_config.json b/defaults/auspice_config.json
index 3c646bcdc..b94c2bd2a 100644
--- a/defaults/auspice_config.json
+++ b/defaults/auspice_config.json
@@ -50,6 +50,11 @@
       "title": "Mutational fitness",
       "type": "continuous"
     },
+    {
+      "key": "mlr_lineage_fitness",
+      "title": "MLR lineage fitness",
+      "type": "continuous"
+    },
     {
       "key": "location",
       "title": "Location",
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/nextstrain_description.md b/nextstrain_profiles/nextstrain-gisaid-21L/nextstrain_description.md
index 69ba11ee5..1b1ee1dab 100644
--- a/nextstrain_profiles/nextstrain-gisaid-21L/nextstrain_description.md
+++ b/nextstrain_profiles/nextstrain-gisaid-21L/nextstrain_description.md
@@ -28,7 +28,7 @@ There are millions of complete SARS-CoV-2 genomes available and this number incr
 **Oceania**       | [21L/oceania/1m](/ncov/gisaid/21L/oceania/1m?f_region=Oceania)                     | [21L/oceania/2m](/ncov/gisaid/21L/oceania/2m?f_region=Oceania)                     | [21L/oceania/6m](/ncov/gisaid/21L/oceania/6m?f_region=Oceania)                     | [21L/oceania/all-time](/ncov/gisaid/21L/oceania/all-time?f_region=Oceania)                     |
 **South America** | [21L/south-america/1m](/ncov/gisaid/21L/south-america/1m?f_region=South%20America) | [21L/south-america/2m](/ncov/gisaid/21L/south-america/2m?f_region=South%20America) | [21L/south-america/6m](/ncov/gisaid/21L/south-america/6m?f_region=South%20America) | [21L/south-america/all-time](/ncov/gisaid/21L/south-america/all-time?f_region=South%20America) |
 
-Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to clade 21L (Pango lineage BA.2) reference virus. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
+Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to clade 21L (Pango lineage BA.2) reference virus. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). MLR lineage fitness is fetched from the frequency analysis at [nextstrain.org/sars-cov-2/forecasts](https://nextstrain.org/sars-cov-2/forecasts) and indicates the relative growth advantage of circulating Pango lineages. Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
 
 We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata made available through [GISAID](https://gisaid.org) on which this research is based. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Acknowledgments" in the resulting dialog box.
 
diff --git a/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md b/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md
index e9a9e340f..d6fd9b3f2 100644
--- a/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md
+++ b/nextstrain_profiles/nextstrain-gisaid/nextstrain_description.md
@@ -28,7 +28,7 @@ There are millions of complete SARS-CoV-2 genomes available and this number incr
 **Oceania**       | [21L/oceania/1m](/ncov/gisaid/21L/oceania/1m?f_region=Oceania)                     | [21L/oceania/2m](/ncov/gisaid/21L/oceania/2m?f_region=Oceania)                     | [21L/oceania/6m](/ncov/gisaid/21L/oceania/6m?f_region=Oceania)                     | [21L/oceania/all-time](/ncov/gisaid/21L/oceania/all-time?f_region=Oceania)                     |
 **South America** | [21L/south-america/1m](/ncov/gisaid/21L/south-america/1m?f_region=South%20America) | [21L/south-america/2m](/ncov/gisaid/21L/south-america/2m?f_region=South%20America) | [21L/south-america/6m](/ncov/gisaid/21L/south-america/6m?f_region=South%20America) | [21L/south-america/all-time](/ncov/gisaid/21L/south-america/all-time?f_region=South%20America) |
 
-Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
+Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). MLR lineage fitness is fetched from the frequency analysis at [nextstrain.org/sars-cov-2/forecasts](https://nextstrain.org/sars-cov-2/forecasts) and indicates the relative growth advantage of circulating Pango lineages. Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
 
 We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata made available through [GISAID](https://gisaid.org) on which this research is based. An attribution table is available by clicking on "Download Data" at the bottom of the page and then clicking on "Acknowledgments" in the resulting dialog box.
 
diff --git a/nextstrain_profiles/nextstrain-open/nextstrain_description.md b/nextstrain_profiles/nextstrain-open/nextstrain_description.md
index 512158e4b..604d90b6a 100644
--- a/nextstrain_profiles/nextstrain-open/nextstrain_description.md
+++ b/nextstrain_profiles/nextstrain-open/nextstrain_description.md
@@ -14,7 +14,7 @@ There are millions of complete SARS-CoV-2 genomes available on open databases an
 | **Oceania**       | [oceania/1m](/ncov/open/oceania/1m?f_region=Oceania)                     | [oceania/2m](/ncov/open/oceania/2m?f_region=Oceania)                     | [oceania/6m](/ncov/open/oceania/6m?f_region=Oceania)                     | [oceania/all-time](/ncov/open/oceania/all-time?f_region=Oceania)                     |
 | **South America** | [south-america/1m](/ncov/open/south-america/1m?f_region=South%20America) | [south-america/2m](/ncov/open/south-america/2m?f_region=South%20America) | [south-america/6m](/ncov/open/south-america/6m?f_region=South%20America) | [south-america/all-time](/ncov/open/south-america/all-time?f_region=South%20America) |
 
-Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
+Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of 8 &times; 10^-4 subs per site per year. Mutational fitness is calculated using results from [Obermeyer et al](https://doi.org/10.1126/science.abm1208). MLR lineage fitness is fetched from the frequency analysis at [nextstrain.org/sars-cov-2/forecasts](https://nextstrain.org/sars-cov-2/forecasts) and indicates the relative growth advantage of circulating Pango lineages. Immune escape vs BA.2 is estimated using the [RBD antibody escape calculator](https://jbloomlab.github.io/SARS2-RBD-escape-calc/) maintained by Jesse Bloom. Full details on bioinformatic processing can be found [here](https://github.com/nextstrain/ncov).
 
 The analysis on this page uses sequence data from NCBI GenBank and the Robert Koch Institute (RKI). Data from NCBI Genbank follows [Open Data principles](https://opendatahandbook.org/guide/en/what-is-open-data/), such that we can make input data and intermediate files available for further analysis. Open Data is data that can be freely used, re-used and redistributed by anyone - subject only, at most, to the requirement to attribute and sharealike. [Data from RKI](https://github.com/robert-koch-institut/SARS-CoV-2-Sequenzdaten_aus_Deutschland) is provided via a [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/), which is useable under similar terms. Please be aware that not all regions are well represented in open databases and some of the above trees might lack recent data from particular geographic regions. Some UK metadata is augmented with data available from [COG-UK](https://www.cogconsortium.uk/priority-areas/data-linkage-analysis/public-data-analysis/) (via CLIMB).
 
diff --git a/scripts/fetch_mlr_lineage_fitness.py b/scripts/fetch_mlr_lineage_fitness.py
index 6c0bb39fd..6610663b4 100644
--- a/scripts/fetch_mlr_lineage_fitness.py
+++ b/scripts/fetch_mlr_lineage_fitness.py
@@ -1,16 +1,25 @@
-from augur.io import read_metadata
-from augur.utils import write_json
 import requests
 import json
 import pandas as pd
 import argparse
 import math
+from augur.io import read_metadata
+from augur.utils import write_json
+
+# This script currently assumes a match on Pango lineage. Lineage fitness comes from
+# https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json
+# which backs the live estimates on https://nextstrain.org/sars-cov-2/forecasts.
+# Those estimates use the "Nextclade_pango" metadata label to derive sequence
+# counts from GISAID data and estimate relative growth advantages across collapsed
+# Pango lineages. They are most relevant for the 1m, 2m and 6m builds, but nothing
+# breaks for the all-time builds. It would be possible to swap this to key on
+# clade instead, but the greater detail of lineages seems better in this case.
 
 # Set up argument parser
-parser = argparse.ArgumentParser(description="Process metadata and growth advantage data.")
-parser.add_argument("--metadata", required=True, help="Path to the metadata file (TSV or compressed .tsv.xz format).")
-parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="List of columns to use as identifiers in the metadata file.")
-parser.add_argument("--metadata-clade-attribute", default="Nextclade_pango", help="Matched attribute to MLR variants.")
+parser = argparse.ArgumentParser(description="Fetch MLR lineage fitness and match to strain-level metadata")
+parser.add_argument("--metadata", required=True, help="Path to the metadata TSV")
+parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="List of columns to use as identifiers in the metadata file")
+parser.add_argument("--metadata-clade-attribute", default="Nextclade_pango", help="Matched attribute to MLR variants")
 parser.add_argument("--mlr-url", default="https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json", help="URL to fetch the forecasts JSON data.")
 parser.add_argument("--output-node-data", required=True, help="Path to save the output JSON node data.")
 
@@ -50,9 +59,6 @@ def fetch_growth_advantages(mlr_url):
     else:
         metadata[args.metadata_clade_attribute] = math.nan
 
-    # Output rows with matched data
-    print(metadata.head())  # Display the first few rows as an example
-
     # Create a node data object with growth advantages
     node_data = {}
     for index, record in metadata.iterrows():