From 732e38e51211dbe1df3d1f139266871d7224169f Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Fri, 14 Feb 2025 12:08:15 +1300
Subject: [PATCH] H5N1 D1.1 genome build

This builds off the GenoFLU metadata added in previous commits to build
a D1.1-specific build. We reuse ~all of the cattle-outbreak machinery
but at this time only build the genome trees.

Due to the SRA data having limited geographic and temporal metadata
we default to the divergence tree and hide the map panel. As of
2025-02-20 the build has 615 genomes of which 412 (2/3rds) have only
the collection year and country.
---
 Snakefile                                     |  19 ++-
 config/h5n1-d1.1.yaml                         | 131 +++++++++++++++
 .../h5n1-d1.1/auspice_config_h5n1-d1.1.json   | 151 ++++++++++++++++++
 config/h5n1-d1.1/description.md               |  14 ++
 .../h5n1-d1.1/dropped_strains_h5n1-d1.1.txt   |   0
 .../h5n1-d1.1/include_strains_h5n1-d1.1.txt   |   0
 ingest/rules/genoflu.smk                      |  26 +--
 rules/cattle-flu.smk                          |  14 +-
 8 files changed, 322 insertions(+), 33 deletions(-)
 create mode 100644 config/h5n1-d1.1.yaml
 create mode 100755 config/h5n1-d1.1/auspice_config_h5n1-d1.1.json
 create mode 100644 config/h5n1-d1.1/description.md
 create mode 100644 config/h5n1-d1.1/dropped_strains_h5n1-d1.1.txt
 create mode 100644 config/h5n1-d1.1/include_strains_h5n1-d1.1.txt

diff --git a/Snakefile b/Snakefile
index 95ef489..ca18c4f 100755
--- a/Snakefile
+++ b/Snakefile
@@ -63,6 +63,12 @@ files = rules.files.params
 
 
 def subtypes_by_subtype_wildcard(wildcards):
+
+    # TODO XXX - move to configs (started in https://github.com/nextstrain/avian-flu/pull/104, but
+    # we should make the entire query config-definable)
+    if wildcards.subtype == 'h5n1-d1.1':
+        return "genoflu in 'D1.1'"
+
     db = {
         'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'],
         'h5n1': ['h5n1'],
@@ -72,7 +78,7 @@ def subtypes_by_subtype_wildcard(wildcards):
     db['h5n1-cattle-outbreak'] = [*db['h5nx']]
     assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function "
         "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?")
-    return(db[wildcards.subtype])
+    return(f"subtype in [{', '.join([repr(s) for s in db[wildcards.subtype]])}]")
 
 class InvalidConfigError(Exception):
     pass
@@ -233,7 +239,7 @@ rule filter_sequences_by_subtype:
         augur filter \
             --sequences {input.sequences} \
             --metadata {input.metadata} \
-            --query "subtype in {params.subtypes!r}" \
+            --query {params.subtypes!r} \
             --output-sequences {output.sequences}
         """
 
@@ -248,7 +254,7 @@ rule filter_metadata_by_subtype:
         """
         augur filter \
             --metadata {input.metadata} \
-            --query "subtype in {params.subtypes!r}" \
+            --query {params.subtypes!r} \
             --output-metadata {output.metadata}
         """
 
@@ -633,9 +639,9 @@ rule auspice_config:
         import json
         with open(input.auspice_config) as fh:
             auspice_config = json.load(fh)
-        if wildcards.subtype == "h5n1-cattle-outbreak":
+        if wildcards.subtype in ["h5n1-cattle-outbreak", "h5n1-d1.1"]:
             if wildcards.segment == "genome":
-                auspice_config['display_defaults']['distance_measure'] = "num_date"
+                auspice_config['display_defaults']['distance_measure'] = "num_date" if wildcards.subtype == "h5n1-cattle-outbreak" else "div"
                 division_idx = next((i for i,c in enumerate(auspice_config['colorings']) if c['key']=='division'), None)
                 assert division_idx!=None, "Auspice config did not have a division coloring!"
                 auspice_config['colorings'].insert(division_idx+1, {
@@ -709,7 +715,8 @@ def auspice_name_to_wildcard_name(wildcards):
         return f"results/{subtype}/{segment}/{time}/auspice-dataset.json"
     if len(parts)==2:
         [subtype, segment] = parts
-        assert subtype=='h5n1-cattle-outbreak', "Only h5n1 builds produce an Auspice dataset without a time component in the filename"
+        assert subtype=='h5n1-cattle-outbreak' or subtype=='h5n1-d1.1', \
+            "Only h5n1 builds produce an Auspice dataset without a time component in the filename"
         return f"results/{subtype}/{segment}/default/auspice-dataset.json"
     raise Exception("Auspice JSON filename requested with an unexpected number of (underscore-separated) parts")
 
diff --git a/config/h5n1-d1.1.yaml b/config/h5n1-d1.1.yaml
new file mode 100644
index 0000000..fd4fcd4
--- /dev/null
+++ b/config/h5n1-d1.1.yaml
@@ -0,0 +1,131 @@
+#
+# Config for the H5N1 D1.1 genome build. Reuses the cattle-outbreak rules (rules/cattle-flu.smk).
+#
+custom_rules:
+  - "rules/cattle-flu.smk"
+
+
+#### Parameters which define which builds to produce via this config ###
+builds:
+  h5n1-d1.1: ''
+
+segments:
+  - genome
+
+
+
+# Input source(s) - See README.md for how to use local files instead and/or add additional inputs
+inputs:
+  - name: ncbi
+    metadata: s3://nextstrain-data/files/workflows/avian-flu/h5n1/metadata.tsv.zst
+    sequences: s3://nextstrain-data/files/workflows/avian-flu/h5n1/{segment}/sequences.fasta.zst
+
+#### Parameters which control large overarching aspects of the build
+# Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down
+# as part of the workflow
+target_sequences_per_tree: 10_000
+
+
+#### Config files ####
+reference: config/h5n1/reference_h5n1_{segment}.gb  # use H5N1 references
+genome_reference: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb # use cattle-flu genome reference TODO XXX
+auspice_config: config/{subtype}/auspice_config_{subtype}.json
+colors: config/h5n1/colors_h5n1.tsv # use H5N1 colors
+lat_longs: config/h5n1/lat_longs_h5n1.tsv # use H5N1 lat-longs
+include_strains: config/{subtype}/include_strains_{subtype}.txt
+# use build-specific dropped strains (path resolved via {subtype}) for the genome tree
+dropped_strains: config/{subtype}/dropped_strains_{subtype}.txt
+clades_file: clade-labeling/h5n1-clades.tsv # use H5N1 clades
+description: config/{subtype}/description.md
+
+
+#### Rule-specific parameters ####
+filter:
+  min_length:
+    FALLBACK:
+      pb2: 2100
+      pb1: 2100
+      pa: 2000
+      ha: 1600
+      np: 1400
+      na: 1270
+      mp: 900
+      ns: 800
+
+  min_date:
+    FALLBACK: 2024
+
+  group_by:
+    FALLBACK: false # no grouping during filter
+
+  exclude_where:
+    FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
+
+
+refine:
+  coalescent: const
+  date_inference: marginal
+
+  genome_clock_filter_iqd:
+    FALLBACK: 6
+  clock_filter_iqd:
+    FALLBACK: false
+
+  root:
+    FALLBACK: false
+
+  # The genome tree root is set to `best` here rather than to a named outgroup strain.
+  # P.S. If this is changed to a specific strain, make sure that strain is force included via augur filter --include
+  # (This isn't needed for the segment builds as we include a large enough time span to root via the clock)
+  genome_root:
+      FALLBACK: best
+
+  segment_lengths:
+    FALLBACK:
+      {'pb2': 2341, 'pb1': 2341, 'pa': 2233, 'ha': 1760, 'np': 1565, 'na': 1458, 'mp': 1027, 'ns': 865}
+
+  __clock_std_dev:  &clock_std_dev  0.00211 # YAML anchor so we can reference this value below
+
+  clock_rates:
+    FALLBACK:
+      # The rates for the 8 segments are taken from the GISAID H5N1/2y config
+      pb2: [0.00287, *clock_std_dev]
+      pb1: [0.00264, *clock_std_dev]
+      pa: [0.00248, *clock_std_dev]
+      ha: [0.00455, *clock_std_dev]
+      np: [0.00252, *clock_std_dev]
+      na: [0.00349, *clock_std_dev]
+      mp: [0.00191, *clock_std_dev]
+      ns: [0.00249, *clock_std_dev]
+      # the genome clock rate is calculated by a function in the snakemake pipeline
+      # using the segment rates weighted by their lengths
+
+ancestral:
+  inference: joint
+  root_seq:
+    FALLBACK: false
+  genome_root_seq:
+    FALLBACK: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb
+
+traits:
+  # genome build has different parameters...
+  genome_columns:
+    FALLBACK: division
+  genome_sampling_bias_correction:
+    FALLBACK: 5
+
+  # segment builds:
+  columns:
+    FALLBACK: region country # same as GISAID H5N1 builds
+  sampling_bias_correction:
+    FALLBACK: false
+
+  # all builds
+  confidence:
+    FALLBACK: true
+
+export:
+  genome_title:
+    FALLBACK: false
+  title:
+    FALLBACK: false
diff --git a/config/h5n1-d1.1/auspice_config_h5n1-d1.1.json b/config/h5n1-d1.1/auspice_config_h5n1-d1.1.json
new file mode 100755
index 0000000..9f3f0e1
--- /dev/null
+++ b/config/h5n1-d1.1/auspice_config_h5n1-d1.1.json
@@ -0,0 +1,151 @@
+{
+  "title": "Full genome analysis of the ongoing influenza A/H5N1 D1.1 outbreak in North America",
+  "maintainers": [
+    {"name": "Moncla lab", "url": "https://lmoncla.github.io/monclalab/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "build_url": "https://github.com/nextstrain/avian-flu",
+  "data_provenance": [
+    {
+      "name": "USDA",
+      "url": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1102327"
+    },
+    {
+      "name": "Andersen Lab",
+      "url": "https://github.com/andersen-lab/avian-influenza/"
+    },
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "extensions": {
+    "nextclade": {
+      "pathogen": {
+        "schemaVersion":"3.0.0",
+        "defaultCds": "HA",
+        "cdsOrderPreference":[
+          "PB2",
+          "PB1",
+          "PA",
+          "HA",
+          "NP",
+          "NA",
+          "M1",
+          "M2",
+          "NS1",
+          "NS2"
+      ],
+      "attributes": {
+        "name": "H5N1 D1.1 Genome analysis",
+        "reference name": "concatenated ancestral sequences",
+        "reference accession": "none"
+        }
+      }
+    }
+  },
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "division",
+      "title": "Admin Division",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    },
+    {
+      "key": "subtype",
+      "title": "Subtype",
+      "type": "categorical"
+    },
+    {
+      "key": "genoflu",
+      "title": "GenoFLU",
+      "type": "categorical"
+    },
+    {
+      "key": "h5_label_clade",
+      "title": "Provisional LABEL Clade",
+      "type": "categorical"
+    },
+    {
+      "key": "furin_cleavage_motif",
+      "title": "Furin Cleavage Motif",
+      "type": "categorical"
+    },
+    {
+      "key": "cleavage_site_sequence",
+      "title": "Cleavage Site Sequence",
+      "type": "categorical"
+    },
+    {
+      "key": "author",
+      "title": "Authors",
+      "type": "categorical"
+    },
+    {
+      "key": "originating_lab",
+      "title": "Originating Lab",
+      "type": "categorical"
+    },
+    {
+      "key": "submitting_lab",
+      "title": "Submitting Lab",
+      "type": "categorical"
+    },
+    {
+      "key": "data_source",
+      "title": "Data Source",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "region",
+    "country",
+    "division"
+  ],
+  "display_defaults": {
+    "map_triplicate": false,
+    "color_by": "host",
+    "geo_resolution": "division",
+    "distance_measure": "div",
+    "panels": ["tree", "entropy"]
+  },
+  "filters": [
+    "host",
+    "region",
+    "country",
+    "division",
+    "subtype",
+    "author",
+    "originating_lab",
+    "submitting_lab",
+    "data_source"
+  ],
+  "metadata_columns": [
+    "genbank_accession",
+    "sra_accessions"
+  ]
+}
diff --git a/config/h5n1-d1.1/description.md b/config/h5n1-d1.1/description.md
new file mode 100644
index 0000000..8fb6cd3
--- /dev/null
+++ b/config/h5n1-d1.1/description.md
@@ -0,0 +1,14 @@
+We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain.
+
+Genomic data from the ongoing H5N1 outbreaks in the US was shared by the [National Veterinary Services Laboratories (NVSL)](https://www.aphis.usda.gov/labs/about-nvsl) of the [Animal and Plant Health Inspection Service (APHIS)](https://www.aphis.usda.gov/) of the U.S. Department of Agriculture (USDA) in an open fashion to NCBI GenBank (consensus genomes and complete metadata) and to the SRA (raw reads with redacted metadata) in [BioProject PRJNA1102327](https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1102327). Other groups have contributed sequence data here, but the majority of viral genomes have been shared by the USDA. The Andersen Lab has assembled raw reads from this SRA BioProject and publicly shared consensus genomes to [GitHub](https://github.com/andersen-lab/avian-influenza). We thank the USDA for genomic data sharing and the Andersen Lab for sharing assembled consensus genomes.
+
+In this analysis, we've curated data from NCBI GenBank and merged this data with SRA data via the Andersen Lab GitHub repository. 
+We will make curated sequence & metadata files available shortly. Data source as GenBank vs SRA-via-Andersen-Lab is included in this metadata and is available as a [coloring to this page](?c=data_source). 
+
+### Limitations
+Importantly, SRA-derived genomes only have the year of collection (e.g. 2024-XX-XX or 2025-XX-XX) and "USA" as collection location. In this analysis, we've inferred collection date and collection location for these samples along with confidence in date and location; however, these must be treated with caution. We've added two colorings for geographic division: [one using inferred values](?c=division) and one only reporting [known values](?c=division_metadata). For these reasons we have toggled the map panel off by default.
+
+In addition to this D1.1-specific view, we have broader views of H5N1 evolution available as:
+ - [nextstrain.org/avian-flu/h5n1/ha/2y](https://nextstrain.org/avian-flu/h5n1/ha/2y)
+ - [nextstrain.org/avian-flu/h5n1/na/2y](https://nextstrain.org/avian-flu/h5n1/na/2y)
+ - etc...
diff --git a/config/h5n1-d1.1/dropped_strains_h5n1-d1.1.txt b/config/h5n1-d1.1/dropped_strains_h5n1-d1.1.txt
new file mode 100644
index 0000000..e69de29
diff --git a/config/h5n1-d1.1/include_strains_h5n1-d1.1.txt b/config/h5n1-d1.1/include_strains_h5n1-d1.1.txt
new file mode 100644
index 0000000..e69de29
diff --git a/ingest/rules/genoflu.smk b/ingest/rules/genoflu.smk
index d23d574..fcdc0c1 100644
--- a/ingest/rules/genoflu.smk
+++ b/ingest/rules/genoflu.smk
@@ -5,37 +5,23 @@ We are using a vendored version of <https://github.com/moncla-lab/GenoFLU-multi>
 which is built on top of USDA's GenoFLU <https://github.com/USDA-VS/GenoFLU>.
 """
 
-def genoflu_filter_args(wildcards):
-    # NOTE: it's crucial to get the quoting right here, of the following three strings on the command line
-    # "gisaid_clade=='2.3.4.4b'" gisaid_clade=='2.3.4.4b' and 'gisaid_clade==2.3.4.4b'
-    # only the first works.
-    # NOTE 2: This filtering may not be correct - see <https://github.com/nextstrain/avian-flu/pull/127#issuecomment-2669102995>
-    if wildcards.data_source=='fauna':
-        return "--query \"gisaid_clade=='2.3.4.4b'\""
-    return ""
-
-
 rule provision_genoflu_sequences:
     """
     GenoFLU will consume all the FASTA files in the provided directory, so we set up a
-    new directory with (filtered) FASTA files we want to call. Note that we use the
-    final/results sequences as _inputs_ here, because GenoFLU isn't going to modify
-    those in any way, and as such they are marked as temporary.
+    new directory with FASTA files we want to call. (We use the final ("results") sequences
+    here because the sequences themselves aren't modified by GenoFLU.)
+
+    The current implementation is a simple file copy; however, we may wish to use `augur filter`
+    in the future to restrict the samples we process.
     """
     input:
         sequences = "{data_source}/results/sequences_{segment}.fasta",
-        metadata = "{data_source}/data/metadata_combined.tsv",
     output:
         sequences = temp("{data_source}/data/genoflu/sequences_{segment}.fasta"),
     threads: 1
-    params:
-        query = genoflu_filter_args
     shell:
         """
-        augur filter \
-            {params.query} \
-            --metadata {input.metadata} --sequences {input.sequences} \
-            --output-sequences {output.sequences}
+        cp {input.sequences} {output.sequences}
         """
 
 rule run_genoflu:
diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk
index 2c0c762..5553277 100644
--- a/rules/cattle-flu.smk
+++ b/rules/cattle-flu.smk
@@ -18,7 +18,7 @@ rule filter_segments_for_genome:
         min_date = "2024-01-01",
         query = 'region == "North America"'
     wildcard_constraints:
-        subtype = 'h5n1-cattle-outbreak',
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
         time = 'default',
     log: "logs/{subtype}/{segment}/{time}/filtered_{genome_seg}.txt",
@@ -43,7 +43,7 @@ rule align_segments_for_genome:
     output:
         alignment = "results/{subtype}/{segment}/{time}/aligned_{genome_seg}.fasta"
     wildcard_constraints:
-        subtype = 'h5n1-cattle-outbreak',
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
         time = 'default',
     threads:
@@ -68,7 +68,7 @@ rule join_segments:
     output:
         alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
     wildcard_constraints:
-        subtype = 'h5n1-cattle-outbreak',
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
         time = 'default',
     shell:
@@ -85,7 +85,7 @@ rule genome_metadata:
     output:
         metadata = temp("results/{subtype}/{segment}/{time}/metadata_intermediate.tsv")
     wildcard_constraints:
-        subtype = 'h5n1-cattle-outbreak',
+        subtype = 'h5n1-cattle-outbreak|h5n1-d1.1',
         segment = 'genome',
         time = 'default',
     shell:
@@ -116,7 +116,7 @@ rule add_metadata_columns_to_show_non_inferred_values:
     output:
         metadata = "results/{subtype}/{segment}/{time}/metadata.tsv"
     wildcard_constraints:
-        subtype="h5n1-cattle-outbreak",
+        subtype='h5n1-cattle-outbreak|h5n1-d1.1',
         segment="genome",
         time="default",
     params:
@@ -138,7 +138,7 @@ rule prune_tree:
         tree = "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk",
         node_data = "results/{subtype}/{segment}/{time}/outbreak-clade-strains-in-genome-tree.json",
     wildcard_constraints:
-        subtype="h5n1-cattle-outbreak",
+        subtype='h5n1-cattle-outbreak|h5n1-d1.1',
         time="default",
     shell:
         """
@@ -163,7 +163,7 @@ rule colors_genome:
     params:
         duplications = "division=division_metadata",
     wildcard_constraints:
-        subtype="h5n1-cattle-outbreak",
+        subtype='h5n1-cattle-outbreak|h5n1-d1.1',
         time="default",
     shell:
         """