WIP H5N1 D1.1 genome build

nextstrain · Feb 13, 2025 · 3e26692 · 3e26692
1 parent d597b2d
commit 3e26692
Show file tree

Hide file tree

Showing 9 changed files with 29,279 additions and 11 deletions.
diff --git a/Snakefile b/Snakefile
@@ -76,6 +76,10 @@ files = rules.files.params
 
 
 def subtypes_by_subtype_wildcard(wildcards):
+
+    if wildcards.subtype == 'h5n1-d1.1':
+        return "genoflu in 'D1.1'"
+
     db = {
         'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'],
         'h5n1': ['h5n1'],
@@ -85,7 +89,7 @@ def subtypes_by_subtype_wildcard(wildcards):
     db['h5n1-cattle-outbreak'] = [*db['h5nx']]
     assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function "
         "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?")
-    return(db[wildcards.subtype])
+    return(f"subtype in {' '.join([repr(s) for s in db[wildcards.subtype]])}")
 
 rule download_sequences:
     output:
@@ -110,7 +114,25 @@ rule download_metadata:
         """
 
 
+rule input_metadata_with_genoflu_temporary:
+    input:
+        genoflu = f"config/h5n1-d1.1/genoflu_temporary.tsv",
+        metadata = f"data/{S3_SRC['name']}/metadata.tsv",
+    output:
+        metadata = f"data/{S3_SRC['name']}/metadata_genoflu.tsv",
+    shell:
+        r"""
+        python3 scripts/genoflu-constellation.py \
+            --metadata {input.metadata} --genoflu {input.genoflu} \
+            1> {output.metadata}
+        """
+
 def input_metadata(wildcards):
+
+    # Special-case D1.1 builds until the GenoFLU data is available in the ingest outputs
+    if wildcards.subtype=='h5n1-d1.1':
+        return rules.input_metadata_with_genoflu_temporary.output.metadata
+
     if S3_SRC:
         return f"data/{S3_SRC['name']}/metadata.tsv",
     elif LOCAL_INGEST:
@@ -138,7 +160,7 @@ rule filter_sequences_by_subtype:
         augur filter \
             --sequences {input.sequences} \
             --metadata {input.metadata} \
-            --query "subtype in {params.subtypes!r}" \
+            --query {params.subtypes!r} \
             --output-sequences {output.sequences}
         """
 
@@ -153,7 +175,7 @@ rule filter_metadata_by_subtype:
         """
         augur filter \
             --metadata {input.metadata} \
-            --query "subtype in {params.subtypes!r}" \
+            --query {params.subtypes!r} \
             --output-metadata {output.metadata}
         """
 
@@ -614,7 +636,8 @@ def auspice_name_to_wildcard_name(wildcards):
         return f"results/{subtype}/{segment}/{time}/auspice-dataset.json"
     if len(parts)==2:
         [subtype, segment] = parts
-        assert subtype=='h5n1-cattle-outbreak', "Only h5n1 builds produce an Auspice dataset without a time component in the filename"
+        assert subtype=='h5n1-cattle-outbreak' or subtype=='h5n1-d1.1', \
+            "Only h5n1 builds produce an Auspice dataset without a time component in the filename"
         return f"results/{subtype}/{segment}/default/auspice-dataset.json"
     raise Exception("Auspice JSON filename requested with an unexpected number of (underscore-separated) parts")
 

diff --git a/config/h5n1-d1.1.yaml b/config/h5n1-d1.1.yaml
@@ -0,0 +1,133 @@
+#
+# Config for the (WIP) H5N1 D1.1 genome build, selected via the GenoFLU genotype 'D1.1'
+#
+custom_rules:
+  - "rules/cattle-flu.smk"
+
+
+#### Parameters which define which builds to produce via this config ###
+builds:
+  h5n1-d1.1: ''
+
+segments:
+  - genome
+
+
+#### Parameters which define the input source ####
+s3_src:
+  name: gisaid
+  metadata: s3://nextstrain-data-private/files/workflows/avian-flu/metadata.tsv.zst
+  sequences: s3://nextstrain-data-private/files/workflows/avian-flu/{segment}/sequences.fasta.zst
+local_ingest: false
+# P.S. To use local ingest files, comment out s3_src and change to local_ingest: joined-ncbi (e.g.)
+
+
+#### Parameters which control large overarching aspects of the build
+# Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down
+# as part of the workflow
+target_sequences_per_tree: 10_000
+
+
+#### Config files ####
+reference: config/h5n1/reference_h5n1_{segment}.gb  # use H5N1 references
+genome_reference: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb # use cattle-flu genome reference TODO XXX
+auspice_config: config/{subtype}/auspice_config_{subtype}.json
+colors: config/h5n1/colors_h5n1.tsv # use H5N1 colors
+lat_longs: config/h5n1/lat_longs_h5n1.tsv # use H5N1 lat-longs
+include_strains: config/{subtype}/include_strains_{subtype}.txt
+# use build-specific ({subtype}) dropped strains; this config only builds the genome tree
+dropped_strains: config/{subtype}/dropped_strains_{subtype}.txt
+clades_file: clade-labeling/h5n1-clades.tsv # use H5N1 clades
+description: config/{subtype}/description.md
+
+
+#### Rule-specific parameters ####
+filter:
+  min_length:
+    FALLBACK:
+      pb2: 2100
+      pb1: 2100
+      pa: 2000
+      ha: 1600
+      np: 1400
+      na: 1270
+      mp: 900
+      ns: 800
+
+  min_date:
+    FALLBACK: 2024
+
+  group_by:
+    FALLBACK: false # no grouping during filter
+
+  exclude_where:
+    FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2
+
+
+refine:
+  coalescent: const
+  date_inference: marginal
+
+  genome_clock_filter_iqd:
+    FALLBACK: 6
+  clock_filter_iqd:
+    FALLBACK: false
+
+  root:
+    FALLBACK: false
+
+  # For the genome only we use the closest outgroup as the root
+  # P.S. Make sure this strain is force included via augur filter --include
+  # (This isn't needed for the segment builds as we include a large enough time span to root via the clock)
+  genome_root:
+      FALLBACK: best
+
+  segment_lengths:
+    FALLBACK:
+      {'pb2': 2341, 'pb1': 2341, 'pa': 2233, 'ha': 1760, 'np': 1565, 'na': 1458, 'mp': 1027, 'ns': 865}
+
+  __clock_std_dev:  &clock_std_dev  0.00211 # YAML anchor so we can reference this value below
+
+  clock_rates:
+    FALLBACK:
+      # The rates for the 8 segments are taken from the GISAID H5N1/2y config
+      pb2: [0.00287, *clock_std_dev]
+      pb1: [0.00264, *clock_std_dev]
+      pa: [0.00248, *clock_std_dev]
+      ha: [0.00455, *clock_std_dev]
+      np: [0.00252, *clock_std_dev]
+      na: [0.00349, *clock_std_dev]
+      mp: [0.00191, *clock_std_dev]
+      ns: [0.00249, *clock_std_dev]
+      # the genome clock rate is calculated by a function in the snakemake pipeline
+      # using the segment rates weighted by their lengths
+
+ancestral:
+  inference: joint
+  root_seq:
+    FALLBACK: false
+  genome_root_seq:
+    FALLBACK: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb
+
+traits:
+  # genome build has different parameters...
+  genome_columns:
+    FALLBACK: division
+  genome_sampling_bias_correction:
+    FALLBACK: 5
+
+  # segment builds:
+  columns:
+    FALLBACK: region country # same as GISAID H5N1 builds
+  sampling_bias_correction:
+    FALLBACK: false
+
+  # all builds
+  confidence:
+    FALLBACK: true
+
+export:
+  genome_title:
+    FALLBACK: false
+  title:
+    FALLBACK: false
diff --git a/config/h5n1-d1.1/auspice_config_h5n1-d1.1.json b/config/h5n1-d1.1/auspice_config_h5n1-d1.1.json
@@ -0,0 +1,140 @@
+{
+  "title": "Full genome analysis of the ongoing influenza A/H5N1 D1.1 outbreak in North America",
+  "maintainers": [
+    {"name": "Moncla lab", "url": "https://lmoncla.github.io/monclalab/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "build_url": "https://github.com/nextstrain/avian-flu",
+  "data_provenance": [
+    {
+      "name": "USDA",
+      "url": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1102327"
+    },
+    {
+      "name": "GISAID"
+    }
+  ],
+  "extensions": {
+    "nextclade": {
+      "pathogen": {
+        "schemaVersion":"3.0.0",
+        "defaultCds": "HA",
+        "cdsOrderPreference":[
+          "PB2",
+          "PB1",
+          "PA",
+          "HA",
+          "NP",
+          "NA",
+          "M1",
+          "M2",
+          "NS1",
+          "NS2"
+      ],
+      "attributes": {
+        "name": "H5N1 dairy cattle outbreak",
+        "reference name": "concatenated ancestral sequences",
+        "reference accession": "none"
+        }
+      }
+    }
+  },
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "division",
+      "title": "Admin Division",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    },
+    {
+      "key": "subtype",
+      "title": "Subtype",
+      "type": "categorical"
+    },
+    {
+      "key": "h5_label_clade",
+      "title": "Provisional LABEL Clade",
+      "type": "categorical"
+    },
+    {
+      "key": "furin_cleavage_motif",
+      "title": "Furin Cleavage Motif",
+      "type": "categorical"
+    },
+    {
+      "key": "cleavage_site_sequence",
+      "title": "Cleavage Site Sequence",
+      "type": "categorical"
+    },
+    {
+      "key": "author",
+      "title": "Authors",
+      "type": "categorical"
+    },
+    {
+      "key": "originating_lab",
+      "title": "Originating Lab",
+      "type": "categorical"
+    },
+    {
+      "key": "submitting_lab",
+      "title": "Submitting Lab",
+      "type": "categorical"
+    },
+    {
+      "key": "data_source",
+      "title": "Data Source",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "region",
+    "country",
+    "division"
+  ],
+  "display_defaults": {
+    "map_triplicate": false,
+    "color_by": "host",
+    "geo_resolution": "division",
+    "distance_measure": "num_date"
+  },
+  "filters": [
+    "host",
+    "region",
+    "country",
+    "division",
+    "subtype",
+    "author",
+    "originating_lab",
+    "submitting_lab",
+    "data_source"
+  ],
+  "metadata_columns": [
+    "genbank_accession",
+    "sra_accessions"
+  ]
+}
diff --git a/config/h5n1-d1.1/description.md b/config/h5n1-d1.1/description.md
diff --git a/config/h5n1-d1.1/dropped_strains_h5n1-d1.1.txt b/config/h5n1-d1.1/dropped_strains_h5n1-d1.1.txt