nextstrain · trvrb · Apr 23, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 8, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -8,4 +8,4 @@ jobs:
   ci:
     uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
     with:
-      build-args: auspice/flu_avian_h5n1_ha.json
+      build-args: auspice/avian-flu_h5n1_ha_all-time.json
diff --git a/Snakefile b/Snakefile
@@ -1,15 +1,24 @@
-SUBTYPES = ["h5nx","h5n1","h9n2","h7n9"]
+SUBTYPES = ["h5nx","h5n1","h7n9","h9n2"]  # only h5nx and h5n1 also get a "2y" build; see all_targets()
 SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
+TIME =     ["all-time","2y"]
 
 path_to_fauna = '../fauna'
 
+
+def all_targets():
+    return [
+        *expand("auspice/avian-flu_{subtype}_{segment}_{time}.json", subtype=["h5nx","h5n1"], segment=SEGMENTS,time=TIME),
+        *expand("auspice/avian-flu_{subtype}_{segment}_{time}.json", subtype=['h7n9', 'h9n2'], segment=SEGMENTS,time=['all-time'])
+    ]
+
 rule all:
     input:
-        auspice_json = expand("auspice/flu_avian_{subtype}_{segment}.json", subtype=SUBTYPES, segment=SEGMENTS)
+        auspice_json = all_targets()
 
 rule files:
     params:
         dropped_strains = "config/dropped_strains_{subtype}.txt",
+        include_strains = "config/include_strains_{subtype}.txt",
         reference = "config/reference_{subtype}_{segment}.gb",
         colors = "config/colors_{subtype}.tsv",
         lat_longs = "config/lat_longs_{subtype}.tsv",
@@ -19,6 +28,7 @@ rule files:
 
 files = rules.files.params
 
+
 def download_by(w):
     db = {'h5nx': 'subtype:h5n1,h5n2,h5n3,h5n4,h5n5,h5n6,h5n7,h5n8,h5n9', 'h5n1': 'subtype:h5n1', 'h7n9': 'subtype:h7n9', 'h9n2': 'subtype:h9n2'}
     return(db[w.subtype])
@@ -28,32 +38,82 @@ def metadata_by_wildcards(w):
     return(md[w.subtype])
 
 def group_by(w):
-    gb = {'h5nx': 'subtype country year','h5n1': 'region country year', 'h7n9': 'division year', 'h9n2': 'country year'}
-    return gb[w.subtype]
-
-def sequences_per_group(w):
-    spg = {'h5nx': '5','h5n1': '10', 'h7n9': '70', 'h9n2': '10'}
-    return spg[w.subtype]
+    gb = {
+        'h5nx': {'all-time': 'subtype country year', '2y': 'subtype region month host'},
+        'h5n1': {'all-time': 'region country year', '2y': 'subtype region month host'},
+        'h7n9': {'all-time': 'division year'},
+        'h9n2': {'all-time': 'country year'}
+        }
+    return gb[w.subtype][w.time]
 
 def min_length(w):
     len_dict = {"pb2": 2100, "pb1": 2100, "pa": 2000, "ha":1600, "np":1400, "na":1270, "mp":900, "ns":800}
     length = len_dict[w.segment]
     return(length)
 
 def min_date(w):
-    date = {'h5nx':'1996','h5n1': '1996', 'h7n9': '2013', 'h9n2': '1966'}
-    return date[w.subtype]
+    date = {
+        'h5nx': {'all-time': '1996', '2y': '2Y'},
+        'h5n1': {'all-time': '1996', '2y': '2Y'},
+        'h7n9': {'all-time': '2013'},
+        'h9n2': {'all-time': '1966'}
+        }
+    return date[w.subtype][w.time]
 
 def traits_columns(w):
     traits = {'h5nx':'region','h5n1': 'region country', 'h7n9': 'country division', 'h9n2': 'region country'}
     return traits[w.subtype]
 
+def clock_rate(w):
+    clock_rates_h5nx = {
+        'pb2': '--clock-rate 0.00287',
+        'pb1': '--clock-rate 0.00267',
+        'pa': '--clock-rate 0.00238',
+        'ha': '--clock-rate 0.0048',
+        'np': '--clock-rate 0.0022',
+        'na': '--clock-rate 0.0028',
+        'mp': '--clock-rate 0.0017',
+        'ns': '--clock-rate 0.0017'
+        }
+
+    clock_rates_h5n1 = {
+        'pb2': '--clock-rate 0.00287',
+        'pb1': '--clock-rate 0.00264',
+        'pa': '--clock-rate 0.00248',
+        'ha': '--clock-rate 0.00455',
+        'np': '--clock-rate 0.00252',
+        'na': '--clock-rate 0.00349',
+        'mp': '--clock-rate 0.00191',
+        'ns': '--clock-rate 0.00249'
+        }
+
+    clock_rate = {
+        'h5nx': {'all-time':'', '2y': clock_rates_h5nx[w.segment]},
+        'h5n1': {'all-time':'', '2y': clock_rates_h5n1[w.segment]},
+        'h7n9': {'all-time':''},
+        'h9n2': {'all-time':''}
+        }
+
+    return clock_rate[w.subtype][w.time]
+
+
+def clock_rate_std_dev(w):
+    clock_rate_std_dev = {
+        'h5nx': {'all-time': '', '2y': '--clock-std-dev 0.00211'},
+        'h5n1': {'all-time': '', '2y': '--clock-std-dev 0.00211'},
+        'h7n9': {'all-time': ''},
+        'h9n2': {'all-time': ''}
+        }
+
+    return clock_rate_std_dev[w.subtype][w.time]
+
+
 rule download:
     message: "Downloading sequences from fauna"
     output:
         sequences = "data/{subtype}_{segment}.fasta"
     params:
-        fasta_fields = "strain virus accession collection_date region country division location host subtype originating_lab submitting_lab h5_clade",
+        fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
         download_by = download_by
     shell:
         """
@@ -65,7 +125,7 @@ rule download:
             --path data \
             --fstem {wildcards.subtype}_{wildcards.segment}
         """
-
+# Split the downloaded FASTA into a sequences file and a metadata TSV
 rule parse:
     message: "Parsing fasta into sequences and metadata"
     input:
@@ -74,8 +134,8 @@ rule parse:
         sequences = "results/sequences_{subtype}_{segment}.fasta",
         metadata = "results/metadata_{subtype}_{segment}.tsv"
     params:
-        fasta_fields =  "strain virus isolate_id date region country division location host subtype originating_lab submitting_lab h5_clade",
-        prettify_fields = "region country division location host originating_lab submitting_lab"
+        fasta_fields =  "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
+        prettify_fields = "region country division location host originating_lab submitting_lab authors PMID"
     shell:
         """
         augur parse \
@@ -105,33 +165,36 @@ rule filter:
     message:
         """
         Filtering to
-          - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+          - subsampling to {params.subsample_max_sequences} sequences
+          - grouping by {params.group_by}
           - excluding strains in {input.exclude}
           - samples with missing region and country metadata
           - excluding strains prior to {params.min_date}
         """
     input:
         sequences = rules.parse.output.sequences,
         metadata = metadata_by_wildcards,
-        exclude = files.dropped_strains
+        exclude = files.dropped_strains,
+        include = files.include_strains
     output:
-        sequences = "results/filtered_{subtype}_{segment}.fasta"
+        sequences = "results/filtered_{subtype}_{segment}_{time}.fasta"
     params:
         group_by = group_by,
-        sequences_per_group = sequences_per_group,
+        subsample_max_sequences = 3000,
         min_date = min_date,
         min_length = min_length,
-        exclude_where = "host=laboratoryderived host=ferret host=unknown host=other country=? region=?"
+        exclude_where = "host=laboratoryderived host=ferret host=unknown host=other host=host country=? region=? gisaid_clade=3C.2"
 
     shell:
         """
         augur filter \
             --sequences {input.sequences} \
             --metadata {input.metadata} \
             --exclude {input.exclude} \
+            --include {input.include} \
             --output {output.sequences} \
             --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
+            --subsample-max-sequences {params.subsample_max_sequences} \
             --min-date {params.min_date} \
             --exclude-where {params.exclude_where} \
             --min-length {params.min_length} \
@@ -148,7 +211,7 @@ rule align:
         sequences = rules.filter.output.sequences,
         reference = files.reference
     output:
-        alignment = "results/aligned_{subtype}_{segment}.fasta"
+        alignment = "results/aligned_{subtype}_{segment}_{time}.fasta"
     shell:
         """
         augur align \
@@ -165,7 +228,7 @@ rule tree:
     input:
         alignment = rules.align.output.alignment
     output:
-        tree = "results/tree-raw_{subtype}_{segment}.nwk"
+        tree = "results/tree-raw_{subtype}_{segment}_{time}.nwk"
     params:
         method = "iqtree"
     shell:
@@ -190,12 +253,14 @@ rule refine:
         alignment = rules.align.output,
         metadata = rules.parse.output.metadata
     output:
-        tree = "results/tree_{subtype}_{segment}.nwk",
-        node_data = "results/branch-lengths_{subtype}_{segment}.json"
+        tree = "results/tree_{subtype}_{segment}_{time}.nwk",
+        node_data = "results/branch-lengths_{subtype}_{segment}_{time}.json"
     params:
         coalescent = "const",
         date_inference = "marginal",
-        clock_filter_iqd = 4
+        clock_filter_iqd = 4,
+        clock = clock_rate,
+        clock_std_dev = clock_rate_std_dev
     shell:
         """
         augur refine \
@@ -208,6 +273,8 @@ rule refine:
             --coalescent {params.coalescent} \
             --date-confidence \
             --date-inference {params.date_inference} \
+            {params.clock} \
+            {params.clock_std_dev} \
             --clock-filter-iqd {params.clock_filter_iqd}
         """
 
@@ -217,7 +284,7 @@ rule ancestral:
         tree = rules.refine.output.tree,
         alignment = rules.align.output
     output:
-        node_data = "results/nt-muts_{subtype}_{segment}.json"
+        node_data = "results/nt-muts_{subtype}_{segment}_{time}.json"
     params:
         inference = "joint"
     shell:
@@ -237,7 +304,7 @@ rule translate:
         node_data = rules.ancestral.output.node_data,
         reference = files.reference
     output:
-        node_data = "results/aa-muts_{subtype}_{segment}.json"
+        node_data = "results/aa-muts_{subtype}_{segment}_{time}.json"
     shell:
         """
         augur translate \
@@ -253,7 +320,7 @@ rule traits:
         tree = rules.refine.output.tree,
         metadata = rules.parse.output.metadata
     output:
-        node_data = "results/traits_{subtype}_{segment}.json",
+        node_data = "results/traits_{subtype}_{segment}_{time}.json",
     params:
         columns = traits_columns,
     shell:
@@ -269,10 +336,10 @@ rule traits:
 rule cleavage_site:
     message: "determining sequences that harbor furin cleavage sites"
     input:
-        alignment = "results/aligned_{subtype}_ha.fasta"
+        alignment = "results/aligned_{subtype}_ha_{time}.fasta"
     output:
-        cleavage_site_annotations = "results/cleavage-site_{subtype}_ha.json",
-        cleavage_site_sequences = "results/cleavage-site-sequences_{subtype}_ha.json"
+        cleavage_site_annotations = "results/cleavage-site_{subtype}_ha_{time}.json",
+        cleavage_site_sequences = "results/cleavage-site-sequences_{subtype}_ha_{time}.json"
     shell:
         """
         python scripts/annotate-ha-cleavage-site.py \
@@ -297,7 +364,7 @@ rule export:
         auspice_config = files.auspice_config,
         description = files.description
     output:
-        auspice_json = "auspice/flu_avian_{subtype}_{segment}.json"
+        auspice_json = "auspice/avian-flu_{subtype}_{segment}_{time}.json"
     shell:
         """
         augur export v2 \

diff --git a/batch.py b/batch.py
@@ -30,8 +30,8 @@
         for segment in params.segments:
             call = ['nextstrain', 'build', '--aws-batch', '.', '-j 1']
             targets = []
-            targets.append('auspice/flu_avian_%s_%s_tree.json'%(subtype, segment))
-            targets.append('auspice/flu_avian_%s_%s_meta.json'%(subtype, segment))
+            targets.append('auspice/avian-flu_%s_%s_tree.json'%(subtype, segment))
+            targets.append('auspice/avian-flu_%s_%s_meta.json'%(subtype, segment))
             call.extend(targets)
             print(' '.join(call))
             log = open('logs/%s_%s.txt'%(subtype, segment), 'w')

diff --git a/clade-labeling/add-clades.py b/clade-labeling/add-clades.py
@@ -57,7 +57,7 @@ def annotate_metadata_file(metadata_infile, metadata_outfile, clade_assignments)
 				else:
 					clade = "?"
 					unknown_clades += 1
-					print("unknown clade for ", strain_name)
+					#print("unknown clade for ", strain_name)
 				new_line = line.strip() + "\t" + clade + "\n"
 
 			with open(metadata_outfile, "a") as outfile:

diff --git a/clade-labeling/h5n1-clades.tsv b/clade-labeling/h5n1-clades.tsv
@@ -17811,3 +17811,85 @@ A/caspiantern/Washington/23024996001original/2023	2.3.4.4b
 A/Chicken/Netherlands/24001946006010/2024	EA-nonGsGD
 A/Tuftedduck/Netherlands/1/2023	EA-nonGsGD
 A/chicken/Iraq/KVCL016/2015	?
+A/Ph/ST/44/2004	2.3.2
+A/chicken/Vietnam/Raho77232263/2023	2.3.2.1e
+A/duck/Vietnam/Raho723S2874/2023	2.3.2.1e
+A/duck/Vietnam/Raho723S2490/2023	2.3.2.1e
+A/Vietnam/KhanhhoaRV1005/2024	2.3.2.1e
+A/duck/Vietnam/Raho723S2875/2023	2.3.2.1e
+A/muteswan/Poland/MB055L2/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A045/2024	2.3.4.4b
+A/Fox/Bayern/i193/2023	2.3.4.4b
+A/feline/USA/24008764001original/2024	2.3.4.4b
+A/henharrier/Parnu/TA2126003/2021	2.3.4.4b
+A/chicken/CzechRepublic/3529/2024	2.3.4.4b
+A/domesticduck/Poland/H69T2/2024	2.3.4.4b
+A/goose/Bayern/wv196/2023	2.3.4.4b
+A/feline/USA/24009116005original/2024	2.3.4.4b
+A/muteswan/Poland/MB085N/2024	2.3.4.4b
+A/feline/USA/24009311006original/2024	2.3.4.4b
+A/chicken/CzechRepublic/47202/2024	2.3.4.4b
+A/chicken/Poland/H79T2/2024	2.3.4.4b
+A/muteswan/Poland/MB055L1/2024	2.3.4.4b
+A/dairycattle/Texas/24008749002v/2024	2.3.4.4b
+A/feline/USA/24009116004original/2024	2.3.4.4b
+A/domesticduck/Poland/H52T1K2/2024	2.3.4.4b
+A/chicken/CzechRepublic/35494/2024	2.3.4.4b
+A/chicken/CzechRepublic/31374/2024	2.3.4.4b
+A/feline/USA/24009116002original/2024	2.3.4.4b
+A/chicken/CzechRepublic/3744orig/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A031/2024	2.3.4.4b
+A/muteswan/Poland/MB055L4/2024	2.3.4.4b
+A/domesticduck/Poland/H57T1/2024	2.3.4.4b
+A/turkey/Poland/H40T2/2024	2.3.4.4b
+A/goose/Bayern/wv351/2023	2.3.4.4b
+A/domesticgoose/Poland/H49W/2024	2.3.4.4b
+A/turkey/Poland/H47T4/2024	2.3.4.4b
+A/turkey/Poland/H80T3/2024	2.3.4.4b
+A/Texas/37/2024	2.3.4.4b
+A/chicken/Poland/H45NM/2024	2.3.4.4b
+A/turkey/Poland/H75T1/2024	2.3.4.4b
+A/CommonBuzzard/GermanyBB/2024AI01490/2024	2.3.4.4b
+A/feline/USA/23037332001original/2023	2.3.4.4b
+A/feline/USA/24008850001original/2024	2.3.4.4b
+A/turkey/Poland/H68T2/2024	2.3.4.4b
+A/domesticduck/Poland/H66T1/2024	2.3.4.4b
+A/feline/USA/24009311004original/2024	2.3.4.4b
+A/turkey/Poland/H43T2/2024	2.3.4.4b
+A/chicken/CzechRepublic/354910/2024	2.3.4.4b
+A/chicken/CzechRepublic/34582/2024	2.3.4.4b
+A/NorthernFulmar/Netherlands/4/2024	2.3.4.4b
+A/largebilledcrow/Ishikawa/1702A010/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A044/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A033/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A032/2024	2.3.4.4b
+A/chicken/CzechRepublic/37442/2024	2.3.4.4b
+A/domesticduck/Poland/H60T4/2024	2.3.4.4b
+A/feline/USA/24008850002original/2024	2.3.4.4b
+A/Seagull/Parnu/TA21132845/2021	2.3.4.4b
+A/chicken/CzechRepublic/37443/2024	2.3.4.4b
+A/muteswan/Poland/MB070L1/2024	2.3.4.4b
+A/buzzard/Poland/MB098N/2024	2.3.4.4b
+A/chicken/CzechRepublic/31371/2024	2.3.4.4b
+A/turkey/Poland/H81T1/2024	2.3.4.4b
+A/domesticgoose/Poland/H48D1/2024	2.3.4.4b
+A/chicken/CzechRepublic/37441/2024	2.3.4.4b
+A/muteswan/CzechRepublic/4316/2024	2.3.4.4b
+A/buzzard/Poland/MB103N/2024	2.3.4.4b
+A/turkey/Poland/H63N/2024	2.3.4.4b
+A/chicken/CzechRepublic/35499/2024	2.3.4.4b
+A/largebilledcrow/Hokkaido/B114/2024	2.3.4.4b
+A/chicken/CzechRepublic/34583/2024	2.3.4.4b
+A/feline/USA/24008764002original/2024	2.3.4.4b
+A/eagle/Parnu/TA21118641/2021	2.3.4.4b
+A/turkey/Poland/H38T4/2024	2.3.4.4b
+A/largebilledcrow/Osaka/2702A043/2024	2.3.4.4b
+A/chicken/CzechRepublic/34584/2024	2.3.4.4b
+A/chicken/CzechRepublic/31375/2024	2.3.4.4b
+A/Northernshoveler/Jeju/D60/2023	2.3.4.4b
+A/turkey/Poland/H543NM/2023	2.3.4.4b
+A/largebilledcrow/Osaka/2702A030/2024	2.3.4.4b
+A/CommonBuzzard/GermanyHH/2024AI01435/2024	EA-nonGsGD
+A/BarnacleGoose/GermanySH/2024AI01487/2024	EA-nonGsGD
+A/Dunlin/GermanySH/2024AI01484/2024	EA-nonGsGD
+A/Fox/Bayern/WS113/2022	?