From 6c6356d4e775db4f5d3c0b808214162e5814b6e7 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 25 Feb 2025 10:59:08 +1300 Subject: [PATCH] Filter cattle-outbreak using GenoFLU B3.13 The previous approach relied on broad filtering -- minimum date of 2024, region of North America -- a hardcoded exclude list and a clock filter. As the diversity of sequences increased the clock-filter became less effective and ultimately dropped all the desired strains. See the issue linked below for more. We now use GenoFLU constellations and can relax the date and region filters accordingly. This relaxation didn't result in any non-North-American samples included but did add one B3.13 genome from 2023: 'A/Goose/USA/23-038138-001-original/2023'. These changes to filtering will also apply to the D1.1 builds, but testing indicates no changes. The segment-level approach is not addressed here, but could be similarly adjusted to use the GenoFLU matching on the segment level. Specifically, adds the segment-level annotations and the expanded constellation is: B3.13 = PA:ea1, HA:ea1, PB1:am4, MP:ea1, NA:ea1, PB2:am2.2, NP:am8, NS:am1.1 Closes https://github.com/nextstrain/avian-flu/issues/133 --- Snakefile | 5 ++- .../auspice_config_h5n1-cattle-outbreak.json | 5 +++ .../dropped_strains_h5n1-cattle-outbreak.txt | 31 +++---------------- rules/cattle-flu.smk | 5 --- 4 files changed, 13 insertions(+), 33 deletions(-) diff --git a/Snakefile b/Snakefile index ca18c4f..563f01e 100755 --- a/Snakefile +++ b/Snakefile @@ -64,10 +64,14 @@ files = rules.files.params def subtypes_by_subtype_wildcard(wildcards): + # TODO - this function does more than strictly subtype filtering as certain builds filter to + # GenoFLU constellation, and in the future this may be expanded. We should rename the function! 
# TODO XXX - move to configs (started in https://github.com/nextstrain/avian-flu/pull/104 but # We should make the entire query config-definable) if wildcards.subtype == 'h5n1-d1.1': return "genoflu in 'D1.1'" + elif wildcards.subtype == 'h5n1-cattle-outbreak': + return "genoflu in 'B3.13'" db = { 'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'], @@ -75,7 +79,6 @@ def subtypes_by_subtype_wildcard(wildcards): 'h7n9': ['h7n9'], 'h9n2': ['h9n2'], } - db['h5n1-cattle-outbreak'] = [*db['h5nx']] assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function " "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?") return(f"subtype in [{', '.join([repr(s) for s in db[wildcards.subtype]])}]") diff --git a/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json b/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json index 09ac49e..e5b0e44 100755 --- a/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json +++ b/config/h5n1-cattle-outbreak/auspice_config_h5n1-cattle-outbreak.json @@ -79,6 +79,11 @@ "title": "Subtype", "type": "categorical" }, + { + "key": "genoflu", + "title": "GenoFLU constellation", + "type": "categorical" + }, { "key": "h5_label_clade", "title": "Provisional LABEL Clade", diff --git a/config/h5n1-cattle-outbreak/dropped_strains_h5n1-cattle-outbreak.txt b/config/h5n1-cattle-outbreak/dropped_strains_h5n1-cattle-outbreak.txt index 647eb03..5da89e0 100755 --- a/config/h5n1-cattle-outbreak/dropped_strains_h5n1-cattle-outbreak.txt +++ b/config/h5n1-cattle-outbreak/dropped_strains_h5n1-cattle-outbreak.txt @@ -9,6 +9,10 @@ A/Cattle/USA/24-009027-002-v/2024 # Duplicate of A/cattle/Michigan/24-009027-002 A/PEFA/USA/24-005915-001-original/2024 # Duplicate of A/Peregrinefalcon/California/24-005915-001/2024 A/Skunk/USA/24-006483-001-original/2024 # Duplicate of A/skunk/NewMexico/24-006483-001/2024 +# Many of the following 
exclude strains were added prior to filtering on GenoFLU +# constellation B3.13 and thus may not be applicable any more. We can clean these up +# in the future if desired. + # Dropping these strains from include due to excess private mutations A/cattle/NorthCarolina/24-010327-002/2024 A/cattle/Texas/24-009495-007/2024 @@ -246,30 +250,3 @@ A/westerngull/California/24-004708-001/2024 A/WesternGull/USA/24-004708-001-original/2024 A/WesternSandpiper/USA/24-004707-001-original/2024 A/woodduck/NorthCarolina/W24-026/2024 - -# D1.1 spillover -A/StripedSkunk/WA/W240530074-2-1/2024 -A/CATTLE/USA/25-002645-006/2025 -A/CATTLE/USA/25-002645-005/2025 -A/CATTLE/USA/25-002645-004/2025 -A/CATTLE/USA/25-002645-003/2025 -A/chicken/AR/24-037983-003-original/2024 -A/chicken/AR/24-037983-001-original/2024 -A/chicken/AR/24-037983-002-original/2024 -A/chicken/MN/24-038159-002-original/2024 -A/Duck/MN/24-038159-001-original/2024 -A/goose/MN/24-038159-004-original/2024 -A/Turkey/MN/24-035355-002-original/2024 -A/Turkey/MN/24-035355-001-original/2024 -A/Turkey/MN/24-035521-001-original/2024 -A/Turkey/MN/24-035524-003-original/2024 -A/Turkey/MN/24-035524-004-original/2024 -A/Turkey/MN/24-035524-001-original/2024 -A/Turkey/MN/24-035524-002-original/2024 -A/Turkey/MN/24-034932-001-original/2024 -A/Turkey/MN/24-034932-003-original/2024 -A/Turkey/MN/24-034932-002-original/2024 -A/Turkey/MN/24-035355-003-original/2024 -A/Turkey/MN/24-036792-002-original/2024 -A/Turkey/MN/24-036792-003-original/2024 -A/Turkey/MN/24-036792-001-original/2024 \ No newline at end of file diff --git a/rules/cattle-flu.smk b/rules/cattle-flu.smk index f737b90..5197582 100644 --- a/rules/cattle-flu.smk +++ b/rules/cattle-flu.smk @@ -14,9 +14,6 @@ rule filter_segments_for_genome: exclude = config['dropped_strains'], output: sequences = "results/{subtype}/{segment}/{time}/filtered_{genome_seg}.fasta" - params: - min_date = "2024-01-01", - query = 'region == "North America"' wildcard_constraints: subtype = 
'h5n1-cattle-outbreak|h5n1-d1.1', segment = 'genome', @@ -29,8 +26,6 @@ rule filter_segments_for_genome: --metadata {input.metadata} \ --include {input.include} \ --exclude {input.exclude} \ - --min-date {params.min_date} \ - --query {params.query:q} \ --output-log {log} \ --output-sequences {output.sequences} """