Skip to content

Commit

Permalink
WIP H5N1 D1.1 genome build
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed Feb 13, 2025
1 parent d597b2d commit 3e26692
Show file tree
Hide file tree
Showing 9 changed files with 29,279 additions and 11 deletions.
31 changes: 27 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ files = rules.files.params


def subtypes_by_subtype_wildcard(wildcards):

if wildcards.subtype == 'h5n1-d1.1':
return "genoflu in 'D1.1'"

db = {
'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'],
'h5n1': ['h5n1'],
Expand All @@ -85,7 +89,7 @@ def subtypes_by_subtype_wildcard(wildcards):
db['h5n1-cattle-outbreak'] = [*db['h5nx']]
assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function "
"`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?")
return(db[wildcards.subtype])
return(f"subtype in {' '.join([repr(s) for s in db[wildcards.subtype]])}")

rule download_sequences:
output:
Expand All @@ -110,7 +114,25 @@ rule download_metadata:
"""


# Temporary rule for the h5n1-d1.1 build: runs scripts/genoflu-constellation.py over the
# downloaded metadata together with a checked-in genoflu TSV and captures the script's
# stdout as a new metadata file.
# Presumably the output gains a `genoflu` column (the D1.1 subtype filter queries
# `genoflu in 'D1.1'`) -- confirm against the script.
# NOTE(review): the f-strings below are evaluated when the Snakefile is parsed, and
# `input_metadata` guards its own use of S3_SRC with `if S3_SRC:` -- confirm that
# S3_SRC['name'] is always available here (e.g. for local-ingest configs).
rule input_metadata_with_genoflu_temporary:
    input:
        # plain string: there is nothing to interpolate in this path
        genoflu = "config/h5n1-d1.1/genoflu_temporary.tsv",
        metadata = f"data/{S3_SRC['name']}/metadata.tsv",
    output:
        metadata = f"data/{S3_SRC['name']}/metadata_genoflu.tsv",
    shell:
        r"""
        python3 scripts/genoflu-constellation.py \
            --metadata {input.metadata} --genoflu {input.genoflu} \
            1> {output.metadata}
        """

def input_metadata(wildcards):

# Special-case D1.1 builds until the genoflu data is available in the ingest outputs
if wildcards.subtype=='h5n1-d1.1':
return rules.input_metadata_with_genoflu_temporary.output.metadata

if S3_SRC:
return f"data/{S3_SRC['name']}/metadata.tsv",
elif LOCAL_INGEST:
Expand Down Expand Up @@ -138,7 +160,7 @@ rule filter_sequences_by_subtype:
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--query "subtype in {params.subtypes!r}" \
--query {params.subtypes!r} \
--output-sequences {output.sequences}
"""

Expand All @@ -153,7 +175,7 @@ rule filter_metadata_by_subtype:
"""
augur filter \
--metadata {input.metadata} \
--query "subtype in {params.subtypes!r}" \
--query {params.subtypes!r} \
--output-metadata {output.metadata}
"""

Expand Down Expand Up @@ -614,7 +636,8 @@ def auspice_name_to_wildcard_name(wildcards):
return f"results/{subtype}/{segment}/{time}/auspice-dataset.json"
if len(parts)==2:
[subtype, segment] = parts
assert subtype=='h5n1-cattle-outbreak', "Only h5n1 builds produce an Auspice dataset without a time component in the filename"
assert subtype=='h5n1-cattle-outbreak' or subtype=='h5n1-d1.1', \
"Only h5n1 builds produce an Auspice dataset without a time component in the filename"
return f"results/{subtype}/{segment}/default/auspice-dataset.json"
raise Exception("Auspice JSON filename requested with an unexpected number of (underscore-separated) parts")

Expand Down
133 changes: 133 additions & 0 deletions config/h5n1-d1.1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#
# Config for the (work-in-progress) H5N1 D1.1 build. Only the `genome` segment is built.
#
custom_rules:
- "rules/cattle-flu.smk"


#### Parameters which define which builds to produce via this config ####
builds:
h5n1-d1.1: ''

segments:
- genome


#### Parameters which define the input source ####
s3_src:
name: gisaid
metadata: s3://nextstrain-data-private/files/workflows/avian-flu/metadata.tsv.zst
sequences: s3://nextstrain-data-private/files/workflows/avian-flu/{segment}/sequences.fasta.zst
local_ingest: false
# P.S. To use local ingest files, comment out s3_src and change to local_ingest: joined-ncbi (e.g.)


#### Parameters which control large overarching aspects of the build ####
# Set a high target_sequences_per_tree to capture all circulating strains, as they will be pruned down
# as part of the workflow.
# Written without a digit separator: `10_000` only resolves to an integer under YAML 1.1 rules;
# a YAML 1.2 core-schema loader would read it as the string "10_000".
target_sequences_per_tree: 10000


#### Config files ####
reference: config/h5n1/reference_h5n1_{segment}.gb # use H5N1 references
genome_reference: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb # use cattle-flu genome reference TODO XXX
auspice_config: config/{subtype}/auspice_config_{subtype}.json
colors: config/h5n1/colors_h5n1.tsv # use H5N1 colors
lat_longs: config/h5n1/lat_longs_h5n1.tsv # use H5N1 lat-longs
include_strains: config/{subtype}/include_strains_{subtype}.txt
# use build-specific dropped strains (the path resolves via {subtype}; this config only builds the genome tree)
dropped_strains: config/{subtype}/dropped_strains_{subtype}.txt
clades_file: clade-labeling/h5n1-clades.tsv # use H5N1 clades
description: config/{subtype}/description.md


#### Rule-specific parameters ####
filter:
min_length:
FALLBACK:
pb2: 2100
pb1: 2100
pa: 2000
ha: 1600
np: 1400
na: 1270
mp: 900
ns: 800

min_date:
FALLBACK: 2024

group_by:
FALLBACK: false # no grouping during filter

exclude_where:
FALLBACK: host=laboratoryderived host=ferret host=unknown host=other host=host gisaid_clade=3C.2


refine:
coalescent: const
date_inference: marginal

genome_clock_filter_iqd:
FALLBACK: 6
clock_filter_iqd:
FALLBACK: false

root:
FALLBACK: false

# Root setting used for the genome tree only.
# NOTE: `best` is a rooting method rather than a strain name. If this is changed to an outgroup strain,
# make sure that strain is force included via augur filter --include
# (This isn't needed for the segment builds as we include a large enough time span to root via the clock)
genome_root:
FALLBACK: best

segment_lengths:
FALLBACK:
{'pb2': 2341, 'pb1': 2341, 'pa': 2233, 'ha': 1760, 'np': 1565, 'na': 1458, 'mp': 1027, 'ns': 865}

__clock_std_dev: &clock_std_dev 0.00211 # YAML anchor so we can reference this value below

clock_rates:
FALLBACK:
# The rates for the 8 segments are taken from the GISAID H5N1/2y config
pb2: [0.00287, *clock_std_dev]
pb1: [0.00264, *clock_std_dev]
pa: [0.00248, *clock_std_dev]
ha: [0.00455, *clock_std_dev]
np: [0.00252, *clock_std_dev]
na: [0.00349, *clock_std_dev]
mp: [0.00191, *clock_std_dev]
ns: [0.00249, *clock_std_dev]
# the genome clock rate is calculated by a function in the snakemake pipeline
# using the segment rates weighted by their lengths

ancestral:
inference: joint
root_seq:
FALLBACK: false
genome_root_seq:
FALLBACK: config/h5n1-cattle-outbreak/h5_cattle_genome_root.gb

traits:
# genome build has different parameters...
genome_columns:
FALLBACK: division
genome_sampling_bias_correction:
FALLBACK: 5

# segment builds:
columns:
FALLBACK: region country # same as GISAID H5N1 builds
sampling_bias_correction:
FALLBACK: false

# all builds
confidence:
FALLBACK: true

export:
genome_title:
FALLBACK: false
title:
FALLBACK: false
140 changes: 140 additions & 0 deletions config/h5n1-d1.1/auspice_config_h5n1-d1.1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
{
"title": "Full genome analysis of the ongoing influenza A/H5N1 D1.1 outbreak in North America",
"maintainers": [
{"name": "Moncla lab", "url": "https://lmoncla.github.io/monclalab/"},
{"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
],
"build_url": "https://github.com/nextstrain/avian-flu",
"data_provenance": [
{
"name": "USDA",
"url": "https://www.ncbi.nlm.nih.gov/bioproject/PRJNA1102327"
},
{
"name": "GISAID"
}
],
"extensions": {
"nextclade": {
"pathogen": {
"schemaVersion":"3.0.0",
"defaultCds": "HA",
"cdsOrderPreference":[
"PB2",
"PB1",
"PA",
"HA",
"NP",
"NA",
"M1",
"M2",
"NS1",
"NS2"
],
"attributes": {
"name": "H5N1 dairy cattle outbreak",
"reference name": "concatenated ancestral sequences",
"reference accession": "none"
}
}
}
},
"colorings": [
{
"key": "gt",
"title": "Genotype",
"type": "categorical"
},
{
"key": "num_date",
"title": "Date",
"type": "continuous"
},
{
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "country",
"title": "Country",
"type": "categorical"
},
{
"key": "division",
"title": "Admin Division",
"type": "categorical"
},
{
"key": "host",
"title": "Host",
"type": "categorical"
},
{
"key": "subtype",
"title": "Subtype",
"type": "categorical"
},
{
"key": "h5_label_clade",
"title": "Provisional LABEL Clade",
"type": "categorical"
},
{
"key": "furin_cleavage_motif",
"title": "Furin Cleavage Motif",
"type": "categorical"
},
{
"key": "cleavage_site_sequence",
"title": "Cleavage Site Sequence",
"type": "categorical"
},
{
"key": "author",
"title": "Authors",
"type": "categorical"
},
{
"key": "originating_lab",
"title": "Originating Lab",
"type": "categorical"
},
{
"key": "submitting_lab",
"title": "Submitting Lab",
"type": "categorical"
},
{
"key": "data_source",
"title": "Data Source",
"type": "categorical"
}
],
"geo_resolutions": [
"region",
"country",
"division"
],
"display_defaults": {
"map_triplicate": false,
"color_by": "host",
"geo_resolution": "division",
"distance_measure": "num_date"
},
"filters": [
"host",
"region",
"country",
"division",
"subtype",
"author",
"originating_lab",
"submitting_lab",
"data_source"
],
"metadata_columns": [
"genbank_accession",
"sra_accessions"
]
}
Empty file added config/h5n1-d1.1/description.md
Empty file.
Empty file.
Loading

0 comments on commit 3e26692

Please sign in to comment.