Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2y build targets #14

Merged
merged 12 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ jobs:
ci:
uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
with:
build-args: auspice/flu_avian_h5n1_ha.json
build-args: auspice/avian-flu_h5n1_ha_all-time.json
131 changes: 99 additions & 32 deletions Snakefile
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
SUBTYPES = ["h5nx","h5n1","h9n2","h7n9"]
SUBTYPES = ["h5nx","h5n1","h7n9","h9n2"]#["h5nx","h5n1"]
SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"]
TIME = ["all-time","2y"]

path_to_fauna = '../fauna'


def all_targets():
    """Return the list of Auspice JSON paths requested by the default build.

    h5nx and h5n1 are expanded over every entry of TIME, while h7n9 and
    h9n2 are expanded for the 'all-time' window only. Every subtype is
    expanded over all SEGMENTS.
    """
    pattern = "auspice/avian-flu_{subtype}_{segment}_{time}.json"

    # Subtypes that are built once per time window.
    windowed = expand(pattern, subtype=["h5nx","h5n1"], segment=SEGMENTS, time=TIME)
    # Subtypes that only have an all-time build.
    all_time_only = expand(pattern, subtype=['h7n9', 'h9n2'], segment=SEGMENTS, time=['all-time'])

    return list(windowed) + list(all_time_only)

rule all:
input:
auspice_json = expand("auspice/flu_avian_{subtype}_{segment}.json", subtype=SUBTYPES, segment=SEGMENTS)
auspice_json = all_targets()

rule files:
params:
dropped_strains = "config/dropped_strains_{subtype}.txt",
include_strains = "config/include_strains_{subtype}.txt",
reference = "config/reference_{subtype}_{segment}.gb",
colors = "config/colors_{subtype}.tsv",
lat_longs = "config/lat_longs_{subtype}.tsv",
Expand All @@ -19,6 +28,7 @@ rule files:

# Short alias for the params declared on rule `files`, so that other rules can
# refer to the config path templates as e.g. files.reference or files.dropped_strains.
files = rules.files.params


def download_by(w):
    """Return the subtype selector string used when downloading sequences.

    `w` is the wildcards object of the requesting job; only `w.subtype` is
    read. The 'h5nx' build selects h5n1 through h5n9, every other build
    selects just its own subtype. Raises KeyError for a subtype that has no
    entry in the table.
    """
    selectors = {
        'h5nx': 'subtype:h5n1,h5n2,h5n3,h5n4,h5n5,h5n6,h5n7,h5n8,h5n9',
        'h5n1': 'subtype:h5n1',
        'h7n9': 'subtype:h7n9',
        'h9n2': 'subtype:h9n2',
    }
    return selectors[w.subtype]
Expand All @@ -28,32 +38,82 @@ def metadata_by_wildcards(w):
return(md[w.subtype])

def group_by(w):
gb = {'h5nx': 'subtype country year','h5n1': 'region country year', 'h7n9': 'division year', 'h9n2': 'country year'}
return gb[w.subtype]

def sequences_per_group(w):
spg = {'h5nx': '5','h5n1': '10', 'h7n9': '70', 'h9n2': '10'}
return spg[w.subtype]
gb = {
'h5nx': {'all-time': 'subtype country year', '2y': 'subtype region month host'},
'h5n1': {'all-time': 'region country year', '2y': 'subtype region month host'},
'h7n9': {'all-time': 'division year'},
'h9n2': {'all-time': 'country year'}
}
return gb[w.subtype][w.time]

def min_length(w):
    """Return the minimum sequence length required for the wildcard segment.

    `w` is the wildcards object of the requesting job; only `w.segment` is
    read. The value is handed to `augur filter --min-length` by the filter
    rule. Raises KeyError for a segment that has no entry in the table.
    """
    thresholds = dict(
        pb2=2100,
        pb1=2100,
        pa=2000,
        ha=1600,
        np=1400,
        na=1270,
        mp=900,
        ns=800,
    )
    return thresholds[w.segment]

def min_date(w):
date = {'h5nx':'1996','h5n1': '1996', 'h7n9': '2013', 'h9n2': '1966'}
return date[w.subtype]
date = {
'h5nx': {'all-time': '1996', '2y': '2Y'},
'h5n1': {'all-time': '1996', '2y': '2Y'},
'h7n9': {'all-time': '2013'},
'h9n2': {'all-time': '1966'}
}
return date[w.subtype][w.time]

def traits_columns(w):
    """Return the space-separated metadata columns used by the traits rule.

    `w` is the wildcards object of the requesting job; only `w.subtype` is
    read. Raises KeyError for a subtype that has no entry in the table.
    """
    columns_by_subtype = {
        'h5nx': 'region',
        'h5n1': 'region country',
        'h7n9': 'country division',
        'h9n2': 'region country',
    }
    return columns_by_subtype[w.subtype]

def clock_rate(w):
    """Return the `--clock-rate` option string for the refine rule, or ''.

    `w` is the wildcards object of the requesting job; `w.segment`,
    `w.subtype` and `w.time` are read. Only the '2y' builds of h5nx and h5n1
    get a fixed per-segment rate; every 'all-time' build gets an empty
    string, which leaves the option off the command line.

    Raises KeyError for an unknown segment (regardless of subtype), an
    unknown subtype, or a time window that the subtype does not define.
    """
    h5nx_rates = {
        'pb2': '0.00287',
        'pb1': '0.00267',
        'pa': '0.00238',
        'ha': '0.0048',
        'np': '0.0022',
        'na': '0.0028',
        'mp': '0.0017',
        'ns': '0.0017',
    }
    h5n1_rates = {
        'pb2': '0.00287',
        'pb1': '0.00264',
        'pa': '0.00248',
        'ha': '0.00455',
        'np': '0.00252',
        'na': '0.00349',
        'mp': '0.00191',
        'ns': '0.00249',
    }

    # Both tables are indexed up front, before the subtype is considered, so
    # an unrecognised segment fails here for every subtype.
    segment = w.segment
    h5nx_option = '--clock-rate ' + h5nx_rates[segment]
    h5n1_option = '--clock-rate ' + h5n1_rates[segment]

    not_fixed = ''
    options = {
        'h5nx': {'all-time': not_fixed, '2y': h5nx_option},
        'h5n1': {'all-time': not_fixed, '2y': h5n1_option},
        'h7n9': {'all-time': not_fixed},
        'h9n2': {'all-time': not_fixed},
    }

    per_window = options[w.subtype]
    return per_window[w.time]


def clock_rate_std_dev(w):
    """Return the `--clock-std-dev` option string for the refine rule, or ''.

    `w` is the wildcards object of the requesting job; `w.subtype` and
    `w.time` are read. The '2y' builds of h5nx and h5n1 share one fixed
    standard deviation; every 'all-time' build gets an empty string, which
    leaves the option off the command line.

    Raises KeyError for an unknown subtype or for a time window that the
    subtype does not define.
    """
    fixed = '--clock-std-dev 0.00211'
    not_fixed = ''

    options = {
        'h5nx': {'all-time': not_fixed, '2y': fixed},
        'h5n1': {'all-time': not_fixed, '2y': fixed},
        'h7n9': {'all-time': not_fixed},
        'h9n2': {'all-time': not_fixed},
    }

    per_window = options[w.subtype]
    return per_window[w.time]


rule download:
message: "Downloading sequences from fauna"
output:
sequences = "data/{subtype}_{segment}.fasta"
params:
fasta_fields = "strain virus accession collection_date region country division location host subtype originating_lab submitting_lab h5_clade",
fasta_fields = "strain virus accession collection_date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
download_by = download_by
shell:
"""
Expand All @@ -65,7 +125,7 @@ rule download:
--path data \
--fstem {wildcards.subtype}_{wildcards.segment}
"""

### comment
rule parse:
message: "Parsing fasta into sequences and metadata"
input:
Expand All @@ -74,8 +134,8 @@ rule parse:
sequences = "results/sequences_{subtype}_{segment}.fasta",
metadata = "results/metadata_{subtype}_{segment}.tsv"
params:
fasta_fields = "strain virus isolate_id date region country division location host subtype originating_lab submitting_lab h5_clade",
prettify_fields = "region country division location host originating_lab submitting_lab"
fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade",
prettify_fields = "region country division location host originating_lab submitting_lab authors PMID"
shell:
"""
augur parse \
Expand Down Expand Up @@ -105,33 +165,36 @@ rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- subsampling to {params.subsample_max_sequences} sequences
- grouping by {params.group_by}
- excluding strains in {input.exclude}
- samples with missing region and country metadata
- excluding strains prior to {params.min_date}
"""
input:
sequences = rules.parse.output.sequences,
metadata = metadata_by_wildcards,
exclude = files.dropped_strains
exclude = files.dropped_strains,
include = files.include_strains
output:
sequences = "results/filtered_{subtype}_{segment}.fasta"
sequences = "results/filtered_{subtype}_{segment}_{time}.fasta"
params:
group_by = group_by,
sequences_per_group = sequences_per_group,
subsample_max_sequences = 3000,
min_date = min_date,
min_length = min_length,
exclude_where = "host=laboratoryderived host=ferret host=unknown host=other country=? region=?"
exclude_where = "host=laboratoryderived host=ferret host=unknown host=other host=host country=? region=? gisaid_clade=3C.2"

shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--exclude {input.exclude} \
--include {input.include} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--subsample-max-sequences {params.subsample_max_sequences} \
--min-date {params.min_date} \
--exclude-where {params.exclude_where} \
--min-length {params.min_length} \
Expand All @@ -148,7 +211,7 @@ rule align:
sequences = rules.filter.output.sequences,
reference = files.reference
output:
alignment = "results/aligned_{subtype}_{segment}.fasta"
alignment = "results/aligned_{subtype}_{segment}_{time}.fasta"
shell:
"""
augur align \
Expand All @@ -165,7 +228,7 @@ rule tree:
input:
alignment = rules.align.output.alignment
output:
tree = "results/tree-raw_{subtype}_{segment}.nwk"
tree = "results/tree-raw_{subtype}_{segment}_{time}.nwk"
params:
method = "iqtree"
shell:
Expand All @@ -190,12 +253,14 @@ rule refine:
alignment = rules.align.output,
metadata = rules.parse.output.metadata
output:
tree = "results/tree_{subtype}_{segment}.nwk",
node_data = "results/branch-lengths_{subtype}_{segment}.json"
tree = "results/tree_{subtype}_{segment}_{time}.nwk",
node_data = "results/branch-lengths_{subtype}_{segment}_{time}.json"
params:
coalescent = "const",
date_inference = "marginal",
clock_filter_iqd = 4
clock_filter_iqd = 4,
clock = clock_rate,
clock_std_dev = clock_rate_std_dev
shell:
"""
augur refine \
Expand All @@ -208,6 +273,8 @@ rule refine:
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
{params.clock} \
{params.clock_std_dev} \
--clock-filter-iqd {params.clock_filter_iqd}
"""

Expand All @@ -217,7 +284,7 @@ rule ancestral:
tree = rules.refine.output.tree,
alignment = rules.align.output
output:
node_data = "results/nt-muts_{subtype}_{segment}.json"
node_data = "results/nt-muts_{subtype}_{segment}_{time}.json"
params:
inference = "joint"
shell:
Expand All @@ -237,7 +304,7 @@ rule translate:
node_data = rules.ancestral.output.node_data,
reference = files.reference
output:
node_data = "results/aa-muts_{subtype}_{segment}.json"
node_data = "results/aa-muts_{subtype}_{segment}_{time}.json"
shell:
"""
augur translate \
Expand All @@ -253,7 +320,7 @@ rule traits:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata
output:
node_data = "results/traits_{subtype}_{segment}.json",
node_data = "results/traits_{subtype}_{segment}_{time}.json",
params:
columns = traits_columns,
shell:
Expand All @@ -269,10 +336,10 @@ rule traits:
rule cleavage_site:
message: "determining sequences that harbor furin cleavage sites"
input:
alignment = "results/aligned_{subtype}_ha.fasta"
alignment = "results/aligned_{subtype}_ha_{time}.fasta"
output:
cleavage_site_annotations = "results/cleavage-site_{subtype}_ha.json",
cleavage_site_sequences = "results/cleavage-site-sequences_{subtype}_ha.json"
cleavage_site_annotations = "results/cleavage-site_{subtype}_ha_{time}.json",
cleavage_site_sequences = "results/cleavage-site-sequences_{subtype}_ha_{time}.json"
shell:
"""
python scripts/annotate-ha-cleavage-site.py \
Expand All @@ -297,7 +364,7 @@ rule export:
auspice_config = files.auspice_config,
description = files.description
output:
auspice_json = "auspice/flu_avian_{subtype}_{segment}.json"
auspice_json = "auspice/avian-flu_{subtype}_{segment}_{time}.json"
shell:
"""
augur export v2 \
Expand Down
4 changes: 2 additions & 2 deletions batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
for segment in params.segments:
call = ['nextstrain', 'build', '--aws-batch', '.', '-j 1']
targets = []
targets.append('auspice/flu_avian_%s_%s_tree.json'%(subtype, segment))
targets.append('auspice/flu_avian_%s_%s_meta.json'%(subtype, segment))
targets.append('auspice/avian-flu_%s_%s_tree.json'%(subtype, segment))
targets.append('auspice/avian-flu_%s_%s_meta.json'%(subtype, segment))
call.extend(targets)
print(' '.join(call))
log = open('logs/%s_%s.txt'%(subtype, segment), 'w')
Expand Down
2 changes: 1 addition & 1 deletion clade-labeling/add-clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def annotate_metadata_file(metadata_infile, metadata_outfile, clade_assignments)
else:
clade = "?"
unknown_clades += 1
print("unknown clade for ", strain_name)
#print("unknown clade for ", strain_name)
new_line = line.strip() + "\t" + clade + "\n"

with open(metadata_outfile, "a") as outfile:
Expand Down
82 changes: 82 additions & 0 deletions clade-labeling/h5n1-clades.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -17811,3 +17811,85 @@ A/caspiantern/Washington/23024996001original/2023 2.3.4.4b
A/Chicken/Netherlands/24001946006010/2024 EA-nonGsGD
A/Tuftedduck/Netherlands/1/2023 EA-nonGsGD
A/chicken/Iraq/KVCL016/2015 ?
A/Ph/ST/44/2004 2.3.2
A/chicken/Vietnam/Raho77232263/2023 2.3.2.1e
A/duck/Vietnam/Raho723S2874/2023 2.3.2.1e
A/duck/Vietnam/Raho723S2490/2023 2.3.2.1e
A/Vietnam/KhanhhoaRV1005/2024 2.3.2.1e
A/duck/Vietnam/Raho723S2875/2023 2.3.2.1e
A/muteswan/Poland/MB055L2/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A045/2024 2.3.4.4b
A/Fox/Bayern/i193/2023 2.3.4.4b
A/feline/USA/24008764001original/2024 2.3.4.4b
A/henharrier/Parnu/TA2126003/2021 2.3.4.4b
A/chicken/CzechRepublic/3529/2024 2.3.4.4b
A/domesticduck/Poland/H69T2/2024 2.3.4.4b
A/goose/Bayern/wv196/2023 2.3.4.4b
A/feline/USA/24009116005original/2024 2.3.4.4b
A/muteswan/Poland/MB085N/2024 2.3.4.4b
A/feline/USA/24009311006original/2024 2.3.4.4b
A/chicken/CzechRepublic/47202/2024 2.3.4.4b
A/chicken/Poland/H79T2/2024 2.3.4.4b
A/muteswan/Poland/MB055L1/2024 2.3.4.4b
A/dairycattle/Texas/24008749002v/2024 2.3.4.4b
A/feline/USA/24009116004original/2024 2.3.4.4b
A/domesticduck/Poland/H52T1K2/2024 2.3.4.4b
A/chicken/CzechRepublic/35494/2024 2.3.4.4b
A/chicken/CzechRepublic/31374/2024 2.3.4.4b
A/feline/USA/24009116002original/2024 2.3.4.4b
A/chicken/CzechRepublic/3744orig/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A031/2024 2.3.4.4b
A/muteswan/Poland/MB055L4/2024 2.3.4.4b
A/domesticduck/Poland/H57T1/2024 2.3.4.4b
A/turkey/Poland/H40T2/2024 2.3.4.4b
A/goose/Bayern/wv351/2023 2.3.4.4b
A/domesticgoose/Poland/H49W/2024 2.3.4.4b
A/turkey/Poland/H47T4/2024 2.3.4.4b
A/turkey/Poland/H80T3/2024 2.3.4.4b
A/Texas/37/2024 2.3.4.4b
A/chicken/Poland/H45NM/2024 2.3.4.4b
A/turkey/Poland/H75T1/2024 2.3.4.4b
A/CommonBuzzard/GermanyBB/2024AI01490/2024 2.3.4.4b
A/feline/USA/23037332001original/2023 2.3.4.4b
A/feline/USA/24008850001original/2024 2.3.4.4b
A/turkey/Poland/H68T2/2024 2.3.4.4b
A/domesticduck/Poland/H66T1/2024 2.3.4.4b
A/feline/USA/24009311004original/2024 2.3.4.4b
A/turkey/Poland/H43T2/2024 2.3.4.4b
A/chicken/CzechRepublic/354910/2024 2.3.4.4b
A/chicken/CzechRepublic/34582/2024 2.3.4.4b
A/NorthernFulmar/Netherlands/4/2024 2.3.4.4b
A/largebilledcrow/Ishikawa/1702A010/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A044/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A033/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A032/2024 2.3.4.4b
A/chicken/CzechRepublic/37442/2024 2.3.4.4b
A/domesticduck/Poland/H60T4/2024 2.3.4.4b
A/feline/USA/24008850002original/2024 2.3.4.4b
A/Seagull/Parnu/TA21132845/2021 2.3.4.4b
A/chicken/CzechRepublic/37443/2024 2.3.4.4b
A/muteswan/Poland/MB070L1/2024 2.3.4.4b
A/buzzard/Poland/MB098N/2024 2.3.4.4b
A/chicken/CzechRepublic/31371/2024 2.3.4.4b
A/turkey/Poland/H81T1/2024 2.3.4.4b
A/domesticgoose/Poland/H48D1/2024 2.3.4.4b
A/chicken/CzechRepublic/37441/2024 2.3.4.4b
A/muteswan/CzechRepublic/4316/2024 2.3.4.4b
A/buzzard/Poland/MB103N/2024 2.3.4.4b
A/turkey/Poland/H63N/2024 2.3.4.4b
A/chicken/CzechRepublic/35499/2024 2.3.4.4b
A/largebilledcrow/Hokkaido/B114/2024 2.3.4.4b
A/chicken/CzechRepublic/34583/2024 2.3.4.4b
A/feline/USA/24008764002original/2024 2.3.4.4b
A/eagle/Parnu/TA21118641/2021 2.3.4.4b
A/turkey/Poland/H38T4/2024 2.3.4.4b
A/largebilledcrow/Osaka/2702A043/2024 2.3.4.4b
A/chicken/CzechRepublic/34584/2024 2.3.4.4b
A/chicken/CzechRepublic/31375/2024 2.3.4.4b
A/Northernshoveler/Jeju/D60/2023 2.3.4.4b
A/turkey/Poland/H543NM/2023 2.3.4.4b
A/largebilledcrow/Osaka/2702A030/2024 2.3.4.4b
A/CommonBuzzard/GermanyHH/2024AI01435/2024 EA-nonGsGD
A/BarnacleGoose/GermanySH/2024AI01487/2024 EA-nonGsGD
A/Dunlin/GermanySH/2024AI01484/2024 EA-nonGsGD
A/Fox/Bayern/WS113/2022 ?
Loading