From 4537422808f17fcfb86bf00ff9f56b3f7b65f21d Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 9 Aug 2024 12:21:00 -0700 Subject: [PATCH] Update Snakefile to LF line endings --- Snakefile | 1192 ++++++++++++++++++++++++++--------------------------- 1 file changed, 596 insertions(+), 596 deletions(-) diff --git a/Snakefile b/Snakefile index 977a2b7..f931734 100755 --- a/Snakefile +++ b/Snakefile @@ -1,596 +1,596 @@ -# constrain the wildcards to not include `_` which we use to separate "parts" of filenames (where a part may be a wildcard itself) -wildcard_constraints: - subtype = "[^_]+", - segment = "[^_]+", - time = "[^_]+", - -# defined before extra rules `include`d as they reference this constant -SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"] - - -for rule_file in config.get('custom_rules', []): - include: rule_file - -# The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument) -# will change the behaviour of the workflow to use the same strains for each segment. This is achieved via these steps: -# (1) Filter the HA segment as normal plus filter to those strains with 8 segments -# (2) Filter the other segments by simply force-including the same strains as (1) -SAME_STRAINS = bool(config.get('same_strains_per_segment', False)) - -S3_SRC = config.get('s3_src', {}) -LOCAL_INGEST = config.get('local_ingest', None) - -def sanity_check_config(): - assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'" - # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST - # if we want to use `--config local_ingest=gisaid` overrides. 
- assert not (S3_SRC and LOCAL_INGEST), "The config defined both 'local_ingest' and 's3_src', which are mutually exclusive" - if S3_SRC: - assert isinstance(S3_SRC, dict) and all([k in S3_SRC for k in ("name", "sequences", "metadata")]), \ - "Config 's3_src' must be a dict with 'name', 'sequences' and 'metadata' keys" - -sanity_check_config() - -def collect_builds(): - """ - iteratively create workflow targets from config.builds for the `all` rule - you can over-ride this by specifying targets (filenames) on the command line - """ - targets = [] - for subtype,times in config.get('builds', {}).items(): - for segment in config.get('segments', []): - if len(times): - for time in times: - targets.append(f"auspice/avian-flu_{subtype}_{segment}_{time}.json") - else: - targets.append(f"auspice/avian-flu_{subtype}_{segment}.json") - return targets - -rule all: - input: - auspice_json = collect_builds() - -# This must be after the `all` rule above since it depends on its inputs -include: "rules/deploy.smk" - -rule test_target: - """ - For testing purposes such as CI workflows. - """ - input: "auspice/avian-flu_h5n1_ha_all-time.json" - -rule files: - params: - dropped_strains = config['dropped_strains'], - include_strains = config['include_strains'], - reference = config['reference'], - colors = config['colors'], - # TODO - Augur 24.4.0 includes extensive lat-longs by default - can we drop the following avian-flu specific ones? 
- lat_longs = config['lat_longs'], - auspice_config = config['auspice_config'], - clades_file = config['clades_file'], - description = config['description'], - -files = rules.files.params - - -def subtypes_by_subtype_wildcard(wildcards): - db = { - 'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'], - 'h5n1': ['h5n1'], - 'h7n9': ['h7n9'], - 'h9n2': ['h9n2'], - } - db['h5n1-cattle-outbreak'] = [*db['h5nx']] - assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function " - "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?") - return(db[wildcards.subtype]) - -rule download_sequences: - output: - sequences = f"data/{S3_SRC.get('name', None)}/sequences_{{segment}}.fasta", - params: - address=lambda w: S3_SRC.get('sequences', None).format(segment=w.segment) - shell: - """ - aws s3 cp {params.address:q} - | zstd -d > {output.sequences} - """ - -rule download_metadata: - output: - metadata = f"data/{S3_SRC.get('name', None)}/metadata.tsv", - params: - address=S3_SRC.get('metadata', None), - shell: - """ - aws s3 cp {params.address:q} - | zstd -d > {output.metadata} - """ - - -def input_metadata(wildcards): - if S3_SRC: - return f"data/{S3_SRC['name']}/metadata.tsv", - elif LOCAL_INGEST: - return f"ingest/{LOCAL_INGEST}/results/metadata.tsv", - raise Exception() # already caught by `sanity_check_config` above, this is just being cautious - -def input_sequences(wildcards): - if S3_SRC: - return f"data/{S3_SRC['name']}/sequences_{wildcards.segment}.fasta", - elif LOCAL_INGEST: - return f"ingest/{LOCAL_INGEST}/results/sequences_{wildcards.segment}.fasta" - raise Exception() # already caught by `sanity_check_config` above, this is just being cautious - - -rule filter_sequences_by_subtype: - input: - sequences = input_sequences, - metadata = input_metadata, - output: - sequences = "results/{subtype}/{segment}/sequences.fasta", - params: - 
subtypes=subtypes_by_subtype_wildcard, - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --query "subtype in {params.subtypes!r}" \ - --output-sequences {output.sequences} - """ - -rule filter_metadata_by_subtype: - input: - metadata = input_metadata, - output: - metadata = "results/{subtype}/metadata.tsv", - params: - subtypes=subtypes_by_subtype_wildcard, - shell: - """ - augur filter \ - --metadata {input.metadata} \ - --query "subtype in {params.subtypes!r}" \ - --output-metadata {output.metadata} - """ - -def metadata_by_wildcards(wildcards): - # H5 builds have extra clade-level metadata added to the metadata TSV. - # We may move this to a node-data JSON which would simplify the snakemake logic - # a bit -- see - if wildcards.subtype in ("h5n1", "h5nx", "h5n1-cattle-outbreak"): - return "results/{subtype}/metadata-with-clade.tsv" - else: - return "results/{subtype}/metadata.tsv", - - -def get_config(rule_name, rule_key, wildcards, segment=None, fallback="FALLBACK"): - assert rule_name in config, f"Config missing top-level {rule_name} key" - assert rule_key in config[rule_name], f"Config missing entry for {rule_name}.{rule_key}" - try: - return config[rule_name][rule_key][wildcards.subtype][wildcards.time] - except KeyError: - assert fallback in config[rule_name][rule_key], f"config.{rule_name!r}.{rule_key!r} either needs " \ - f"an entry for {wildcards.subtype!r}.{wildcards.time!r} added or a (default) {fallback!r} key." 
- return config[rule_name][rule_key][fallback] - -def refine_clock_rates(w): - info = get_config('refine', 'clock_rates', w) - - if w.segment == 'genome': - # calculate the genome rate via a weighted average of the segment rates - assert 'genome' not in info, ("This snakemake pipeline is currently set up to calculate the genome clock rate " - "based on the segment rates, however you have provided a genome rate in the config.") - try: - segment_lengths= get_config('refine', 'segment_lengths', w) - except AssertionError as e: - # py11 brings e.add_note() which is nicer - e.args = (*e.args, "NOTE: For segment=genome we require the segment_lengths to be defined as we use them to calculate the clock rate") - raise - mean = sum([info[seg][0]*length for seg,length in segment_lengths.items()])/sum(segment_lengths.values()) - stdev = mean/2 - return f"--clock-rate {mean} --clock-std-dev {stdev}" - - assert w.segment in info, 'config.refine.clock_rates: data must be provided for each segment. Use "" for inference.' 
- if info[w.segment] == "": - return "" - - assert isinstance(info[w.segment], list), "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)" - assert len(info[w.segment])==2, "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)" - return f"--clock-rate {info[w.segment][0]} --clock-std-dev {info[w.segment][1]}" - -def refine_clock_filter(w): - filter = get_config('refine', 'clock_filter_iqd', w) - return f"--clock-filter-iqd {filter}" if filter else "" - - -rule add_h5_clade: - message: "Adding in a column for h5 clade numbering" - input: - metadata = "results/{subtype}/metadata.tsv", - clades_file = files.clades_file - output: - metadata= "results/{subtype}/metadata-with-clade.tsv" - shell: - """ - python clade-labeling/add-clades.py \ - --metadata {input.metadata} \ - --output {output.metadata} \ - --clades {input.clades_file} - """ - -def _filter_params(wildcards, input, output, threads, resources): - """ - Generate the arguments to `augur filter`. When we are running independent analyses - (i.e. not using the same strains for each segment), then we generate a full set of - filter parameters here. - When we are using the same sequences for each segment, then for HA we use a full - filter call and for the rest of the segments we filter to the strains chosen for HA - """ - - assert wildcards.segment!='genome', "Don't use this function for genome builds!" - - # For non-HA segments when we are using the SAME_STRAINS for all segments - # we have a simple filtering approach: match what we're using for HA! - # We also include the "force-include" list; 99% of the time these strains will already be in - # the HA strain list (as they were force-included there too) but there may be rare occasions - # where we force-include a strain which does not have a HA sequence. 
- if input.strains: - # some basic error checking to guard against inadvertent changes in the future - if not (SAME_STRAINS and wildcards.segment!='ha'): - raise Exception("A strains input should only be present for SAME_STRAINS + HA!") - return f"--exclude-all --include {input.strains} {input.include}" - - exclude_where = get_config('filter', 'exclude_where', wildcards) - # If SAME_STRAINS (and due to the above conditional we have the HA segment at this point) - # then we want to restrict to strains present in all 8 segments. Note that force-included - # strains may not have all segments, but that's preferable to filtering them out. - if SAME_STRAINS: - exclude_where += f" n_segments!={len(SEGMENTS)}" - - - cmd = "" - - group_by_value = get_config('filter', 'group_by', wildcards) - cmd += f" --group-by {group_by_value}" if group_by_value else "" - - cmd += f" --subsample-max-sequences {config['target_sequences_per_tree']}" - cmd += f" --min-date {get_config('filter', 'min_date', wildcards)}" - cmd += f" --include {input.include}" - cmd += f" --exclude-where {exclude_where}" - cmd += f" --min-length {get_config('filter', 'min_length', wildcards)[wildcards.segment]}" - cmd += f" --non-nucleotide" - return cmd - -rule filter: - input: - sequences = "results/{subtype}/{segment}/sequences.fasta", - metadata = metadata_by_wildcards, - exclude = files.dropped_strains, - include = files.include_strains, - strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [], - output: - sequences = "results/{subtype}/{segment}/{time}/filtered.fasta", - strains = "results/{subtype}/{segment}/{time}/filtered.txt", - params: - args = _filter_params, - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --exclude {input.exclude} \ - --output-sequences {output.sequences} \ - --output-strains {output.strains} \ - {params.args} - """ - -rule align: - message: - """ - Aligning sequences to 
{input.reference} - - filling gaps with N - """ - input: - sequences = rules.filter.output.sequences, - reference = files.reference - output: - alignment = "results/{subtype}/{segment}/{time}/aligned.fasta" - wildcard_constraints: - # for genome builds we don't use this rule; see `rule join_segments` - segment = "|".join(seg for seg in SEGMENTS) - threads: - 4 - shell: - """ - augur align \ - --sequences {input.sequences} \ - --reference-sequence {input.reference} \ - --output {output.alignment} \ - --remove-reference \ - --nthreads {threads} - """ - - -rule tree: - message: "Building tree" - input: - # Note that the alignment input may come from `rule align` or `rule join_segments` - alignment = rules.align.output.alignment - output: - tree = "results/{subtype}/{segment}/{time}/tree-raw.nwk" - params: - method = "iqtree" - threads: - 4 - shell: - """ - augur tree \ - --alignment {input.alignment} \ - --output {output.tree} \ - --method {params.method} \ - --nthreads {threads} - """ - - -def refine_root(wildcards): - root = get_config('refine', 'genome_root', wildcards) \ - if wildcards.segment=='genome' \ - else get_config('refine', 'root', wildcards) - return f"--root {root}" if root else "" - -rule refine: - message: - """ - Refining tree - - estimate timetree - - use {params.coalescent} coalescent timescale - - estimate {params.date_inference} node dates - """ - input: - tree = rules.tree.output.tree, - alignment = rules.align.output, - metadata = metadata_by_wildcards, - output: - tree = "results/{subtype}/{segment}/{time}/tree.nwk", - node_data = "results/{subtype}/{segment}/{time}/branch-lengths.json" - params: - coalescent = config['refine']['coalescent'], - date_inference = config['refine']['date_inference'], - clock_rates = refine_clock_rates, - clock_filter = refine_clock_filter, - root = refine_root, - shell: - """ - augur refine \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --metadata {input.metadata} \ - --output-tree {output.tree} \ - 
--output-node-data {output.node_data} \ - --timetree \ - {params.root} \ - --coalescent {params.coalescent} \ - --date-confidence \ - --date-inference {params.date_inference} \ - {params.clock_rates} \ - {params.clock_filter} - """ - - -def refined_tree(w): - """ - Return the refined tree to be used for export, traits, ancestry reconstruction etc - The cattle-flu build introduces an additional step beyond `augur refine`, which is - why this function exists. - """ - if w.subtype=='h5n1-cattle-outbreak' and w.segment!='genome': - return "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk" - return "results/{subtype}/{segment}/{time}/tree.nwk", - -def ancestral_root_seq(wildcards): - root_seq = get_config('ancestral', 'genome_root_seq', wildcards) \ - if wildcards.segment=='genome' \ - else get_config('ancestral', 'root_seq', wildcards) - return f"--root-sequence {root_seq}" if root_seq else "" - -rule ancestral: - message: "Reconstructing ancestral sequences and mutations" - input: - tree = refined_tree, - alignment = rules.align.output - output: - node_data = "results/{subtype}/{segment}/{time}/nt-muts.json" - params: - inference = config['ancestral']['inference'], - root_seq = ancestral_root_seq, - shell: - """ - augur ancestral \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} \ - {params.root_seq} \ - --keep-ambiguous - """ - -rule translate: - message: "Translating amino acid sequences" - input: - tree = refined_tree, - node_data = rules.ancestral.output.node_data, - reference = lambda w: config['genome_reference'] if w.segment=='genome' else files.reference - output: - node_data = "results/{subtype}/{segment}/{time}/aa-muts.json" - shell: - """ - augur translate \ - --tree {input.tree} \ - --ancestral-sequences {input.node_data} \ - --reference-sequence {input.reference} \ - --output {output.node_data} - """ - -def traits_params(wildcards): - columns = 
get_config('traits', 'genome_columns', wildcards) \ - if wildcards.segment=='genome' \ - else get_config('traits', 'columns', wildcards) - - bias = get_config('traits', 'genome_sampling_bias_correction', wildcards) \ - if wildcards.segment=='genome' \ - else get_config('traits', 'sampling_bias_correction', wildcards) - - confidence = get_config('traits', 'confidence', wildcards) - - args = f"--columns {columns}" - if bias: - args += f" --sampling-bias-correction {bias}" - if confidence: - args += f" --confidence" - return args - - -rule traits: - input: - tree = refined_tree, - metadata = metadata_by_wildcards, - output: - node_data = "results/{subtype}/{segment}/{time}/traits.json", - params: - info = traits_params, - shell: - """ - augur traits \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --output {output.node_data} \ - {params.info} - """ - - -rule cleavage_site: - """ - Cleavage sites are only computed for HA and are unchanged depending on the build. As such, they are not - parameterised in the config file - """ - input: - alignment = "results/{subtype}/ha/{time}/aligned.fasta" - output: - cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json", - cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json" - shell: - """ - python scripts/annotate-ha-cleavage-site.py \ - --alignment {input.alignment} \ - --furin_site_motif {output.cleavage_site_annotations} \ - --cleavage_site_sequence {output.cleavage_site_sequences} - """ - -def export_node_data_files(wildcards): - nd = [ - rules.refine.output.node_data, - rules.traits.output.node_data, - rules.ancestral.output.node_data, - rules.translate.output.node_data, - rules.cleavage_site.output.cleavage_site_annotations, - rules.cleavage_site.output.cleavage_site_sequences, - ] - - if wildcards.subtype=="h5n1-cattle-outbreak" and wildcards.segment!='genome': - nd.append(rules.prune_tree.output.node_data) - return nd - - -def 
additional_export_config(wildcards): - args = "" - - title_overrides = get_config('export', 'genome_title', wildcards) \ - if wildcards.segment=='genome' \ - else get_config('export', 'title', wildcards) - if title_overrides: - args += ("--title '" + - title_overrides.format(segment=wildcards.segment.upper(), subtype=wildcards.subtype.upper(), time=wildcards.time) + - "'") - - return args - -rule export: - """ - Export the files into results/ and then use a subsequent rule to move these to the - auspice/ directory - """ - input: - tree = refined_tree, - metadata = metadata_by_wildcards, - node_data = export_node_data_files, - colors = files.colors, - lat_longs = files.lat_longs, - auspice_config = files.auspice_config, - description = files.description - output: - auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json" - params: - additional_config = additional_export_config - shell: - """ - augur export v2 \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --node-data {input.node_data} \ - --colors {input.colors} \ - --lat-longs {input.lat_longs} \ - --auspice-config {input.auspice_config} \ - --description {input.description} \ - --include-root-sequence-inline \ - {params.additional_config} \ - --output {output.auspice_json} - """ - -def auspice_name_to_wildcard_name(wildcards): - """ - Used to link Auspice JSONs filenames to their intermediate filename which includes all wildcards. - Examples: - 1. subtype + segment + time in their filename / URL, - e.g. "avian-flu_h5n1_ha_2y.json" (nextstrain.org/avian-flu/h5n1/ha/2y) - maps to subtype=h5n1, segment=ha, time=2y - 2. subtype + segment in their filename / URL, - e.g. 
"avian-flu_h5n1-cattle-outbreak_ha.json" (nextstrain.org/avian-flu/h5n1-cattle-outbreak/ha) - maps to subtype=h5n1-cattle-outbreak, segment=ha, time=default - """ - parts = wildcards.parts.split("_") - if len(parts)==3: - [subtype, segment, time] = parts - assert segment!='genome', "Genome builds are not available for this build" - return f"results/{subtype}/{segment}/{time}/auspice-dataset.json" - if len(parts)==2: - [subtype, segment] = parts - assert subtype=='h5n1-cattle-outbreak', "Only h5n1 builds produce an Auspice dataset without a time component in the filename" - return f"results/{subtype}/{segment}/default/auspice-dataset.json" - raise Exception("Auspice JSON filename requested with an unexpected number of (underscore-separated) parts") - - -rule rename_auspice_datasets: - """ - This allows us to create files in auspice/ which mirror the intended URL structure rather than - the wildcard structure we use in the workflow. - """ - input: - json = auspice_name_to_wildcard_name - output: - json = "auspice/avian-flu_{parts}.json" - wildcard_constraints: - timepart = ".*" - shell: - """ - cp {input.json} {output.json} - """ - - -rule clean: - message: "Removing directories: {params}" - params: - "results ", - "auspice" - shell: - "rm -rfv {params}" +# constrain the wildcards to not include `_` which we use to separate "parts" of filenames (where a part may be a wildcard itself) +wildcard_constraints: + subtype = "[^_]+", + segment = "[^_]+", + time = "[^_]+", + +# defined before extra rules `include`d as they reference this constant +SEGMENTS = ["pb2", "pb1", "pa", "ha","np", "na", "mp", "ns"] + + +for rule_file in config.get('custom_rules', []): + include: rule_file + +# The config option `same_strains_per_segment=True'` (e.g. supplied to snakemake via --config command line argument) +# will change the behaviour of the workflow to use the same strains for each segment. 
This is achieved via these steps: +# (1) Filter the HA segment as normal plus filter to those strains with 8 segments +# (2) Filter the other segments by simply force-including the same strains as (1) +SAME_STRAINS = bool(config.get('same_strains_per_segment', False)) + +S3_SRC = config.get('s3_src', {}) +LOCAL_INGEST = config.get('local_ingest', None) + +def sanity_check_config(): + assert LOCAL_INGEST or S3_SRC, "The config must define either 's3_src' or 'local_ingest'" + # NOTE: we could relax the following exclusivity of S3_SRC and LOCAL_INGEST + # if we want to use `--config local_ingest=gisaid` overrides. + assert not (S3_SRC and LOCAL_INGEST), "The config defined both 'local_ingest' and 's3_src', which are mutually exclusive" + if S3_SRC: + assert isinstance(S3_SRC, dict) and all([k in S3_SRC for k in ("name", "sequences", "metadata")]), \ + "Config 's3_src' must be a dict with 'name', 'sequences' and 'metadata' keys" + +sanity_check_config() + +def collect_builds(): + """ + iteratively create workflow targets from config.builds for the `all` rule + you can over-ride this by specifying targets (filenames) on the command line + """ + targets = [] + for subtype,times in config.get('builds', {}).items(): + for segment in config.get('segments', []): + if len(times): + for time in times: + targets.append(f"auspice/avian-flu_{subtype}_{segment}_{time}.json") + else: + targets.append(f"auspice/avian-flu_{subtype}_{segment}.json") + return targets + +rule all: + input: + auspice_json = collect_builds() + +# This must be after the `all` rule above since it depends on its inputs +include: "rules/deploy.smk" + +rule test_target: + """ + For testing purposes such as CI workflows. 
+ """ + input: "auspice/avian-flu_h5n1_ha_all-time.json" + +rule files: + params: + dropped_strains = config['dropped_strains'], + include_strains = config['include_strains'], + reference = config['reference'], + colors = config['colors'], + # TODO - Augur 24.4.0 includes extensive lat-longs by default - can we drop the following avian-flu specific ones? + lat_longs = config['lat_longs'], + auspice_config = config['auspice_config'], + clades_file = config['clades_file'], + description = config['description'], + +files = rules.files.params + + +def subtypes_by_subtype_wildcard(wildcards): + db = { + 'h5nx': ['h5n1', 'h5n2', 'h5n3', 'h5n4', 'h5n5', 'h5n6', 'h5n7', 'h5n8', 'h5n9'], + 'h5n1': ['h5n1'], + 'h7n9': ['h7n9'], + 'h9n2': ['h9n2'], + } + db['h5n1-cattle-outbreak'] = [*db['h5nx']] + assert wildcards.subtype in db, (f"Subtype {wildcards.subtype!r} is not defined in the snakemake function " + "`subtypes_by_subtype_wildcard` -- is there a typo in the subtype you are targetting?") + return(db[wildcards.subtype]) + +rule download_sequences: + output: + sequences = f"data/{S3_SRC.get('name', None)}/sequences_{{segment}}.fasta", + params: + address=lambda w: S3_SRC.get('sequences', None).format(segment=w.segment) + shell: + """ + aws s3 cp {params.address:q} - | zstd -d > {output.sequences} + """ + +rule download_metadata: + output: + metadata = f"data/{S3_SRC.get('name', None)}/metadata.tsv", + params: + address=S3_SRC.get('metadata', None), + shell: + """ + aws s3 cp {params.address:q} - | zstd -d > {output.metadata} + """ + + +def input_metadata(wildcards): + if S3_SRC: + return f"data/{S3_SRC['name']}/metadata.tsv", + elif LOCAL_INGEST: + return f"ingest/{LOCAL_INGEST}/results/metadata.tsv", + raise Exception() # already caught by `sanity_check_config` above, this is just being cautious + +def input_sequences(wildcards): + if S3_SRC: + return f"data/{S3_SRC['name']}/sequences_{wildcards.segment}.fasta", + elif LOCAL_INGEST: + return 
f"ingest/{LOCAL_INGEST}/results/sequences_{wildcards.segment}.fasta" + raise Exception() # already caught by `sanity_check_config` above, this is just being cautious + + +rule filter_sequences_by_subtype: + input: + sequences = input_sequences, + metadata = input_metadata, + output: + sequences = "results/{subtype}/{segment}/sequences.fasta", + params: + subtypes=subtypes_by_subtype_wildcard, + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --query "subtype in {params.subtypes!r}" \ + --output-sequences {output.sequences} + """ + +rule filter_metadata_by_subtype: + input: + metadata = input_metadata, + output: + metadata = "results/{subtype}/metadata.tsv", + params: + subtypes=subtypes_by_subtype_wildcard, + shell: + """ + augur filter \ + --metadata {input.metadata} \ + --query "subtype in {params.subtypes!r}" \ + --output-metadata {output.metadata} + """ + +def metadata_by_wildcards(wildcards): + # H5 builds have extra clade-level metadata added to the metadata TSV. + # We may move this to a node-data JSON which would simplify the snakemake logic + # a bit -- see + if wildcards.subtype in ("h5n1", "h5nx", "h5n1-cattle-outbreak"): + return "results/{subtype}/metadata-with-clade.tsv" + else: + return "results/{subtype}/metadata.tsv", + + +def get_config(rule_name, rule_key, wildcards, segment=None, fallback="FALLBACK"): + assert rule_name in config, f"Config missing top-level {rule_name} key" + assert rule_key in config[rule_name], f"Config missing entry for {rule_name}.{rule_key}" + try: + return config[rule_name][rule_key][wildcards.subtype][wildcards.time] + except KeyError: + assert fallback in config[rule_name][rule_key], f"config.{rule_name!r}.{rule_key!r} either needs " \ + f"an entry for {wildcards.subtype!r}.{wildcards.time!r} added or a (default) {fallback!r} key." 
+ return config[rule_name][rule_key][fallback] + +def refine_clock_rates(w): + info = get_config('refine', 'clock_rates', w) + + if w.segment == 'genome': + # calculate the genome rate via a weighted average of the segment rates + assert 'genome' not in info, ("This snakemake pipeline is currently set up to calculate the genome clock rate " + "based on the segment rates, however you have provided a genome rate in the config.") + try: + segment_lengths= get_config('refine', 'segment_lengths', w) + except AssertionError as e: + # py11 brings e.add_note() which is nicer + e.args = (*e.args, "NOTE: For segment=genome we require the segment_lengths to be defined as we use them to calculate the clock rate") + raise + mean = sum([info[seg][0]*length for seg,length in segment_lengths.items()])/sum(segment_lengths.values()) + stdev = mean/2 + return f"--clock-rate {mean} --clock-std-dev {stdev}" + + assert w.segment in info, 'config.refine.clock_rates: data must be provided for each segment. Use "" for inference.' 
+ if info[w.segment] == "": + return "" + + assert isinstance(info[w.segment], list), "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)" + assert len(info[w.segment])==2, "The clock rates for {w.subtype!r} {w.time!r} {w.segment!r} must be a list of (rate, std-dev)" + return f"--clock-rate {info[w.segment][0]} --clock-std-dev {info[w.segment][1]}" + +def refine_clock_filter(w): + filter = get_config('refine', 'clock_filter_iqd', w) + return f"--clock-filter-iqd {filter}" if filter else "" + + +rule add_h5_clade: + message: "Adding in a column for h5 clade numbering" + input: + metadata = "results/{subtype}/metadata.tsv", + clades_file = files.clades_file + output: + metadata= "results/{subtype}/metadata-with-clade.tsv" + shell: + """ + python clade-labeling/add-clades.py \ + --metadata {input.metadata} \ + --output {output.metadata} \ + --clades {input.clades_file} + """ + +def _filter_params(wildcards, input, output, threads, resources): + """ + Generate the arguments to `augur filter`. When we are running independent analyses + (i.e. not using the same strains for each segment), then we generate a full set of + filter parameters here. + When we are using the same sequences for each segment, then for HA we use a full + filter call and for the rest of the segments we filter to the strains chosen for HA + """ + + assert wildcards.segment!='genome', "Don't use this function for genome builds!" + + # For non-HA segments when we are using the SAME_STRAINS for all segments + # we have a simple filtering approach: match what we're using for HA! + # We also include the "force-include" list; 99% of the time these strains will already be in + # the HA strain list (as they were force-included there too) but there may be rare occasions + # where we force-include a strain which does not have a HA sequence. 
+ if input.strains: + # some basic error checking to guard against inadvertent changes in the future + if not (SAME_STRAINS and wildcards.segment!='ha'): + raise Exception("A strains input should only be present for SAME_STRAINS + HA!") + return f"--exclude-all --include {input.strains} {input.include}" + + exclude_where = get_config('filter', 'exclude_where', wildcards) + # If SAME_STRAINS (and due to the above conditional we have the HA segment at this point) + # then we want to restrict to strains present in all 8 segments. Note that force-included + # strains may not have all segments, but that's preferable to filtering them out. + if SAME_STRAINS: + exclude_where += f" n_segments!={len(SEGMENTS)}" + + + cmd = "" + + group_by_value = get_config('filter', 'group_by', wildcards) + cmd += f" --group-by {group_by_value}" if group_by_value else "" + + cmd += f" --subsample-max-sequences {config['target_sequences_per_tree']}" + cmd += f" --min-date {get_config('filter', 'min_date', wildcards)}" + cmd += f" --include {input.include}" + cmd += f" --exclude-where {exclude_where}" + cmd += f" --min-length {get_config('filter', 'min_length', wildcards)[wildcards.segment]}" + cmd += f" --non-nucleotide" + return cmd + +rule filter: + input: + sequences = "results/{subtype}/{segment}/sequences.fasta", + metadata = metadata_by_wildcards, + exclude = files.dropped_strains, + include = files.include_strains, + strains = lambda w: f"results/{w.subtype}/ha/{w.time}/filtered.txt" if (SAME_STRAINS and w.segment!='ha') else [], + output: + sequences = "results/{subtype}/{segment}/{time}/filtered.fasta", + strains = "results/{subtype}/{segment}/{time}/filtered.txt", + params: + args = _filter_params, + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --exclude {input.exclude} \ + --output-sequences {output.sequences} \ + --output-strains {output.strains} \ + {params.args} + """ + +rule align: + message: + """ + Aligning sequences to 
{input.reference}
      - filling gaps with N
    """
    input:
        sequences = rules.filter.output.sequences,
        reference = files.reference
    output:
        alignment = "results/{subtype}/{segment}/{time}/aligned.fasta"
    wildcard_constraints:
        # for genome builds we don't use this rule; see `rule join_segments`
        segment = "|".join(seg for seg in SEGMENTS)
    threads:
        4
    shell:
        """
        augur align \
            --sequences {input.sequences} \
            --reference-sequence {input.reference} \
            --output {output.alignment} \
            --remove-reference \
            --nthreads {threads}
        """


rule tree:
    message: "Building tree"
    input:
        # Note that the alignment input may come from `rule align` or `rule join_segments`
        alignment = rules.align.output.alignment
    output:
        tree = "results/{subtype}/{segment}/{time}/tree-raw.nwk"
    params:
        method = "iqtree"
    threads:
        4
    shell:
        """
        augur tree \
            --alignment {input.alignment} \
            --output {output.tree} \
            --method {params.method} \
            --nthreads {threads}
        """


def refine_root(wildcards):
    """
    Return the `--root <value>` argument for `augur refine`, or an empty string
    when no root is configured. Genome builds read a separate config key.
    """
    root = get_config('refine', 'genome_root', wildcards) \
        if wildcards.segment=='genome' \
        else get_config('refine', 'root', wildcards)
    return f"--root {root}" if root else ""

rule refine:
    message:
        """
        Refining tree
          - estimate timetree
          - use {params.coalescent} coalescent timescale
          - estimate {params.date_inference} node dates
        """
    input:
        tree = rules.tree.output.tree,
        alignment = rules.align.output,
        metadata = metadata_by_wildcards,
    output:
        tree = "results/{subtype}/{segment}/{time}/tree.nwk",
        node_data = "results/{subtype}/{segment}/{time}/branch-lengths.json"
    params:
        coalescent = config['refine']['coalescent'],
        date_inference = config['refine']['date_inference'],
        # the clock_rates / clock_filter / root params are functions which
        # return fully-formed CLI arguments (or empty strings) per-wildcard
        clock_rates = refine_clock_rates,
        clock_filter = refine_clock_filter,
        root = refine_root,
    shell:
        """
        augur refine \
            --tree {input.tree} \
            --alignment {input.alignment} \
            --metadata {input.metadata} \
            --output-tree {output.tree} \
            --output-node-data {output.node_data} \
            --timetree \
            {params.root} \
            --coalescent {params.coalescent} \
            --date-confidence \
            --date-inference {params.date_inference} \
            {params.clock_rates} \
            {params.clock_filter}
        """


def refined_tree(w):
    """
    Return the refined tree to be used for export, traits, ancestry reconstruction etc
    The cattle-flu build introduces an additional step beyond `augur refine`, which is
    why this function exists.
    """
    if w.subtype=='h5n1-cattle-outbreak' and w.segment!='genome':
        return "results/{subtype}/{segment}/{time}/tree_outbreak-clade.nwk"
    # NOTE: fixed a trailing comma here which made this branch return a
    # 1-tuple rather than a string (inconsistent with the branch above)
    return "results/{subtype}/{segment}/{time}/tree.nwk"

def ancestral_root_seq(wildcards):
    """
    Return the `--root-sequence <value>` argument for `augur ancestral`, or an
    empty string when no root sequence is configured. Genome builds read a
    separate config key.
    """
    root_seq = get_config('ancestral', 'genome_root_seq', wildcards) \
        if wildcards.segment=='genome' \
        else get_config('ancestral', 'root_seq', wildcards)
    return f"--root-sequence {root_seq}" if root_seq else ""

rule ancestral:
    message: "Reconstructing ancestral sequences and mutations"
    input:
        tree = refined_tree,
        alignment = rules.align.output
    output:
        node_data = "results/{subtype}/{segment}/{time}/nt-muts.json"
    params:
        inference = config['ancestral']['inference'],
        root_seq = ancestral_root_seq,
    shell:
        """
        augur ancestral \
            --tree {input.tree} \
            --alignment {input.alignment} \
            --output-node-data {output.node_data} \
            --inference {params.inference} \
            {params.root_seq} \
            --keep-ambiguous
        """

rule translate:
    message: "Translating amino acid sequences"
    input:
        tree = refined_tree,
        node_data = rules.ancestral.output.node_data,
        # genome builds use a dedicated (whole-genome) reference annotation
        reference = lambda w: config['genome_reference'] if w.segment=='genome' else files.reference
    output:
        node_data = "results/{subtype}/{segment}/{time}/aa-muts.json"
    shell:
        """
        augur translate \
            --tree {input.tree} \
            --ancestral-sequences {input.node_data} \
            --reference-sequence {input.reference} \
            --output {output.node_data}
        """

def traits_params(wildcards):
    """
    Assemble the CLI arguments for `augur traits` from the config: the columns
    to reconstruct, an optional sampling-bias correction, and an optional
    confidence flag. Genome builds read separate config keys.
    """
    columns = get_config('traits', 'genome_columns', wildcards) \
        if wildcards.segment=='genome' \
        else get_config('traits', 'columns', wildcards)

    bias = get_config('traits', 'genome_sampling_bias_correction', wildcards) \
        if wildcards.segment=='genome' \
        else get_config('traits', 'sampling_bias_correction', wildcards)

    confidence = get_config('traits', 'confidence', wildcards)

    args = f"--columns {columns}"
    if bias:
        args += f" --sampling-bias-correction {bias}"
    if confidence:
        # (was an f-string with no placeholders)
        args += " --confidence"
    return args


rule traits:
    input:
        tree = refined_tree,
        metadata = metadata_by_wildcards,
    output:
        node_data = "results/{subtype}/{segment}/{time}/traits.json",
    params:
        info = traits_params,
    shell:
        """
        augur traits \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --output {output.node_data} \
            {params.info}
        """


rule cleavage_site:
    """
    Cleavage sites are only computed for HA and are unchanged depending on the build. As such, they are not
    parameterised in the config file
    """
    input:
        alignment = "results/{subtype}/ha/{time}/aligned.fasta"
    output:
        cleavage_site_annotations = "results/{subtype}/ha/{time}/cleavage-site.json",
        cleavage_site_sequences = "results/{subtype}/ha/{time}/cleavage-site-sequences.json"
    shell:
        """
        python scripts/annotate-ha-cleavage-site.py \
            --alignment {input.alignment} \
            --furin_site_motif {output.cleavage_site_annotations} \
            --cleavage_site_sequence {output.cleavage_site_sequences}
        """

def export_node_data_files(wildcards):
    """
    Return the list of node-data JSONs supplied to `augur export`. The
    cattle-outbreak per-segment builds add the node data produced by the
    tree-pruning step (`rule prune_tree`).
    """
    nd = [
        rules.refine.output.node_data,
        rules.traits.output.node_data,
        rules.ancestral.output.node_data,
        rules.translate.output.node_data,
        rules.cleavage_site.output.cleavage_site_annotations,
        rules.cleavage_site.output.cleavage_site_sequences,
    ]

    if wildcards.subtype=="h5n1-cattle-outbreak" and wildcards.segment!='genome':
        nd.append(rules.prune_tree.output.node_data)
    return nd


def additional_export_config(wildcards):
    """
    Return extra CLI arguments for `augur export v2` — currently only an
    optional `--title` override, formatted with the build's wildcards.
    Genome builds read a separate config key.
    """
    args = ""

    title_overrides = get_config('export', 'genome_title', wildcards) \
        if wildcards.segment=='genome' \
        else get_config('export', 'title', wildcards)
    if title_overrides:
        args += ("--title '" +
            title_overrides.format(segment=wildcards.segment.upper(), subtype=wildcards.subtype.upper(), time=wildcards.time) +
            "'")

    return args

rule export:
    """
    Export the files into results/ and then use a subsequent rule to move these to the
    auspice/ directory
    """
    input:
        tree = refined_tree,
        metadata = metadata_by_wildcards,
        node_data = export_node_data_files,
        colors = files.colors,
        lat_longs = files.lat_longs,
        auspice_config = files.auspice_config,
        description = files.description
    output:
        auspice_json = "results/{subtype}/{segment}/{time}/auspice-dataset.json"
    params:
        additional_config = additional_export_config
    shell:
        """
        augur export v2 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --node-data {input.node_data} \
            --colors {input.colors} \
            --lat-longs {input.lat_longs} \
            --auspice-config {input.auspice_config} \
            --description {input.description} \
            --include-root-sequence-inline \
            {params.additional_config} \
            --output {output.auspice_json}
        """

def auspice_name_to_wildcard_name(wildcards):
    """
    Used to link Auspice JSONs filenames to their intermediate filename which includes all wildcards.
    Examples:
    1. subtype + segment + time in their filename / URL,
       e.g. "avian-flu_h5n1_ha_2y.json" (nextstrain.org/avian-flu/h5n1/ha/2y)
       maps to subtype=h5n1, segment=ha, time=2y
    2. subtype + segment in their filename / URL,
       e.g. "avian-flu_h5n1-cattle-outbreak_ha.json" (nextstrain.org/avian-flu/h5n1-cattle-outbreak/ha)
       maps to subtype=h5n1-cattle-outbreak, segment=ha, time=default
    """
    parts = wildcards.parts.split("_")
    if len(parts)==3:
        [subtype, segment, time] = parts
        assert segment!='genome', "Genome builds are not available for this build"
        return f"results/{subtype}/{segment}/{time}/auspice-dataset.json"
    if len(parts)==2:
        [subtype, segment] = parts
        # (message previously said "h5n1 builds", contradicting the assert condition)
        assert subtype=='h5n1-cattle-outbreak', "Only h5n1-cattle-outbreak builds produce an Auspice dataset without a time component in the filename"
        return f"results/{subtype}/{segment}/default/auspice-dataset.json"
    raise Exception("Auspice JSON filename requested with an unexpected number of (underscore-separated) parts")


rule rename_auspice_datasets:
    """
    This allows us to create files in auspice/ which mirror the intended URL structure rather than
    the wildcard structure we use in the workflow.
    """
    input:
        json = auspice_name_to_wildcard_name
    output:
        json = "auspice/avian-flu_{parts}.json"
    wildcard_constraints:
        # relax the global `[^_]+` style constraints so that `parts` can
        # contain underscores (e.g. "h5n1_ha_2y"). NOTE: this previously
        # constrained a non-existent `timepart` wildcard, which had no effect.
        parts = ".*"
    shell:
        """
        cp {input.json} {output.json}
        """


rule clean:
    message: "Removing directories: {params}"
    params:
        "results",
        "auspice"
    shell:
        "rm -rfv {params}"