From c2f651cda6f9d08591c80758adb80b1248144e31 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:52:23 -0800 Subject: [PATCH 01/10] Move phylogenetic workflow to a phylogenetic directory Move phylogenetic workflow from top-level to phylogenetic directory to follow the [Pathogen Repo Guide](https://github.com/nextstrain/pathogen-repo-guide/tree/main) --- Snakefile => phylogenetic/Snakefile | 0 {config => phylogenetic/defaults}/auspice_config.json | 0 {config => phylogenetic/defaults}/colors.tsv | 0 {config => phylogenetic/defaults}/config.yaml | 8 ++++---- {config => phylogenetic/defaults}/dropped_strains.txt | 0 {config => phylogenetic/defaults}/measles_reference.gb | 0 {example_data => phylogenetic/example_data}/metadata.tsv | 0 .../example_data}/sequences.fasta | 0 8 files changed, 4 insertions(+), 4 deletions(-) rename Snakefile => phylogenetic/Snakefile (100%) rename {config => phylogenetic/defaults}/auspice_config.json (100%) rename {config => phylogenetic/defaults}/colors.tsv (100%) rename {config => phylogenetic/defaults}/config.yaml (57%) rename {config => phylogenetic/defaults}/dropped_strains.txt (100%) rename {config => phylogenetic/defaults}/measles_reference.gb (100%) rename {example_data => phylogenetic/example_data}/metadata.tsv (100%) rename {example_data => phylogenetic/example_data}/sequences.fasta (100%) diff --git a/Snakefile b/phylogenetic/Snakefile similarity index 100% rename from Snakefile rename to phylogenetic/Snakefile diff --git a/config/auspice_config.json b/phylogenetic/defaults/auspice_config.json similarity index 100% rename from config/auspice_config.json rename to phylogenetic/defaults/auspice_config.json diff --git a/config/colors.tsv b/phylogenetic/defaults/colors.tsv similarity index 100% rename from config/colors.tsv rename to phylogenetic/defaults/colors.tsv diff --git a/config/config.yaml b/phylogenetic/defaults/config.yaml similarity index 57% rename from 
config/config.yaml rename to phylogenetic/defaults/config.yaml index 1fe71e7..f8ca2c1 100644 --- a/config/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -1,8 +1,8 @@ files: - exclude: "config/dropped_strains.txt" - reference: "config/measles_reference.gb" - colors: "config/colors.tsv" - auspice_config: "config/auspice_config.json" + exclude: "defaults/dropped_strains.txt" + reference: "defaults/measles_reference.gb" + colors: "defaults/colors.tsv" + auspice_config: "defaults/auspice_config.json" filter: group_by: "country year month" sequences_per_group: 20 diff --git a/config/dropped_strains.txt b/phylogenetic/defaults/dropped_strains.txt similarity index 100% rename from config/dropped_strains.txt rename to phylogenetic/defaults/dropped_strains.txt diff --git a/config/measles_reference.gb b/phylogenetic/defaults/measles_reference.gb similarity index 100% rename from config/measles_reference.gb rename to phylogenetic/defaults/measles_reference.gb diff --git a/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv similarity index 100% rename from example_data/metadata.tsv rename to phylogenetic/example_data/metadata.tsv diff --git a/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta similarity index 100% rename from example_data/sequences.fasta rename to phylogenetic/example_data/sequences.fasta From c0157a0c35cb468ffe086a0b163fdd961937d4f4 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:45:19 -0800 Subject: [PATCH 02/10] Move rules for preparing sequences to its own smk file Part of work to update this repo to match the pathogen-repo-guide. 
--- phylogenetic/Snakefile | 82 +----------------- phylogenetic/rules/prepare_sequences.smk | 102 +++++++++++++++++++++++ 2 files changed, 104 insertions(+), 80 deletions(-) create mode 100644 phylogenetic/rules/prepare_sequences.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index e44939f..e3532bc 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,88 +1,10 @@ -configfile: "config/config.yaml" +configfile: "defaults/config.yaml" rule all: input: auspice_json = "auspice/measles.json", -rule download: - """Downloading sequences and metadata from data.nextstrain.org""" - output: - sequences = "data/sequences.fasta.zst", - metadata = "data/metadata.tsv.zst" - params: - sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.zst", - metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.zst" - shell: - """ - curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} - curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} - """ - -rule decompress: - """Decompressing sequences and metadata""" - input: - sequences = "data/sequences.fasta.zst", - metadata = "data/metadata.tsv.zst" - output: - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv" - shell: - """ - zstd -d -c {input.sequences} > {output.sequences} - zstd -d -c {input.metadata} > {output.metadata} - """ - -rule filter: - """ - Filtering to - - {params.sequences_per_group} sequence(s) per {params.group_by!s} - - from {params.min_date} onwards - - excluding strains in {input.exclude} - - minimum genome length of {params.min_length} - """ - input: - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", - exclude = config["files"]["exclude"] - output: - sequences = "results/filtered.fasta" - params: - group_by = config["filter"]["group_by"], - sequences_per_group = config["filter"]["sequences_per_group"], - min_date = config["filter"]["min_date"], - min_length = 
config["filter"]["min_length"] - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --exclude {input.exclude} \ - --output {output.sequences} \ - --group-by {params.group_by} \ - --sequences-per-group {params.sequences_per_group} \ - --min-date {params.min_date} \ - --min-length {params.min_length} - """ - -rule align: - """ - Aligning sequences to {input.reference} - - filling gaps with N - """ - input: - sequences = "results/filtered.fasta", - reference = config["files"]["reference"] - output: - alignment = "results/aligned.fasta" - shell: - """ - augur align \ - --sequences {input.sequences} \ - --reference-sequence {input.reference} \ - --output {output.alignment} \ - --fill-gaps \ - --remove-reference - """ +include: "rules/prepare_sequences.smk" rule tree: """Building tree""" diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk new file mode 100644 index 0000000..0344e24 --- /dev/null +++ b/phylogenetic/rules/prepare_sequences.smk @@ -0,0 +1,102 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + sequences = data/sequences.fasta + reference = ../shared/reference.fasta + +OUTPUTS: + + prepared_sequences = results/prepared_sequences.fasta + +This part of the workflow usually includes the following steps: + + - augur index + - augur filter + - augur align + - augur mask + +See Augur's usage docs for these commands for more details. 
+""" +rule download: + """Downloading sequences and metadata from data.nextstrain.org""" + output: + sequences = "data/sequences.fasta.zst", + metadata = "data/metadata.tsv.zst" + params: + sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.zst", + metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.zst" + shell: + """ + curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} + curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} + """ + +rule decompress: + """Decompressing sequences and metadata""" + input: + sequences = "data/sequences.fasta.zst", + metadata = "data/metadata.tsv.zst" + output: + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv" + shell: + """ + zstd -d -c {input.sequences} > {output.sequences} + zstd -d -c {input.metadata} > {output.metadata} + """ + +rule filter: + """ + Filtering to + - {params.sequences_per_group} sequence(s) per {params.group_by!s} + - from {params.min_date} onwards + - excluding strains in {input.exclude} + - minimum genome length of {params.min_length} + """ + input: + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv", + exclude = config["files"]["exclude"] + output: + sequences = "results/filtered.fasta" + params: + group_by = config["filter"]["group_by"], + sequences_per_group = config["filter"]["sequences_per_group"], + min_date = config["filter"]["min_date"], + min_length = config["filter"]["min_length"] + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --exclude {input.exclude} \ + --output {output.sequences} \ + --group-by {params.group_by} \ + --sequences-per-group {params.sequences_per_group} \ + --min-date {params.min_date} \ + --min-length {params.min_length} + """ + +rule align: + """ + Aligning sequences to {input.reference} + - filling gaps with N + """ + input: + sequences = "results/filtered.fasta", + reference = 
config["files"]["reference"] + output: + alignment = "results/aligned.fasta" + shell: + """ + augur align \ + --sequences {input.sequences} \ + --reference-sequence {input.reference} \ + --output {output.alignment} \ + --fill-gaps \ + --remove-reference + """ + \ No newline at end of file From 1b202bc25a9e4f303eba4ed425bfcc5099caf6c9 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:53:31 -0800 Subject: [PATCH 03/10] Move rules for constructing phylogeny to its own smk file Part of work to update this repo to match the pathogen-repo-guide. --- phylogenetic/Snakefile | 47 +-------------- phylogenetic/rules/construct_phylogeny.smk | 68 ++++++++++++++++++++++ 2 files changed, 69 insertions(+), 46 deletions(-) create mode 100644 phylogenetic/rules/construct_phylogeny.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index e3532bc..a312463 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -5,53 +5,8 @@ rule all: auspice_json = "auspice/measles.json", include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" -rule tree: - """Building tree""" - input: - alignment = "results/aligned.fasta" - output: - tree = "results/tree_raw.nwk" - shell: - """ - augur tree \ - --alignment {input.alignment} \ - --output {output.tree} - """ - -rule refine: - """ - Refining tree - - estimate timetree - - use {params.coalescent} coalescent timescale - - estimate {params.date_inference} node dates - - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation - """ - input: - tree = "results/tree_raw.nwk", - alignment = "results/aligned.fasta", - metadata = "data/metadata.tsv" - output: - tree = "results/tree.nwk", - node_data = "results/branch_lengths.json" - params: - coalescent = config["refine"]["coalescent"], - date_inference = config["refine"]["date_inference"], - clock_filter_iqd = config["refine"]["clock_filter_iqd"] - shell: - """ - augur refine \ 
- --tree {input.tree} \ - --alignment {input.alignment} \ - --metadata {input.metadata} \ - --output-tree {output.tree} \ - --output-node-data {output.node_data} \ - --timetree \ - --coalescent {params.coalescent} \ - --date-confidence \ - --date-inference {params.date_inference} \ - --clock-filter-iqd {params.clock_filter_iqd} - """ rule ancestral: """Reconstructing ancestral sequences and mutations""" diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk new file mode 100644 index 0000000..43ab05f --- /dev/null +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -0,0 +1,68 @@ +""" +This part of the workflow constructs the phylogenetic tree. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + +OUTPUTS: + + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + +This part of the workflow usually includes the following steps: + + - augur tree + - augur refine + +See Augur's usage docs for these commands for more details. 
+""" + +rule tree: + """Building tree""" + input: + alignment = "results/aligned.fasta" + output: + tree = "results/tree_raw.nwk" + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --output {output.tree} + """ + +rule refine: + """ + Refining tree + - estimate timetree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + tree = "results/tree_raw.nwk", + alignment = "results/aligned.fasta", + metadata = "data/metadata.tsv" + output: + tree = "results/tree.nwk", + node_data = "results/branch_lengths.json" + params: + coalescent = config["refine"]["coalescent"], + date_inference = config["refine"]["date_inference"], + clock_filter_iqd = config["refine"]["clock_filter_iqd"] + shell: + """ + augur refine \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --metadata {input.metadata} \ + --output-tree {output.tree} \ + --output-node-data {output.node_data} \ + --timetree \ + --coalescent {params.coalescent} \ + --date-confidence \ + --date-inference {params.date_inference} \ + --clock-filter-iqd {params.clock_filter_iqd} + """ + \ No newline at end of file From 29211114e6d17d358cb2e22be25c8e695130e227 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:18:28 -0800 Subject: [PATCH 04/10] Move rules for annotating phylogeny to its own smk file Part of work to update this repo to match the pathogen-repo-guide. 
--- phylogenetic/Snakefile | 36 +----------- phylogenetic/rules/annotate_phylogeny.smk | 67 +++++++++++++++++++++++ 2 files changed, 68 insertions(+), 35 deletions(-) create mode 100644 phylogenetic/rules/annotate_phylogeny.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index a312463..138c79c 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -6,43 +6,9 @@ rule all: include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" -rule ancestral: - """Reconstructing ancestral sequences and mutations""" - input: - tree = "results/tree.nwk", - alignment = "results/aligned.fasta" - output: - node_data = "results/nt_muts.json" - params: - inference = config["ancestral"]["inference"] - shell: - """ - augur ancestral \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} - """ - -rule translate: - """Translating amino acid sequences""" - input: - tree = "results/tree.nwk", - node_data = "results/nt_muts.json", - reference = config["files"]["reference"] - output: - node_data = "results/aa_muts.json" - shell: - """ - augur translate \ - --tree {input.tree} \ - --ancestral-sequences {input.node_data} \ - --reference-sequence {input.reference} \ - --output {output.node_data} \ - """ - rule export: """Exporting data files for for auspice""" input: diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk new file mode 100644 index 0000000..61a0bd6 --- /dev/null +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -0,0 +1,67 @@ +""" +This part of the workflow creates additonal annotations for the phylogenetic tree. 
+ +REQUIRED INPUTS: + + metadata = data/metadata.tsv + prepared_sequences = results/prepared_sequences.fasta + tree = results/tree.nwk + +OUTPUTS: + + node_data = results/*.json + + There are no required outputs for this part of the workflow as it depends + on which annotations are created. All outputs are expected to be node data + JSON files that can be fed into `augur export`. + + See Nextstrain's data format docs for more details on node data JSONs: + https://docs.nextstrain.org/page/reference/data-formats.html + +This part of the workflow usually includes the following steps: + + - augur traits + - augur ancestral + - augur translate + - augur clades + +See Augur's usage docs for these commands for more details. + +Custom node data files can also be produced by build-specific scripts in addition +to the ones produced by Augur commands. +""" + +rule ancestral: + """Reconstructing ancestral sequences and mutations""" + input: + tree = "results/tree.nwk", + alignment = "results/aligned.fasta" + output: + node_data = "results/nt_muts.json" + params: + inference = config["ancestral"]["inference"] + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} + """ + +rule translate: + """Translating amino acid sequences""" + input: + tree = "results/tree.nwk", + node_data = "results/nt_muts.json", + reference = config["files"]["reference"] + output: + node_data = "results/aa_muts.json" + shell: + """ + augur translate \ + --tree {input.tree} \ + --ancestral-sequences {input.node_data} \ + --reference-sequence {input.reference} \ + --output {output.node_data} \ + """ From 88f2e683b276804460ecca10157729b6b983bd1b Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:36:14 -0800 Subject: [PATCH 05/10] Move rule for exporting auspice json to its own smk file Part of work to update this repo to match the 
pathogen-repo-guide. --- phylogenetic/Snakefile | 26 +----------------- phylogenetic/rules/export.smk | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 25 deletions(-) create mode 100644 phylogenetic/rules/export.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 138c79c..e2779b6 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -7,31 +7,7 @@ rule all: include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" - - -rule export: - """Exporting data files for for auspice""" - input: - tree = "results/tree.nwk", - metadata = "data/metadata.tsv", - branch_lengths = "results/branch_lengths.json", - nt_muts = "results/nt_muts.json", - aa_muts = "results/aa_muts.json", - colors = config["files"]["colors"], - auspice_config = config["files"]["auspice_config"] - output: - auspice_json = rules.all.input.auspice_json - shell: - """ - augur export v2 \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \ - --colors {input.colors} \ - --auspice-config {input.auspice_config} \ - --include-root-sequence \ - --output {output.auspice_json} - """ +include: "rules/export.smk" rule clean: """Removing directories: {params}""" diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk new file mode 100644 index 0000000..76e00df --- /dev/null +++ b/phylogenetic/rules/export.smk @@ -0,0 +1,51 @@ +""" +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. + +REQUIRED INPUTS: + + metadata = data/metadata.tsv + tree = results/tree.nwk + branch_lengths = results/branch_lengths.json + node_data = results/*.json + +OUTPUTS: + + auspice_json = auspice/${build_name}.json + + There are optional sidecar JSON files that can be exported as part of the dataset. 
+ See Nextstrain's data format docs for more details on sidecar files: + https://docs.nextstrain.org/page/reference/data-formats.html + +This part of the workflow usually includes the following steps: + + - augur export v2 + - augur frequencies + +See Augur's usage docs for these commands for more details. +""" + +rule export: + """Exporting data files for for auspice""" + input: + tree = "results/tree.nwk", + metadata = "data/metadata.tsv", + branch_lengths = "results/branch_lengths.json", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + colors = config["files"]["colors"], + auspice_config = config["files"]["auspice_config"] + output: + auspice_json = rules.all.input.auspice_json + shell: + """ + augur export v2 \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \ + --colors {input.colors} \ + --auspice-config {input.auspice_config} \ + --include-root-sequence \ + --output {output.auspice_json} + """ + \ No newline at end of file From c1da895b4b3a947119a8ddd4fbcca2af3cf5c3de Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Fri, 23 Feb 2024 11:05:31 -0800 Subject: [PATCH 06/10] Update the CI workflow Following the pathogen-repo-guide and https://github.com/nextstrain/zika/commit/efe11e381ca777ac7e784d7d04cce0d04fd6fbfe --- .github/workflows/ci.yaml | 23 +++++++++++++++++-- phylogenetic/Snakefile | 6 +++++ phylogenetic/build-configs/ci/config.yaml | 7 ++++++ .../build-configs/ci/copy_example_data.smk | 17 ++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 phylogenetic/build-configs/ci/config.yaml create mode 100644 phylogenetic/build-configs/ci/copy_example_data.smk diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b1f5bca..7cc7827 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -5,5 +5,24 @@ on: - pull_request jobs: - ci: - uses: 
nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master + pathogen-ci: + strategy: + matrix: + runtime: [docker, conda] + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + runtime: ${{ matrix.runtime }} + run: | + nextstrain build \ + phylogenetic \ + --configfile build-configs/ci/config.yaml + artifact-name: output-${{ matrix.runtime }} + artifact-paths: | + phylogenetic/auspice/ + phylogenetic/results/ + phylogenetic/benchmarks/ + phylogenetic/logs/ + phylogenetic/.snakemake/log/ \ No newline at end of file diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index e2779b6..c1bbbd6 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -9,6 +9,12 @@ include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" include: "rules/export.smk" +# Include custom rules defined in the config. +if "custom_rules" in config: + for rule_file in config["custom_rules"]: + + include: rule_file + rule clean: """Removing directories: {params}""" params: diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml new file mode 100644 index 0000000..de89c67 --- /dev/null +++ b/phylogenetic/build-configs/ci/config.yaml @@ -0,0 +1,7 @@ +# This configuration file contains the custom configurations parameters +# for the CI workflow to run with the example data. + +# Custom rules to run as part of the CI automated workflow +# The paths should be relative to the phylogenetic directory. 
+custom_rules: + - build-configs/ci/copy_example_data.smk diff --git a/phylogenetic/build-configs/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk new file mode 100644 index 0000000..c36eb54 --- /dev/null +++ b/phylogenetic/build-configs/ci/copy_example_data.smk @@ -0,0 +1,17 @@ +rule copy_example_data: + input: + sequences="example_data/sequences.fasta", + metadata="example_data/metadata.tsv", + output: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + shell: + """ + cp -f {input.sequences} {output.sequences} + cp -f {input.metadata} {output.metadata} + """ + +# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules +# that have the same output as the copy_example_data rule. + +ruleorder: copy_example_data > decompress From e86ab750daaabee69d83044eb8be92e6b9c30eb3 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:41:29 -0800 Subject: [PATCH 07/10] Update `README.md` files to match new workflow structure Update top-level and phylogenetic `README.md` files to match new workflow structure that includes ingest and phylogenetic directories, following the pathogen-repo-guide --- README.md | 74 +++++++++--------------------------------- phylogenetic/README.md | 50 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 58 deletions(-) create mode 100644 phylogenetic/README.md diff --git a/README.md b/README.md index 4625956..9eb08e4 100644 --- a/README.md +++ b/README.md @@ -1,67 +1,25 @@ -# nextstrain.org/measles +# Nextstrain repository for measles virus -This is the [Nextstrain](https://nextstrain.org) build for measles virus, visible at -[nextstrain.org/measles](https://nextstrain.org/measles). 
+This repository contains two workflows for the analysis of measles virus data: -The build encompasses fetching data, preparing it for analysis, doing quality -control, performing analyses, and saving the results in a format suitable for -visualization (with [auspice][]). This involves running components of -Nextstrain such as [augur][]. +- [`ingest/`](./ingest) - Download data from GenBank, clean and curate it +- [`phylogenetic/`](./phylogenetic) - Filter sequences, align, construct phylogeny and export for visualization -All measles-specific steps and functionality for the Nextstrain pipeline should be -housed in this repository. +Each folder contains a README.md with more information. The results of running both workflows are publicly visible at [nextstrain.org/measles](https://nextstrain.org/measles). -[![Build Status](https://github.com/nextstrain/measles/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/nextstrain/measles/actions/workflows/ci.yaml) +## Installation -## Usage +Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html) for Nextstrain's suite of software tools. -If you're unfamiliar with Nextstrain builds, you may want to follow our -[quickstart guide][] first and then come back here. +## Quickstart -The easiest way to run this pathogen build is using the [Nextstrain -command-line tool][nextstrain-cli]: +Run the default phylogenetic workflow via: +``` +cd phylogenetic/ +nextstrain build . +nextstrain view . +``` - nextstrain build . +## Documentation -See the [nextstrain-cli README][] for how to install the `nextstrain` command. - -Alternatively, you should be able to run the build using `snakemake` within a -suitably-configured local environment. Details of setting that up are not yet -well-documented, but will be in the future. - -Build output goes into the directories `data/`, `results/` and `auspice/`. 
- -Once you've run the build, you can view the results in auspice: - - nextstrain view auspice/ - - -## Configuration - -Configuration takes place entirely with the `Snakefile`. This can be read top-to-bottom, each rule -specifies its file inputs and output and also its parameters. There is little redirection and each -rule should be able to be reasoned with on its own. - - - -If you don't have access to our https endpoints, you can run the build using the -example data provided in this repository. Before running the build, copy the -example sequences into the `data/` directory like so: - - mkdir -p data/ - cp example_data/* data/. - - -[Nextstrain]: https://nextstrain.org - -[augur]: https://github.com/nextstrain/augur -[auspice]: https://github.com/nextstrain/auspice -[snakemake cli]: https://snakemake.readthedocs.io/en/stable/executable.html#all-options -[nextstrain-cli]: https://github.com/nextstrain/cli -[nextstrain-cli README]: https://github.com/nextstrain/cli/blob/master/README.md -[quickstart guide]: https://nextstrain.org/docs/getting-started/quickstart +- [Running a pathogen workflow](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) diff --git a/phylogenetic/README.md b/phylogenetic/README.md new file mode 100644 index 0000000..8ead834 --- /dev/null +++ b/phylogenetic/README.md @@ -0,0 +1,50 @@ +# nextstrain.org/measles + +This is the [Nextstrain](https://nextstrain.org) build for measles, visible at +[nextstrain.org/measles](https://nextstrain.org/measles). + +## Software requirements + +Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html) +for Nextstrain's suite of software tools. + +## Usage + +If you're unfamiliar with Nextstrain builds, you may want to follow our +[Running a Pathogen Workflow guide](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) first and then come back here. 
+ +The easiest way to run this pathogen build is using the Nextstrain +command-line tool from within the `phylogenetic/` directory: + + cd phylogenetic/ + nextstrain build . + +Build output goes into the directories `data/`, `results/` and `auspice/`. + +Once you've run the build, you can view the results with: + + nextstrain view . + +## Configuration + +Configuration takes place entirely with the `Snakefile`. This can be read +top-to-bottom, each rule specifies its file inputs and output and also its +parameters. There is little redirection and each rule should be able to be +reasoned with on its own. + +### Using GenBank data + +This build starts by pulling preprocessed sequence and metadata files from: + +* https://data.nextstrain.org/files/measles/sequences.fasta.zst +* https://data.nextstrain.org/files/measles/metadata.tsv.zst + +The above datasets have been preprocessed and cleaned from GenBank. + +### Using example data + +Alternatively, you can run the build using the +example data provided in this repository. To run the build by copying the +example sequences into the `data/` directory, use the following: + + nextstrain build . 
--configfile build-configs/ci/config.yaml From 0a9055ec08ae87054a5db92bc24b67836db94d7f Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:48:27 -0800 Subject: [PATCH 08/10] Add top-level `nextstrain-pathogen.yaml` Add empty top-level `nextstrain-pathogen.yaml` to allow `nextstrain build` to work from any directory regardless of runtime, as described [here](https://github.com/nextstrain/pathogen-repo-guide/commit/e318589ccb1ad49b5ab21f9856ba612bcff796db) --- nextstrain-pathogen.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 nextstrain-pathogen.yaml diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml new file mode 100644 index 0000000..b74c50d --- /dev/null +++ b/nextstrain-pathogen.yaml @@ -0,0 +1,5 @@ +# This is currently an empty file to indicate the top level pathogen repo. +# The inclusion of this file allows the Nextstrain CLI to run the +# `nextstrain build` from any directory regardless of runtime. +# +# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details. From 1b5c544ed42338e22ae0a8e15b44320ebb41274c Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Fri, 1 Mar 2024 10:38:16 -0800 Subject: [PATCH 09/10] Reduce header content for snakemake rules --- phylogenetic/rules/annotate_phylogeny.smk | 26 ---------------------- phylogenetic/rules/construct_phylogeny.smk | 15 ------------- phylogenetic/rules/export.smk | 20 ----------------- phylogenetic/rules/prepare_sequences.smk | 17 -------------- 4 files changed, 78 deletions(-) diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 61a0bd6..2f8eec4 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -1,34 +1,8 @@ """ This part of the workflow creates additonal annotations for the phylogenetic tree. 
-REQUIRED INPUTS: - - metadata = data/metadata.tsv - prepared_sequences = results/prepared_sequences.fasta - tree = results/tree.nwk - -OUTPUTS: - - node_data = results/*.json - - There are no required outputs for this part of the workflow as it depends - on which annotations are created. All outputs are expected to be node data - JSON files that can be fed into `augur export`. - - See Nextstrain's data format docs for more details on node data JSONs: - https://docs.nextstrain.org/page/reference/data-formats.html - -This part of the workflow usually includes the following steps: - - - augur traits - - augur ancestral - - augur translate - - augur clades - See Augur's usage docs for these commands for more details. -Custom node data files can also be produced by build-specific scripts in addition -to the ones produced by Augur commands. """ rule ancestral: diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 43ab05f..8134742 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -1,21 +1,6 @@ """ This part of the workflow constructs the phylogenetic tree. -REQUIRED INPUTS: - - metadata = data/metadata.tsv - prepared_sequences = results/prepared_sequences.fasta - -OUTPUTS: - - tree = results/tree.nwk - branch_lengths = results/branch_lengths.json - -This part of the workflow usually includes the following steps: - - - augur tree - - augur refine - See Augur's usage docs for these commands for more details. """ diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 76e00df..3ed8d89 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -2,26 +2,6 @@ This part of the workflow collects the phylogenetic tree and annotations to export a Nextstrain dataset. 
-REQUIRED INPUTS: - - metadata = data/metadata.tsv - tree = results/tree.nwk - branch_lengths = results/branch_lengths.json - node_data = results/*.json - -OUTPUTS: - - auspice_json = auspice/${build_name}.json - - There are optional sidecar JSON files that can be exported as part of the dataset. - See Nextstrain's data format docs for more details on sidecar files: - https://docs.nextstrain.org/page/reference/data-formats.html - -This part of the workflow usually includes the following steps: - - - augur export v2 - - augur frequencies - See Augur's usage docs for these commands for more details. """ diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 0344e24..b09d185 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -1,23 +1,6 @@ """ This part of the workflow prepares sequences for constructing the phylogenetic tree. -REQUIRED INPUTS: - - metadata = data/metadata.tsv - sequences = data/sequences.fasta - reference = ../shared/reference.fasta - -OUTPUTS: - - prepared_sequences = results/prepared_sequences.fasta - -This part of the workflow usually includes the following steps: - - - augur index - - augur filter - - augur align - - augur mask - See Augur's usage docs for these commands for more details. """ rule download: From 17beea00c73010e330e0a3994311f2a0321cd102 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:11:00 -0800 Subject: [PATCH 10/10] Update ChangeLog --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 946f64c..36df3f1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,4 @@ # CHANGELOG -* 11 January 2024: Use a config file to define hardcoded parameters and file paths, add a change log. 
[PR #9](https://github.com/nextstrain/measles/pull/9) +* 1 March 2024: Add phylogenetic directory to follow the pathogen-repo-guide, and update the CI workflow to match the new file structure. [PR #18](https://github.com/nextstrain/measles/pull/18) * 14 February 2024: Add ingest directory from pathogen-repo-guide and make measles-specific modifications. [PR #10](https://github.com/nextstrain/measles/pull/10) +* 11 January 2024: Use a config file to define hardcoded parameters and file paths, and add a change log. [PR #9](https://github.com/nextstrain/measles/pull/9)