From c2f651cda6f9d08591c80758adb80b1248144e31 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Wed, 21 Feb 2024 16:52:23 -0800
Subject: [PATCH 01/10] Move phylogenetic workflow to a phylogenetic directory
 Move phylogenetic workflow from top-level to phylogenetic directory to follow
 the [Pathogen Repo
 Guide](https://github.com/nextstrain/pathogen-repo-guide/tree/main)

---
 Snakefile => phylogenetic/Snakefile                       | 0
 {config => phylogenetic/defaults}/auspice_config.json     | 0
 {config => phylogenetic/defaults}/colors.tsv              | 0
 {config => phylogenetic/defaults}/config.yaml             | 8 ++++----
 {config => phylogenetic/defaults}/dropped_strains.txt     | 0
 {config => phylogenetic/defaults}/measles_reference.gb    | 0
 {example_data => phylogenetic/example_data}/metadata.tsv  | 0
 .../example_data}/sequences.fasta                         | 0
 8 files changed, 4 insertions(+), 4 deletions(-)
 rename Snakefile => phylogenetic/Snakefile (100%)
 rename {config => phylogenetic/defaults}/auspice_config.json (100%)
 rename {config => phylogenetic/defaults}/colors.tsv (100%)
 rename {config => phylogenetic/defaults}/config.yaml (57%)
 rename {config => phylogenetic/defaults}/dropped_strains.txt (100%)
 rename {config => phylogenetic/defaults}/measles_reference.gb (100%)
 rename {example_data => phylogenetic/example_data}/metadata.tsv (100%)
 rename {example_data => phylogenetic/example_data}/sequences.fasta (100%)

diff --git a/Snakefile b/phylogenetic/Snakefile
similarity index 100%
rename from Snakefile
rename to phylogenetic/Snakefile
diff --git a/config/auspice_config.json b/phylogenetic/defaults/auspice_config.json
similarity index 100%
rename from config/auspice_config.json
rename to phylogenetic/defaults/auspice_config.json
diff --git a/config/colors.tsv b/phylogenetic/defaults/colors.tsv
similarity index 100%
rename from config/colors.tsv
rename to phylogenetic/defaults/colors.tsv
diff --git a/config/config.yaml b/phylogenetic/defaults/config.yaml
similarity index 57%
rename from config/config.yaml
rename to phylogenetic/defaults/config.yaml
index 1fe71e7..f8ca2c1 100644
--- a/config/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -1,8 +1,8 @@
 files:
-    exclude: "config/dropped_strains.txt"
-    reference: "config/measles_reference.gb"
-    colors: "config/colors.tsv"
-    auspice_config: "config/auspice_config.json"
+    exclude: "defaults/dropped_strains.txt"
+    reference: "defaults/measles_reference.gb"
+    colors: "defaults/colors.tsv"
+    auspice_config: "defaults/auspice_config.json"
 filter: 
     group_by: "country year month"
     sequences_per_group: 20
diff --git a/config/dropped_strains.txt b/phylogenetic/defaults/dropped_strains.txt
similarity index 100%
rename from config/dropped_strains.txt
rename to phylogenetic/defaults/dropped_strains.txt
diff --git a/config/measles_reference.gb b/phylogenetic/defaults/measles_reference.gb
similarity index 100%
rename from config/measles_reference.gb
rename to phylogenetic/defaults/measles_reference.gb
diff --git a/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv
similarity index 100%
rename from example_data/metadata.tsv
rename to phylogenetic/example_data/metadata.tsv
diff --git a/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta
similarity index 100%
rename from example_data/sequences.fasta
rename to phylogenetic/example_data/sequences.fasta

From c0157a0c35cb468ffe086a0b163fdd961937d4f4 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Wed, 21 Feb 2024 17:45:19 -0800
Subject: [PATCH 02/10] Move rules for preparing sequences to its own smk file
 Part of work to update this repo to match the pathogen-repo-guide.

---
 phylogenetic/Snakefile                   |  82 +-----------------
 phylogenetic/rules/prepare_sequences.smk | 102 +++++++++++++++++++++++
 2 files changed, 104 insertions(+), 80 deletions(-)
 create mode 100644 phylogenetic/rules/prepare_sequences.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index e44939f..e3532bc 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -1,88 +1,10 @@
-configfile: "config/config.yaml" 
+configfile: "defaults/config.yaml" 
 
 rule all:
     input:
         auspice_json = "auspice/measles.json",
 
-rule download:
-    """Downloading sequences and metadata from data.nextstrain.org"""
-    output:
-        sequences = "data/sequences.fasta.zst",
-        metadata = "data/metadata.tsv.zst"
-    params:
-        sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.zst",
-        metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.zst"
-    shell:
-        """
-        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
-        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
-        """
-
-rule decompress:
-    """Decompressing sequences and metadata"""
-    input:
-        sequences = "data/sequences.fasta.zst",
-        metadata = "data/metadata.tsv.zst"
-    output:
-        sequences = "data/sequences.fasta",
-        metadata = "data/metadata.tsv"
-    shell:
-        """
-        zstd -d -c {input.sequences} > {output.sequences}
-        zstd -d -c {input.metadata} > {output.metadata}
-        """
-
-rule filter:
-    """
-    Filtering to
-      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
-      - from {params.min_date} onwards
-      - excluding strains in {input.exclude}
-      - minimum genome length of {params.min_length}
-    """
-    input:
-        sequences = "data/sequences.fasta",
-        metadata = "data/metadata.tsv",
-        exclude = config["files"]["exclude"]
-    output:
-        sequences = "results/filtered.fasta"
-    params:
-        group_by = config["filter"]["group_by"],
-        sequences_per_group = config["filter"]["sequences_per_group"],
-        min_date = config["filter"]["min_date"],
-        min_length = config["filter"]["min_length"]
-    shell:
-        """
-        augur filter \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --exclude {input.exclude} \
-            --output {output.sequences} \
-            --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
-            --min-date {params.min_date} \
-            --min-length {params.min_length}
-        """
-
-rule align:
-    """
-    Aligning sequences to {input.reference}
-      - filling gaps with N
-    """
-    input:
-        sequences = "results/filtered.fasta",
-        reference = config["files"]["reference"]
-    output:
-        alignment = "results/aligned.fasta"
-    shell:
-        """
-        augur align \
-            --sequences {input.sequences} \
-            --reference-sequence {input.reference} \
-            --output {output.alignment} \
-            --fill-gaps \
-            --remove-reference
-        """
+include: "rules/prepare_sequences.smk"
 
 rule tree:
     """Building tree"""
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
new file mode 100644
index 0000000..0344e24
--- /dev/null
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -0,0 +1,102 @@
+"""
+This part of the workflow prepares sequences for constructing the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    metadata    = data/metadata.tsv
+    sequences   = data/sequences.fasta
+    reference   = defaults/measles_reference.gb
+
+OUTPUTS:
+
+    prepared_sequences = results/aligned.fasta
+
+This part of the workflow usually includes the following steps:
+
+    - augur index
+    - augur filter
+    - augur align
+    - augur mask
+
+See Augur's usage docs for these commands for more details.
+"""
+rule download:
+    """Downloading sequences and metadata from data.nextstrain.org"""
+    output:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    params:
+        sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.zst",
+        metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.zst"
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+rule decompress:
+    """Decompressing sequences and metadata"""
+    input:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    output:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv"
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+rule filter:
+    """
+    Filtering to
+      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+      - from {params.min_date} onwards
+      - excluding strains in {input.exclude}
+      - minimum genome length of {params.min_length}
+    """
+    input:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        exclude = config["files"]["exclude"]
+    output:
+        sequences = "results/filtered.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        sequences_per_group = config["filter"]["sequences_per_group"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["min_length"]
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --exclude {input.exclude} \
+            --output {output.sequences} \
+            --group-by {params.group_by} \
+            --sequences-per-group {params.sequences_per_group} \
+            --min-date {params.min_date} \
+            --min-length {params.min_length}
+        """
+
+rule align:
+    """
+    Aligning sequences to {input.reference}
+      - filling gaps with N
+    """
+    input:
+        sequences = "results/filtered.fasta",
+        reference = config["files"]["reference"]
+    output:
+        alignment = "results/aligned.fasta"
+    shell:
+        """
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --fill-gaps \
+            --remove-reference
+        """
+        
\ No newline at end of file

From 1b202bc25a9e4f303eba4ed425bfcc5099caf6c9 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Thu, 22 Feb 2024 08:53:31 -0800
Subject: [PATCH 03/10] Move rules for constructing phylogeny to its own smk
 file Part of work to update this repo to match the pathogen-repo-guide.

---
 phylogenetic/Snakefile                     | 47 +--------------
 phylogenetic/rules/construct_phylogeny.smk | 68 ++++++++++++++++++++++
 2 files changed, 69 insertions(+), 46 deletions(-)
 create mode 100644 phylogenetic/rules/construct_phylogeny.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index e3532bc..a312463 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -5,53 +5,8 @@ rule all:
         auspice_json = "auspice/measles.json",
 
 include: "rules/prepare_sequences.smk"
+include: "rules/construct_phylogeny.smk"
 
-rule tree:
-    """Building tree"""
-    input:
-        alignment = "results/aligned.fasta"
-    output:
-        tree = "results/tree_raw.nwk"
-    shell:
-        """
-        augur tree \
-            --alignment {input.alignment} \
-            --output {output.tree}
-        """
-
-rule refine:
-    """
-    Refining tree
-      - estimate timetree
-      - use {params.coalescent} coalescent timescale
-      - estimate {params.date_inference} node dates
-      - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
-    """
-    input:
-        tree = "results/tree_raw.nwk",
-        alignment = "results/aligned.fasta",
-        metadata = "data/metadata.tsv"
-    output:
-        tree = "results/tree.nwk",
-        node_data = "results/branch_lengths.json"
-    params:
-        coalescent = config["refine"]["coalescent"],
-        date_inference = config["refine"]["date_inference"],
-        clock_filter_iqd = config["refine"]["clock_filter_iqd"]
-    shell:
-        """
-        augur refine \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --metadata {input.metadata} \
-            --output-tree {output.tree} \
-            --output-node-data {output.node_data} \
-            --timetree \
-            --coalescent {params.coalescent} \
-            --date-confidence \
-            --date-inference {params.date_inference} \
-            --clock-filter-iqd {params.clock_filter_iqd}
-        """
 
 rule ancestral:
     """Reconstructing ancestral sequences and mutations"""
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
new file mode 100644
index 0000000..43ab05f
--- /dev/null
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -0,0 +1,68 @@
+"""
+This part of the workflow constructs the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    metadata            = data/metadata.tsv
+    prepared_sequences  = results/aligned.fasta
+
+OUTPUTS:
+
+    tree            = results/tree.nwk
+    branch_lengths  = results/branch_lengths.json
+
+This part of the workflow usually includes the following steps:
+
+    - augur tree
+    - augur refine
+
+See Augur's usage docs for these commands for more details.
+"""
+
+rule tree:
+    """Building tree"""
+    input:
+        alignment = "results/aligned.fasta"
+    output:
+        tree = "results/tree_raw.nwk"
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+        """
+
+rule refine:
+    """
+    Refining tree
+      - estimate timetree
+      - use {params.coalescent} coalescent timescale
+      - estimate {params.date_inference} node dates
+      - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+    """
+    input:
+        tree = "results/tree_raw.nwk",
+        alignment = "results/aligned.fasta",
+        metadata = "data/metadata.tsv"
+    output:
+        tree = "results/tree.nwk",
+        node_data = "results/branch_lengths.json"
+    params:
+        coalescent = config["refine"]["coalescent"],
+        date_inference = config["refine"]["date_inference"],
+        clock_filter_iqd = config["refine"]["clock_filter_iqd"]
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --timetree \
+            --coalescent {params.coalescent} \
+            --date-confidence \
+            --date-inference {params.date_inference} \
+            --clock-filter-iqd {params.clock_filter_iqd}
+        """
+        
\ No newline at end of file

From 29211114e6d17d358cb2e22be25c8e695130e227 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Thu, 22 Feb 2024 09:18:28 -0800
Subject: [PATCH 04/10] Move rules for annotating phylogeny to its own smk file
 Part of work to update this repo to match the pathogen-repo-guide.

---
 phylogenetic/Snakefile                    | 36 +-----------
 phylogenetic/rules/annotate_phylogeny.smk | 67 +++++++++++++++++++++++
 2 files changed, 68 insertions(+), 35 deletions(-)
 create mode 100644 phylogenetic/rules/annotate_phylogeny.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index a312463..138c79c 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -6,43 +6,9 @@ rule all:
 
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
 
 
-rule ancestral:
-    """Reconstructing ancestral sequences and mutations"""
-    input:
-        tree = "results/tree.nwk",
-        alignment = "results/aligned.fasta"
-    output:
-        node_data = "results/nt_muts.json"
-    params:
-        inference = config["ancestral"]["inference"]
-    shell:
-        """
-        augur ancestral \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --output-node-data {output.node_data} \
-            --inference {params.inference}
-        """
-
-rule translate:
-    """Translating amino acid sequences"""
-    input:
-        tree = "results/tree.nwk",
-        node_data = "results/nt_muts.json",
-        reference = config["files"]["reference"]
-    output:
-        node_data = "results/aa_muts.json"
-    shell:
-        """
-        augur translate \
-            --tree {input.tree} \
-            --ancestral-sequences {input.node_data} \
-            --reference-sequence {input.reference} \
-            --output {output.node_data} \
-        """
-
 rule export:
     """Exporting data files for for auspice"""
     input:
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
new file mode 100644
index 0000000..61a0bd6
--- /dev/null
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -0,0 +1,67 @@
+"""
+This part of the workflow creates additional annotations for the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    metadata            = data/metadata.tsv
+    prepared_sequences  = results/aligned.fasta
+    tree                = results/tree.nwk
+
+OUTPUTS:
+
+    node_data = results/*.json
+
+    There are no required outputs for this part of the workflow as it depends
+    on which annotations are created. All outputs are expected to be node data
+    JSON files that can be fed into `augur export`.
+
+    See Nextstrain's data format docs for more details on node data JSONs:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur traits
+    - augur ancestral
+    - augur translate
+    - augur clades
+
+See Augur's usage docs for these commands for more details.
+
+Custom node data files can also be produced by build-specific scripts in addition
+to the ones produced by Augur commands.
+"""
+
+rule ancestral:
+    """Reconstructing ancestral sequences and mutations"""
+    input:
+        tree = "results/tree.nwk",
+        alignment = "results/aligned.fasta"
+    output:
+        node_data = "results/nt_muts.json"
+    params:
+        inference = config["ancestral"]["inference"]
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}
+        """
+
+rule translate:
+    """Translating amino acid sequences"""
+    input:
+        tree = "results/tree.nwk",
+        node_data = "results/nt_muts.json",
+        reference = config["files"]["reference"]
+    output:
+        node_data = "results/aa_muts.json"
+    shell:
+        """
+        augur translate \
+            --tree {input.tree} \
+            --ancestral-sequences {input.node_data} \
+            --reference-sequence {input.reference} \
+            --output {output.node_data} \
+        """

From 88f2e683b276804460ecca10157729b6b983bd1b Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Thu, 22 Feb 2024 09:36:14 -0800
Subject: [PATCH 05/10] Move rule for exporting auspice json to its own smk
 file Part of work to update this repo to match the pathogen-repo-guide.

---
 phylogenetic/Snakefile        | 26 +-----------------
 phylogenetic/rules/export.smk | 51 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 25 deletions(-)
 create mode 100644 phylogenetic/rules/export.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 138c79c..e2779b6 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -7,31 +7,7 @@ rule all:
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
-
-
-rule export:
-    """Exporting data files for for auspice"""
-    input:
-        tree = "results/tree.nwk",
-        metadata = "data/metadata.tsv",
-        branch_lengths = "results/branch_lengths.json",
-        nt_muts = "results/nt_muts.json",
-        aa_muts = "results/aa_muts.json",
-        colors = config["files"]["colors"],
-        auspice_config = config["files"]["auspice_config"]
-    output:
-        auspice_json = rules.all.input.auspice_json
-    shell:
-        """
-        augur export v2 \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \
-            --colors {input.colors} \
-            --auspice-config {input.auspice_config} \
-            --include-root-sequence \
-            --output {output.auspice_json}
-        """
+include: "rules/export.smk"
 
 rule clean:
     """Removing directories: {params}"""
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
new file mode 100644
index 0000000..76e00df
--- /dev/null
+++ b/phylogenetic/rules/export.smk
@@ -0,0 +1,51 @@
+"""
+This part of the workflow collects the phylogenetic tree and annotations to
+export a Nextstrain dataset.
+
+REQUIRED INPUTS:
+
+    metadata        = data/metadata.tsv
+    tree            = results/tree.nwk
+    branch_lengths  = results/branch_lengths.json
+    node_data       = results/*.json
+
+OUTPUTS:
+
+    auspice_json = auspice/${build_name}.json
+
+    There are optional sidecar JSON files that can be exported as part of the dataset.
+    See Nextstrain's data format docs for more details on sidecar files:
+    https://docs.nextstrain.org/page/reference/data-formats.html
+
+This part of the workflow usually includes the following steps:
+
+    - augur export v2
+    - augur frequencies
+
+See Augur's usage docs for these commands for more details.
+"""
+
+rule export:
+    """Exporting data files for auspice"""
+    input:
+        tree = "results/tree.nwk",
+        metadata = "data/metadata.tsv",
+        branch_lengths = "results/branch_lengths.json",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        colors = config["files"]["colors"],
+        auspice_config = config["files"]["auspice_config"]
+    output:
+        auspice_json = rules.all.input.auspice_json
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \
+            --colors {input.colors} \
+            --auspice-config {input.auspice_config} \
+            --include-root-sequence \
+            --output {output.auspice_json}
+        """
+        
\ No newline at end of file

From c1da895b4b3a947119a8ddd4fbcca2af3cf5c3de Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Fri, 23 Feb 2024 11:05:31 -0800
Subject: [PATCH 06/10] Update the CI workflow Following the
 pathogen-repo-guide and
 https://github.com/nextstrain/zika/commit/efe11e381ca777ac7e784d7d04cce0d04fd6fbfe

---
 .github/workflows/ci.yaml                     | 23 +++++++++++++++++--
 phylogenetic/Snakefile                        |  6 +++++
 phylogenetic/build-configs/ci/config.yaml     |  7 ++++++
 .../build-configs/ci/copy_example_data.smk    | 17 ++++++++++++++
 4 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 phylogenetic/build-configs/ci/config.yaml
 create mode 100644 phylogenetic/build-configs/ci/copy_example_data.smk

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index b1f5bca..7cc7827 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -5,5 +5,24 @@ on:
   - pull_request
 
 jobs:
-  ci:
-    uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
+  pathogen-ci:
+    strategy:
+      matrix:
+        runtime: [docker, conda]
+    permissions:
+      id-token: write
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
+    secrets: inherit
+    with:
+      runtime: ${{ matrix.runtime }}
+      run: |
+        nextstrain build \
+          phylogenetic \
+          --configfile build-configs/ci/config.yaml
+      artifact-name: output-${{ matrix.runtime }}
+      artifact-paths: |
+        phylogenetic/auspice/
+        phylogenetic/results/
+        phylogenetic/benchmarks/
+        phylogenetic/logs/
+        phylogenetic/.snakemake/log/
\ No newline at end of file
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index e2779b6..c1bbbd6 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -9,6 +9,12 @@ include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
 
+# Include custom rules defined in the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
+
 rule clean:
     """Removing directories: {params}"""
     params:
diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml
new file mode 100644
index 0000000..de89c67
--- /dev/null
+++ b/phylogenetic/build-configs/ci/config.yaml
@@ -0,0 +1,7 @@
+# This configuration file contains the custom configurations parameters
+# for the CI workflow to run with the example data.
+
+# Custom rules to run as part of the CI automated workflow
+# The paths should be relative to the phylogenetic directory.
+custom_rules:
+  - build-configs/ci/copy_example_data.smk
diff --git a/phylogenetic/build-configs/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk
new file mode 100644
index 0000000..c36eb54
--- /dev/null
+++ b/phylogenetic/build-configs/ci/copy_example_data.smk
@@ -0,0 +1,17 @@
+rule copy_example_data:
+    input:
+        sequences="example_data/sequences.fasta",
+        metadata="example_data/metadata.tsv",
+    output:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    shell:
+        """
+        cp -f {input.sequences} {output.sequences}
+        cp -f {input.metadata} {output.metadata}
+        """
+
+# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules
+# that have the same output as the copy_example_data rule.
+
+ruleorder: copy_example_data > decompress

From e86ab750daaabee69d83044eb8be92e6b9c30eb3 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:41:29 -0800
Subject: [PATCH 07/10] Update `README.md` files to match new workflow
 structure Update top-level and phylogenetic `README.md` files to match new
 workflow structure that includes ingest and phylogenetic directories,
 following the pathogen-repo-guide

---
 README.md              | 74 +++++++++---------------------------------
 phylogenetic/README.md | 50 ++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 58 deletions(-)
 create mode 100644 phylogenetic/README.md

diff --git a/README.md b/README.md
index 4625956..9eb08e4 100644
--- a/README.md
+++ b/README.md
@@ -1,67 +1,25 @@
-# nextstrain.org/measles
+# Nextstrain repository for measles virus
 
-This is the [Nextstrain](https://nextstrain.org) build for measles virus, visible at
-[nextstrain.org/measles](https://nextstrain.org/measles).
+This repository contains two workflows for the analysis of measles virus data:
 
-The build encompasses fetching data, preparing it for analysis, doing quality
-control, performing analyses, and saving the results in a format suitable for
-visualization (with [auspice][]).  This involves running components of
-Nextstrain such as [augur][].
+- [`ingest/`](./ingest) - Download data from GenBank, clean and curate it
+- [`phylogenetic/`](./phylogenetic) - Filter sequences, align, construct phylogeny and export for visualization
 
-All measles-specific steps and functionality for the Nextstrain pipeline should be
-housed in this repository.
+Each folder contains a README.md with more information. The results of running both workflows are publicly visible at [nextstrain.org/measles](https://nextstrain.org/measles).
 
-[![Build Status](https://github.com/nextstrain/measles/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/nextstrain/measles/actions/workflows/ci.yaml)
+## Installation
 
-## Usage
+Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html) for Nextstrain's suite of software tools.
 
-If you're unfamiliar with Nextstrain builds, you may want to follow our
-[quickstart guide][] first and then come back here.
+## Quickstart
 
-The easiest way to run this pathogen build is using the [Nextstrain
-command-line tool][nextstrain-cli]:
+Run the default phylogenetic workflow via:
+```
+cd phylogenetic/
+nextstrain build .
+nextstrain view .
+```
 
-    nextstrain build .
+## Documentation
 
-See the [nextstrain-cli README][] for how to install the `nextstrain` command.
-
-Alternatively, you should be able to run the build using `snakemake` within a
-suitably-configured local environment.  Details of setting that up are not yet
-well-documented, but will be in the future.
-
-Build output goes into the directories `data/`, `results/` and `auspice/`.
-
-Once you've run the build, you can view the results in auspice:
-
-    nextstrain view auspice/
-
-
-## Configuration
-
-Configuration takes place entirely with the `Snakefile`. This can be read top-to-bottom, each rule
-specifies its file inputs and output and also its parameters. There is little redirection and each
-rule should be able to be reasoned with on its own.
-
-<!--
-### fauna / RethinkDB credentials
-
-This build starts by pulling sequences from our live [fauna][] database (a RethinkDB instance). This
-requires environment variables `RETHINK_HOST` and `RETHINK_AUTH_KEY` to be set.
--->
-
-If you don't have access to our https endpoints, you can run the build using the
-example data provided in this repository.  Before running the build, copy the
-example sequences into the `data/` directory like so:
-
-    mkdir -p data/
-    cp example_data/* data/.
-
-
-[Nextstrain]: https://nextstrain.org
-<!-- [fauna]: https://github.com/nextstrain/fauna -->
-[augur]: https://github.com/nextstrain/augur
-[auspice]: https://github.com/nextstrain/auspice
-[snakemake cli]: https://snakemake.readthedocs.io/en/stable/executable.html#all-options
-[nextstrain-cli]: https://github.com/nextstrain/cli
-[nextstrain-cli README]: https://github.com/nextstrain/cli/blob/master/README.md
-[quickstart guide]: https://nextstrain.org/docs/getting-started/quickstart
+- [Running a pathogen workflow](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html)
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
new file mode 100644
index 0000000..8ead834
--- /dev/null
+++ b/phylogenetic/README.md
@@ -0,0 +1,50 @@
+# nextstrain.org/measles
+
+This is the [Nextstrain](https://nextstrain.org) build for measles, visible at
+[nextstrain.org/measles](https://nextstrain.org/measles).
+
+## Software requirements
+
+Follow the [standard installation instructions](https://docs.nextstrain.org/en/latest/install.html)
+for Nextstrain's suite of software tools.
+
+## Usage
+
+If you're unfamiliar with Nextstrain builds, you may want to follow our
+[Running a Pathogen Workflow guide](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) first and then come back here.
+
+The easiest way to run this pathogen build is using the Nextstrain
+command-line tool from within the `phylogenetic/` directory:
+
+    cd phylogenetic/
+    nextstrain build .
+
+Build output goes into the directories `data/`, `results/` and `auspice/`.
+
+Once you've run the build, you can view the results with:
+
+    nextstrain view .
+
+## Configuration
+
+Configuration takes place entirely within the `Snakefile`. It can be read
+top-to-bottom: each rule specifies its file inputs and outputs as well as its
+parameters. There is little indirection, and each rule can be reasoned about
+on its own.
+
+### Using GenBank data
+
+This build starts by pulling preprocessed sequence and metadata files from:
+
+* https://data.nextstrain.org/files/measles/sequences.fasta.zst
+* https://data.nextstrain.org/files/measles/metadata.tsv.zst
+
+The above datasets were sourced from GenBank and have been preprocessed and cleaned.
+
+### Using example data
+
+Alternatively, you can run the build using the example data provided in
+this repository. The following command copies the example sequences into
+the `data/` directory and then runs the build:
+
+    nextstrain build . --configfile profiles/ci/profiles_config.yaml

From 0a9055ec08ae87054a5db92bc24b67836db94d7f Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:48:27 -0800
Subject: [PATCH 08/10] Add top-level `nextstrain-pathogen.yaml` Add empty
 top-level `nextstrain-pathogen.yaml` to allow `nextstrain build` to work from
 any directory regardless of runtime, as described
 [here](https://github.com/nextstrain/pathogen-repo-guide/commit/e318589ccb1ad49b5ab21f9856ba612bcff796db)

---
 nextstrain-pathogen.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 nextstrain-pathogen.yaml

diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml
new file mode 100644
index 0000000..b74c50d
--- /dev/null
+++ b/nextstrain-pathogen.yaml
@@ -0,0 +1,5 @@
+# This file is currently empty; it only marks the top level of the pathogen
+# repo. Its presence allows the Nextstrain CLI to run `nextstrain build`
+# from any directory regardless of runtime.
+#
+# See https://github.com/nextstrain/cli/releases/tag/8.2.0 for more details.

From 1b5c544ed42338e22ae0a8e15b44320ebb41274c Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Fri, 1 Mar 2024 10:38:16 -0800
Subject: [PATCH 09/10] Reduce header content for snakemake rules

---
 phylogenetic/rules/annotate_phylogeny.smk  | 26 ----------------------
 phylogenetic/rules/construct_phylogeny.smk | 15 -------------
 phylogenetic/rules/export.smk              | 20 -----------------
 phylogenetic/rules/prepare_sequences.smk   | 17 --------------
 4 files changed, 78 deletions(-)

diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 61a0bd6..2f8eec4 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -1,34 +1,8 @@
 """
 This part of the workflow creates additonal annotations for the phylogenetic tree.
 
-REQUIRED INPUTS:
-
-    metadata            = data/metadata.tsv
-    prepared_sequences  = results/prepared_sequences.fasta
-    tree                = results/tree.nwk
-
-OUTPUTS:
-
-    node_data = results/*.json
-
-    There are no required outputs for this part of the workflow as it depends
-    on which annotations are created. All outputs are expected to be node data
-    JSON files that can be fed into `augur export`.
-
-    See Nextstrain's data format docs for more details on node data JSONs:
-    https://docs.nextstrain.org/page/reference/data-formats.html
-
-This part of the workflow usually includes the following steps:
-
-    - augur traits
-    - augur ancestral
-    - augur translate
-    - augur clades
-
 See Augur's usage docs for these commands for more details.
 
-Custom node data files can also be produced by build-specific scripts in addition
-to the ones produced by Augur commands.
 """
 
 rule ancestral:
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 43ab05f..8134742 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -1,21 +1,6 @@
 """
 This part of the workflow constructs the phylogenetic tree.
 
-REQUIRED INPUTS:
-
-    metadata            = data/metadata.tsv
-    prepared_sequences  = results/prepared_sequences.fasta
-
-OUTPUTS:
-
-    tree            = results/tree.nwk
-    branch_lengths  = results/branch_lengths.json
-
-This part of the workflow usually includes the following steps:
-
-    - augur tree
-    - augur refine
-
 See Augur's usage docs for these commands for more details.
 """
 
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 76e00df..3ed8d89 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -2,26 +2,6 @@
 This part of the workflow collects the phylogenetic tree and annotations to
 export a Nextstrain dataset.
 
-REQUIRED INPUTS:
-
-    metadata        = data/metadata.tsv
-    tree            = results/tree.nwk
-    branch_lengths  = results/branch_lengths.json
-    node_data       = results/*.json
-
-OUTPUTS:
-
-    auspice_json = auspice/${build_name}.json
-
-    There are optional sidecar JSON files that can be exported as part of the dataset.
-    See Nextstrain's data format docs for more details on sidecar files:
-    https://docs.nextstrain.org/page/reference/data-formats.html
-
-This part of the workflow usually includes the following steps:
-
-    - augur export v2
-    - augur frequencies
-
 See Augur's usage docs for these commands for more details.
 """
 
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 0344e24..b09d185 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -1,23 +1,6 @@
 """
 This part of the workflow prepares sequences for constructing the phylogenetic tree.
 
-REQUIRED INPUTS:
-
-    metadata    = data/metadata.tsv
-    sequences   = data/sequences.fasta
-    reference   = ../shared/reference.fasta
-
-OUTPUTS:
-
-    prepared_sequences = results/prepared_sequences.fasta
-
-This part of the workflow usually includes the following steps:
-
-    - augur index
-    - augur filter
-    - augur align
-    - augur mask
-
 See Augur's usage docs for these commands for more details.
 """
 rule download:

From 17beea00c73010e330e0a3994311f2a0321cd102 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Fri, 1 Mar 2024 14:11:00 -0800
Subject: [PATCH 10/10] Update ChangeLog

---
 CHANGES.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 946f64c..36df3f1 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,4 @@
 # CHANGELOG
-* 11 January 2024: Use a config file to define hardcoded parameters and file paths, add a change log. [PR #9](https://github.com/nextstrain/measles/pull/9)
+* 1 March 2024: Add phylogenetic directory to follow the pathogen-repo-guide, and update the CI workflow to match the new file structure. [PR #18](https://github.com/nextstrain/measles/pull/18)
 * 14 February 2024: Add ingest directory from pathogen-repo-guide and make measles-specific modifications. [PR #10](https://github.com/nextstrain/measles/pull/10)
+* 11 January 2024: Use a config file to define hardcoded parameters and file paths, and add a change log. [PR #9](https://github.com/nextstrain/measles/pull/9)