Skip to content

Commit

Permalink
🚧 Use augur subsample
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Apr 17, 2024
1 parent 0afe650 commit 6369f22
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 13 deletions.
5 changes: 2 additions & 3 deletions phylogenetic/defaults/config_zika.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@ strain_id_field: "accession"
display_strain_field: "strain"

filter:
group_by: "country year month"
sequences_per_group: 40
min_date: 2012
min_length: 5385

subsampling: "defaults/subsampling.yaml"

refine:
coalescent: "opt"
date_inference: "marginal"
Expand Down
8 changes: 8 additions & 0 deletions phylogenetic/defaults/subsampling.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Subsampling scheme passed to `augur subsample --config`; the main workflow
# config points at this file through its `subsampling` key.
# NOTE(review): the nesting below was reconstructed from a flattened listing.
# Presumably each key under `samples` names one sample and holds its options;
# confirm against the augur subsample config schema.
samples:
  all:
    # Same grouping as the former `filter.group_by: "country year month"`
    # setting, written as a list.
    group_by:
      - country
      - year
      - month
    sequences_per_group: 40
    # Carried over unchanged from the former `filter.min_date` setting.
    min_date: 2012
38 changes: 28 additions & 10 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,17 @@ rule decompress:
rule filter:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- from {params.min_date} onwards
- excluding strains in {input.exclude}
- exclude strains in {input.exclude}
- minimum genome length of {params.min_length} (50% of Zika virus genome)
"""
input:
sequences = "data/sequences_all.fasta",
metadata = "data/metadata_all.tsv",
exclude = "defaults/dropped_strains.txt",
output:
metadata = "results/filtered.tsv",
sequences = "results/filtered.fasta"
params:
group_by = config["filter"]["group_by"],
sequences_per_group = config["filter"]["sequences_per_group"],
min_date = config["filter"]["min_date"],
min_length = config["filter"]["min_length"],
strain_id = config.get("strain_id_field", "strain"),
shell:
Expand All @@ -77,19 +73,41 @@ rule filter:
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-date {params.min_date} \
--output-metadata {output.metadata} \
--min-length {params.min_length}
"""

rule subsample:
    """
    Subsample the filtered sequences and metadata with `augur subsample`,
    using the subsampling config file named by config["subsampling"].
    """
    input:
        # Both inputs are the outputs of the upstream `filter` rule.
        sequences = "results/filtered.fasta",
        metadata = "results/filtered.tsv",
    output:
        sequences = "results/subsampled.fasta",
        metadata = "results/subsampled.tsv",
    params:
        # Metadata column holding the strain ID; falls back to "strain"
        # when the workflow config does not set `strain_id_field`.
        strain_id = config.get("strain_id_field", "strain"),
        # Path to the subsampling config file, read from the workflow config.
        config = config["subsampling"],
    shell:
        """
        augur subsample \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --config {params.config} \
            --metadata-id-columns {params.strain_id} \
            --output-metadata {output.metadata} \
            --output-sequences {output.sequences}
        """

rule align:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = "results/filtered.fasta",
sequences = "results/subsampled.fasta",
reference = "defaults/zika_reference.gb"
output:
alignment = "results/aligned.fasta"
Expand Down

0 comments on commit 6369f22

Please sign in to comment.