Skip to content

Commit

Permalink
Merge pull request #32 from nextstrain/update-nextclade-rules-in-inge…
Browse files Browse the repository at this point in the history
…st-21

Update nextclade rules in ingest [#21]
  • Loading branch information
genehack authored Jan 9, 2025
2 parents 0fb9db7 + 6a4b959 commit b451c59
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 53 deletions.
25 changes: 24 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,28 @@ curate:
- url
# Settings read by the Nextclade rules (ingest/rules/nextclade.smk) via config["nextclade"].
nextclade:
dataset_name: "nextstrain/yellow-fever/prM-E"
field_map: "defaults/nextclade_field_map.tsv"
# NOTE(review): this view is a rendered diff; the `field_map` line above is the
# pre-change value (a path to a TSV that this commit deletes) and the inline
# mapping below is its replacement. Only one of the two exists in the real file.
#
# Each entry is `<Nextclade output column>: "<metadata column name>"`.
# The nextclade_metadata rule turns every pair into an `old=new` argument for
# `augur curate rename --field-map` and then keeps only the renamed columns
# (the values listed here), in this order.
field_map:
# Identity mapping: keeps the ID column among the selected output columns.
seqName: "seqName"
clade: "clade"
coverage: "coverage"
totalMissing: "missing_data"
totalSubstitutions: "divergence"
totalNonACGTNs: "nonACGTN"
qc.overallStatus: "QC_overall"
qc.missingData.status: "QC_missing_data"
qc.mixedSites.status: "QC_mixed_sites"
qc.privateMutations.status: "QC_rare_mutations"
qc.snpClusters.status: "QC_snp_clusters"
qc.frameShifts.status: "QC_frame_shifts"
qc.stopCodons.status: "QC_stop_codons"
frameShifts: "frame_shifts"
privateNucMutations.reversionSubstitutions: "private_reversion_substitutions"
privateNucMutations.labeledSubstitutions: "private_labeled_substitutions"
privateNucMutations.unlabeledSubstitutions: "private_unlabeled_substitutions"
privateNucMutations.totalReversionSubstitutions: "private_total_reversion_substitutions"
privateNucMutations.totalLabeledSubstitutions: "private_total_labeled_substitutions"
privateNucMutations.totalUnlabeledSubstitutions: "private_total_unlabeled_substitutions"
privateNucMutations.totalPrivateSubstitutions: "private_total_private_substitutions"
qc.snpClusters.clusteredSNPs: "private_snp_clusters"
qc.snpClusters.totalSNPs: "private_total_snp_clusters"
# Column that identifies each record in the Nextclade output; passed as
# `--id-column` to `augur curate rename` by the nextclade_metadata rule.
id_field: "seqName"
28 changes: 0 additions & 28 deletions ingest/defaults/nextclade_field_map.tsv

This file was deleted.

2 changes: 1 addition & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ rule curate:

rule add_genbank_url:
input:
metadata=temp("data/all_metadata_intermediate.tsv"),
metadata="data/all_metadata_intermediate.tsv",
output:
metadata="data/all_metadata.tsv",
log:
Expand Down
59 changes: 36 additions & 23 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,35 @@ rule run_nextclade:
"""


rule join_metadata_and_nextclade:
# Rename the Nextclade output columns according to config["nextclade"]["field_map"]
# and keep only those renamed columns, producing a temporary TSV that the
# join_metadata_and_nextclade rule consumes.
#
#   input.nextclade            Nextclade TSV output (results/nextclade.tsv)
#   output.nextclade_metadata  renamed/subset TSV (temp; removed once consumers finish)
#   log                        stderr of BOTH pipeline stages
rule nextclade_metadata:
    input:
        nextclade="results/nextclade.tsv",
    output:
        nextclade_metadata=temp("results/nextclade_metadata.tsv"),
    params:
        nextclade_id_field=config["nextclade"]["id_field"],
        # One "old=new" argument per configured column rename.
        nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()],
        # Comma-separated list of the renamed columns to keep, in config order.
        nextclade_fields=",".join(config["nextclade"]["field_map"].values()),
    log:
        "logs/nextclade_metadata.txt",
    benchmark:
        "benchmarks/nextclade_metadata.tsv",
    shell:
        r"""
        (
          augur curate rename \
            --metadata {input.nextclade:q} \
            --id-column {params.nextclade_id_field:q} \
            --field-map {params.nextclade_field_map:q} \
            --output-metadata - \
            | csvtk cut --tabs --fields {params.nextclade_fields:q} \
            > {output.nextclade_metadata:q}
        ) 2> {log:q}
        """


rule join_metadata_and_nextclade:
input:
metadata="data/subset_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
nextclade_metadata="results/nextclade_metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
Expand All @@ -61,25 +85,14 @@ rule join_metadata_and_nextclade:
"benchmarks/join_metadata_and_nextclade.txt",
shell:
r"""
(
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk -t cut -f $SUBSET_FIELDS \
{input.nextclade} \
| csvtk -t rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
) 2>{log:q}
augur merge \
--metadata \
metadata={input.metadata:q} \
nextclade={input.nextclade_metadata:q} \
--metadata-id-columns \
metadata={params.metadata_id_field:q} \
nextclade={params.nextclade_id_field:q} \
--output-metadata {output.metadata:q} \
--no-source-columns \
&> {log:q}
"""

0 comments on commit b451c59

Please sign in to comment.