Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest: Derive url and use accession fields during ingest #78

Merged
merged 3 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ curate:
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
accession: accession
accession_version: accession_version
isolate-lineage: strain
sourcedb: database
geo-region: region
Expand All @@ -62,7 +62,7 @@ curate:
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: '^.+$'
# Backup field(s) to use for the strain name if 'strain' doesn't match the regex above
strain_backup_fields: ['genbank_accession']
strain_backup_fields: ['accession']
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ['date', 'release_date', 'update_date']
# List of expected date formats that are present in the date fields provided above
Expand All @@ -89,15 +89,17 @@ curate:
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: 'genbank_accession'
annotations_id: 'accession'
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: 'genbank_accession'
output_id_field: 'accession'
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: 'sequence'
# The field in the NDJSON record that contains the actual GenBank accession
genbank_accession: 'accession'
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'genbank_accession',
'genbank_accession_rev',
'accession',
'accession_version',
'strain',
'date',
'region',
Expand All @@ -111,4 +113,5 @@ curate:
'sra_accessions',
'authors',
'institution',
'url',
]
23 changes: 22 additions & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,30 @@ rule curate:
"""


rule add_metadata_columns:
"""Add derived columns to the ingest metadata.

Reads data/all_metadata.tsv and writes data/all_metadata_added.tsv
(declared with temp()) containing the same table plus one new column.

Notable columns:
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'),
  built by appending the value of the configured accession column to that prefix.
"""
input:
metadata = "data/all_metadata.tsv"
output:
metadata = temp("data/all_metadata_added.tsv")
# Name of the metadata column that holds the GenBank accession,
# taken from config['curate']['genbank_accession'].
params:
accession=config['curate']['genbank_accession']
# NOTE(review): in the -e expression below, '${params.accession}' is presumably a
# literal '$' followed by Snakemake's {params.accession} substitution, so csvtk
# receives a '$<column>' reference to that column — confirm against the csvtk
# mutate2 documentation before changing the quoting.
shell:
"""
csvtk mutate2 -t \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
{input.metadata} \
> {output.metadata}
"""


rule subset_metadata:
input:
metadata="data/all_metadata.tsv",
metadata="data/all_metadata_added.tsv",
output:
subset_metadata="results/metadata.tsv",
params:
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ rule format_ncbi_dataset_report:
--elide-header \
| csvtk fix-quotes -Ht \
| csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
| csvtk rename -t -f accession -n accession-rev \
| csvtk -t mutate -f accession-rev -n accession -p "^(.+?)\." \
| csvtk rename -t -f accession -n accession_version \
| csvtk -t mutate -f accession_version -n accession -p "^(.+?)\." \
| csvtk del-quotes -t \
| tsv-select -H -f accession --rest last \
> {output.ncbi_dataset_tsv}
Expand All @@ -89,7 +89,7 @@ rule format_ncbi_datasets_ndjson:
augur curate passthru \
--metadata {input.ncbi_dataset_tsv} \
--fasta {input.ncbi_dataset_sequences} \
--seq-id-column accession-rev \
--seq-id-column accession_version \
--seq-field sequence \
--unmatched-reporting warn \
--duplicate-reporting warn \
Expand Down
70 changes: 35 additions & 35 deletions phylogenetic/example_data/metadata.tsv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
strain virus genbank_accession date region country division city db segment authors
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al
DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al
USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al
SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al
Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al
SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al
USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al
COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al
Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al
1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al
Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al
Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al
Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al
V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al
Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al
Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al
SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al
strain virus accession date region country division city db segment authors url
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418
DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484
DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441
USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935
SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697
SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744
SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726
USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473
Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937
SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688
USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478
COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578
Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778
1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509
1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519
1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512
Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995
Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989
Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991
V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217
Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195
Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997
SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255
4 changes: 2 additions & 2 deletions phylogenetic/example_data/metadata_usvi.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url
USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/
accession accession_version strain date region country division location length host release_date update_date sra_accessions authors institution url
VI37 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/
26 changes: 1 addition & 25 deletions phylogenetic/rules/merge_sequences_usvi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,35 +21,11 @@ This part of the workflow usually includes the following steps:

"""

rule add_metadata_columns:
"""Add columns to metadata

Reads data/metadata.tsv and writes data/metadata_modified.tsv with two
extra columns derived from the existing genbank_accession column.

Notable columns:
- genbank_accession: GenBank accession for Auspice to generate a URL to the NCBI GenBank record.
- [NEW] accession: The GenBank accession. Added to go alongside USVI accession.
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*'). Added to go alongside USVI url.
"""
input:
metadata = "data/metadata.tsv"
output:
metadata = "data/metadata_modified.tsv"
# Two chained csvtk mutate2 passes: the first appends `url` (NCBI nuccore prefix +
# genbank_accession), the second reads the first pass's output from the pipe and
# appends `accession` as a copy of genbank_accession.
shell:
"""
csvtk mutate2 -tl \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \
{input.metadata} \
| csvtk mutate2 -tl \
-n accession \
-e '$genbank_accession' \
> {output.metadata}
"""

rule append_usvi:
"""Appending USVI sequences"""
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata_modified.tsv",
metadata = "data/metadata.tsv",
usvi_sequences = "data/sequences_usvi.fasta",
usvi_metadata = "data/metadata_usvi.tsv"
output:
Expand Down
Loading