Skip to content

Commit

Permalink
ingest/curate: Make the field map config more user friendly
Browse files Browse the repository at this point in the history
Incorporating changes from the pathogen repo template:

* nextstrain/pathogen-repo-guide@5e1b1ef
  • Loading branch information
j23414 committed Jan 20, 2024
1 parent 5fa511a commit 5089330
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 16 deletions.
31 changes: 16 additions & 15 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,24 @@ ncbi_datasets_fields:
# Params for the curate rule
curate:
# NCBI fields to rename to Nextstrain field names.
# Mapping of field names to change, where each key is the original field name
# and its value is the new field name.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: [
'accession=genbank_accession',
'accession-rev=genbank_accession_rev',
'isolate-lineage=strain',
'sourcedb=database',
'geo-region=region',
'geo-location=location',
'host-name=host',
'isolate-collection-date=date',
'release-date=release_date',
'update-date=update_date',
'sra-accs=sra_accessions',
'submitter-names=authors',
'submitter-affiliations=institution',
]
field_map:
accession: genbank_accession
accession-rev: genbank_accession_rev
isolate-lineage: strain
sourcedb: database
geo-region: region
geo-location: location
host-name: host
isolate-collection-date: date
release-date: release_date
update-date: update_date
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliations: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
Expand Down
9 changes: 8 additions & 1 deletion ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ rule concat_geolocation_rules:
"""


def format_field_map(field_map: dict[str, str]) -> str:
    """
    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.

    Every key and value is wrapped in double quotes. The characters that
    remain special to the shell inside double quotes (backslash, double
    quote, dollar sign and backtick) are backslash-escaped so that each
    pair always reaches the command as a single literal `key=value` word.
    Ordinary field names are emitted unchanged.

    Parameters
    ----------
    field_map
        Mapping of original field name to new field name. Keys and values
        are converted with `str()` before formatting.

    Returns
    -------
    str
        Space-separated `"key"="value"` pairs in the mapping's iteration
        order; an empty string when the mapping is empty.
    """
    # Only these four characters keep their special meaning inside a
    # double-quoted shell word, so they are the only ones needing a backslash.
    escapes = str.maketrans({
        "\\": "\\\\",
        '"': '\\"',
        "$": "\\$",
        "`": "\\`",
    })

    def quoted(text) -> str:
        # str() keeps non-string YAML scalars (e.g. ints) working, matching
        # what f-string interpolation did before.
        return '"' + str(text).translate(escapes) + '"'

    return " ".join(
        f"{quoted(key)}={quoted(value)}" for key, value in field_map.items()
    )


rule curate:
input:
sequences_ndjson="data/sequences.ndjson",
Expand All @@ -47,7 +54,7 @@ rule curate:
log:
"logs/curate.txt",
params:
field_map=config["curate"]["field_map"],
field_map=format_field_map(config["curate"]["field_map"]),
strain_regex=config["curate"]["strain_regex"],
strain_backup_fields=config["curate"]["strain_backup_fields"],
date_fields=config["curate"]["date_fields"],
Expand Down

0 comments on commit 5089330

Please sign in to comment.