diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 41c3432..10ada38 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -1,7 +1,15 @@ +# This configuration file should contain all required configuration parameters +# for the ingest workflow to run to completion. +# +# Define optional config parameters with their default values here so that users +# do not have to dig through the workflows to figure out the default values + # Sources of sequences to include in the ingest run sources: ['genbank'] -# Pathogen NCBI Taxonomy ID -ncbi_taxon_id: '64320' + +# Pathogen NCBI Taxonomy ID; required to fetch from NCBI Datasets +ncbi_taxon_id: "64320" + # The list of NCBI Datasets fields to include from NCBI Datasets output # These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields @@ -24,13 +32,18 @@ ncbi_datasets_fields: - submitter-affiliation - submitter-country -# Params for the curate rule +# Config parameters related to the curate pipeline curate: - # NCBI fields to rename to Nextstrain field names. - # List of field names to change where the key is the original field name and - # the value is the new field name - # This is the first step in the pipeline, so any references to field names - # in the configs below should use the new field names + # URL pointing to the public generalized geolocation rules + # For the Nextstrain team, this is currently + # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv' + geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv' + # The path to the local geolocation rules within the pathogen repo + # The path should be relative to the ingest directory. 
+ local_geolocation_rules: 'config/geolocation-rules.tsv' + # Mapping of field names to rename, where the key is the original field name and the value is the new field name + # The original field names should match the ncbi_datasets_fields provided above. + # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names field_map: accession: genbank_accession accession-rev: genbank_accession_rev @@ -46,47 +59,40 @@ curate: submitter-names: authors submitter-affiliations: institution # Standardized strain name regex - # Currently accepts any characters because we do not have a clear standard for strain names + # Currently accepts any characters because we do not have a clear standard for strain names across pathogens strain_regex: '^.+$' - # Back up strain name field if 'strain' doesn't match regex above + # Backup strain name fields to use if 'strain' doesn't match the regex above strain_backup_fields: ['genbank_accession'] - # List of date fields to standardize + # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: ['date', 'release_date', 'update_date'] - # Expected date formats present in date fields + # List of expected date formats that are present in the date fields provided above # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ'] - # Titlecase rules titlecase: - # Abbreviations not cast to titlecase, keeps uppercase + # List of string fields to titlecase + fields: ['region', 'country', 'division', 'location'] + # List of abbreviations that are not cast to titlecase and stay uppercase abbreviations: ['USA'] # Articles that should not be cast to titlecase articles: [ 'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le', 'los', 'nad', 'of', 'op', 'sur', 'the', 'y' ] - # List of string fields to 
titlecase - fields: ['region', 'country', 'division', 'location'] - # Authors field name + # Metadata field that contains the list of authors associated with the sequence authors_field: 'authors' - # Authors default value if authors value is empty + # Default value to use if the authors field is empty authors_default_value: '?' - # Field name for the generated abbreviated authors - abbr_authors_field: 'abbr_authors' - # General geolocation rules to apply to geolocation fields - geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv' - # Local geolocation rules that are only applicable to zika data - # Local rules can overwrite the general geolocation rules provided above - local_geolocation_rules: 'config/geolocation-rules.tsv' - # User annotations file - annotations: 'config/annotations.tsv' - # ID field used to merge annotations + # Path to the manual annotations file + # The path should be relative to the ingest directory + annotations: "config/annotations.tsv" + # The ID field in the metadata to use to merge the manual annotations annotations_id: 'genbank_accession' - # Field to use as the sequence ID in the FASTA file - id_field: 'genbank_accession' - # Field to use as the sequence in the FASTA file - sequence_field: 'sequence' - # Final output columns for the metadata TSV + # The ID field in the metadata to use as the sequence id in the output FASTA file + output_id_field: 'genbank_accession' + # The field in the NDJSON record that contains the actual genomic sequence + output_sequence_field: 'sequence' + # The list of metadata columns to keep in the final output of the curation pipeline. 
metadata_columns: [ 'genbank_accession', 'genbank_accession_rev', @@ -104,4 +110,3 @@ curate: 'authors', 'institution', ] - diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index dee2813..f278fec 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -64,11 +64,10 @@ rule curate: titlecase_fields=config["curate"]["titlecase"]["fields"], authors_field=config["curate"]["authors_field"], authors_default_value=config["curate"]["authors_default_value"], - abbr_authors_field=config["curate"]["abbr_authors_field"], annotations_id=config["curate"]["annotations_id"], metadata_columns=config["curate"]["metadata_columns"], - id_field=config["curate"]["id_field"], - sequence_field=config["curate"]["sequence_field"], + id_field=config["curate"]["output_id_field"], + sequence_field=config["curate"]["output_sequence_field"], shell: """ (cat {input.sequences_ndjson} \