diff --git a/config/main_config.yaml b/config/main_config.yaml
index 97393ec..b200d5f 100644
--- a/config/main_config.yaml
+++ b/config/main_config.yaml
@@ -1,7 +1,7 @@
OVERVIEW:
- - "All fields listed in the config file are mandatory."
+ - "Fields marked with * or ? are mandatory."
- "Fields marked with * contains unique sample identifiers"
- - "Fields marked with ? are required in full. Missing values can be fill with Unkown"
+ - "Fields marked with ? are required in full. Missing values can be filled with Unknown"
- "Fields marked with # contains unique sample identifiers that must match the header of the respective sequences in the fasta file."
- "Fields marked with & contains date information."
@@ -9,11 +9,11 @@ SUBMISSION_PORTAL:
COMMON_FIELDS:
sequence_name*:
- text: 'Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID.'
- organism:
+ organism?:
- text: 'The most descriptive organism name for the samples. If relevant, you can search the organism name in the NCBI Taxonomy database. For FLU, organism must be "Influenza A Virus". For COV, organism must be "Severe acute respiratory syndrome coronavirus 2".'
collection_date&:
- - text: 'The date on which the sample was collected; must be in the ISO format: YYYY-MM-DD.
For example: 2020-03-25'
- authors:
+ - text: 'Date on which the sample was collected. Populate using ISO 8601 standard: “YYYY-mm-dd”, “YYYY-mm” or “YYYY” (e.g., 1990-10-30, 1990-10, or 1990). Including the month or month/day of collection is extremely valuable for assessing seasonality in the database.'
+ authors:
- text: 'Citing authors. List of Last, First Middle, suffix separated by a semicolon ";" E.g.: "Baker, Howard Henry, Jr.; Powell, Earl Alexander, III.;"'
PORTAL_NAMES:
NCBI:
@@ -25,115 +25,173 @@ SUBMISSION_PORTAL:
- text: 'Submitter Provided Unique Identifiers. This is used to report back assigned accessions as well as for cross-linking objects within submission.'
ncbi-spuid_namespace:
- text: 'If SPUID is used, spuid_namespace has to be provided. The values of spuid_namespace are from controlled vocabulary and need to be coordinated with NCBI prior to submission.'
- ncbi-bioproject:
+ ncbi-bioproject*:
- text: 'Associated BioProject accession number. For example: PRJNA217342'
DATABASE:
BIOSAMPLE:
- bs-description:
+ bs-description:
- text: 'A brief description about the sample, e.g. SARS-CoV-2 Sequencing Baseline Constellation.'
- bs-collected_by:
+ bs-collected_by?:
- text: 'Name of persons or institute who collected the sample.'
- bs-geo_loc_name:
+ bs-geo_loc_name?:
- text: 'Geographical origin of the sample; use the appropriate name from this list. Use a colon to separate the country or ocean from more detailed information about the location, eg "Canada: Vancouver" or "Germany: halfway down Zugspitze, Alps". Entering multiple localities in one attribute is not allowed.'
bs-host:
- text: 'The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens.'
bs-host_disease:
- text: 'Name of relevant disease, e.g. Salmonella gastroenteritis. Controlled vocabulary, please see Human Disease Ontology or MeSH'
- bs-isolate:
+ bs-isolate:
- text: 'Identification or description of the specific individual from which this sample was obtained.'
- bs-isolation_source:
+ bs-strain*:
+ - text: 'Microbial or eukaryotic strain name.'
+ bs-isolation_source?:
- text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.'
bs-lat_lon:
- text: 'The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format "d[d.dddd] N|S d[dd.dddd] W|E", eg, 38.98 N 77.11 W'
+ PACKAGES:
+ ENTERIC:
+ bs-food_origin?:
+ - text: 'The geographic origin of the food product. Include the country name if imported, or the "Country: state/territory/province" if domestic. Include multiple locations if necessary, delimited by semicolons. Examples: USA:MD or India.'
+ bs-intended_consumer?:
+ - text: 'Food consumer type, human or animal, for which the food product is produced and marketed. Example: human as food consumer.'
+ bs-isolate_name_alias:
+ - text: 'Other IDs associated with this isolate or strain. Separate with ; if more than one. Example: ABC123; StateLab567.'
+ bs-culture_collection:
+ - text: 'Name of source institute and unique culture identifier. See the description for the proper format and list of allowed institutes, http://www.insdc.org/controlled-vocabulary-culturecollection-qualifier. Example: ATCC:BAA-664.'
+ bs-reference_material:
+ - text: 'Describes a laboratory reference or control strain. Leave blank if not applicable. Example: proficiency testing Isolate.'
+ bs-cult_isol_date:
+ - text: 'A culture isolation date is a date-time entity marking the end of a process in which a sample yields a positive result for the target microbial analyte(s) in the form of an isolated colony or colonies. Example: 2020-05-02.'
+ bs-samp_collect_device:
+ - text: 'Device used to collect the sample. Choose a term provided in the pick-list, or provide your own.'
+ bs-IFSAC_category:
+ - text: 'The IFSAC food categorization scheme has five distinct levels to which foods can be assigned, depending upon the type of food. First, foods are assigned to one of four food groups (aquatic animals, land animals, plants, and other). Food groups include increasingly specific food categories; dairy, eggs, meat and poultry, and game are in the land animal food group, and the category meat and poultry is further subdivided into more specific categories of meat (beef, pork, other meat) and poultry (chicken, turkey, other poultry). Finally, foods are differentiated by differences in food processing (such as pasteurized fluid dairy products, unpasteurized fluid dairy products, pasteurized solid and semi-solid dairy products, and unpasteurized solid and semi-solid dairy products). An IFSAC food category chart is available from [CDC Food Safety](https://www.cdc.gov/foodsafety/ifsac/projects/food-categorization-scheme.html) PMID: 28926300.'
+ bs-serotype:
+ - text: 'Follow the OHE custom guidance for "organism" and include any serotype/serovar information in the organism name. We do not recommend populating this attribute.'
+ bs-serovar:
+ - text: 'Follow the OHE custom guidance for "organism" and include any serotype/serovar information in the organism name. We do not recommend populating this attribute.'
+ bs-spec_intended_cons:
+ - text: 'Specific food consumer type for which the food product is produced and marketed. This field accepts terms listed under food consumer group (http://purl.obolibrary.org/obo/FOODON_03510136).'
+ bs-coll_site_geo_feat:
+ - text: 'Text or terms that describe the geographic feature where the food sample was obtained by the researcher. This field accepts selected terms listed under the following ontologies: anthropogenic geographic feature (http://purl.obolibrary.org/obo/ENVO_00000002), for example agricultural fairground [ENVO:01000986]; garden [ENVO:00000011] or any of its subclasses; market [ENVO:01000987]; water well [ENVO:01000002]; or human construction (http://purl.obolibrary.org/obo/ENVO_00000070).'
+ bs-food_prod:
+ - text: 'Descriptors of the food production system or of the agricultural environment and growing conditions related to the farm production system. This field accepts terms listed under food production (http://purl.obolibrary.org/obo/FOODON_03530206). Multiple terms may apply and can be separated by semicolons.'
+ bs-label_claims:
+ - text: 'Labeling claims containing descriptors such as wild caught, free-range, organic, industrial, hormone-free, antibiotic free, cage free. Can include more than one term, separated by ";".'
+ bs-food_product_type:
+ - text: 'A food product type is a class of food products that is differentiated by its food composition (e.g., single- or multi-ingredient), processing and/or consumption characteristics. This does not include brand name products but it may include generic food dish categories. This field accepts terms under food product type (http://purl.obolibrary.org/obo/FOODON_03400361). For terms related to food product for an animal, consult food product for animal (http://purl.obolibrary.org/obo/FOODON_03309997). If the proper descriptor is not listed please use text to describe the food type. Multiple terms can be separated by one or more semicolons.'
+ bs-food_industry_code:
+ - text: 'The US-FDA Industry Code is the first of five elements that comprise an FDA product code. An Industry code determines the broadest area into which a product falls.'
+ bs-food_industry_class:
+ - text: 'The US FDA-Class is the second of five elements that comprise an FDA product code. This element is directly related to an Industry and designates the food group, source, product, use, pharmacological action, category or animal species of the product. A class code is more specific than an Industry; for example, the Fishery/Seafood products Industry may contain Classes such as Smoked, Breaded and such.'
+ bs-food_source:
+ - text: 'Individual organism or category of organisms from which the food product or its major ingredient is derived. Choose from a broad taxonomic category in the provided picklist, or provide a more specific taxonomic term from the Food Product by Organism branch [FOODON:00002381].'
+ bs-food_processing_method:
+ - text: 'Methods for processing food for culinary purposes, or prepared prior to packaging. Choose one or multiple terms, separated by "; ".'
+ bs-food_preserv_proc:
+ - text: 'The methods contributing to the prevention or retardation of microbial, enzymatic or oxidative spoilage and thus to the extension of shelf life. This field accepts terms listed under food preservation process (http://purl.obolibrary.org/obo/FOODON_03470107).'
+ bs-food_additive:
+ - text: 'A substance or substances added to food to maintain or improve safety and freshness, to improve or maintain nutritional value, or improve taste, texture and appearance. This field accepts terms listed under food additive (http://purl.obolibrary.org/obo/FOODON_03412972). Multiple terms can be separated by one or more semicolons, but please consider limiting this list to the top 5 ingredients listed in order as on the food label. See also, https://www.fda.gov/food/food-ingredients-packaging/overview-food-ingredients-additives-colors.'
+ bs-food_contact_surf:
+ - text: 'The specific container or coating materials in direct contact with the food. Multiple values can be assigned. This field accepts terms listed under food contact surface (http://purl.obolibrary.org/obo/FOODON_03500010).'
+ bs-food_contain_wrap:
+ - text: 'Type of container or wrapping defined by the main container material, the container form, and the material of the liner lids or ends. Also type of container or wrapping by form; prefer description by material first, then by form. This field accepts terms listed under food container or wrapping (http://purl.obolibrary.org/obo/FOODON_03490100).'
+ bs-food_pack_medium:
+ - text: 'The medium in which the food is packed for preservation and handling or the medium surrounding homemade foods, e.g., peaches cooked in sugar syrup. The packing medium may provide a controlled environment for the food. It may also serve to improve palatability and consumer appeal. This includes edible packing media (e.g. fruit juice), gas other than air (e.g. carbon dioxide), vacuum packed, or packed with aerosol propellant. This field accepts terms under food packing medium (http://purl.obolibrary.org/obo/FOODON_03480020). Multiple terms may apply and can be separated by semicolons.'
+ bs-food_pack_integrity:
+ - text: 'A term label and term id to describe the state of the packing material and text to explain the exact condition. This field accepts terms listed under food packing medium integrity (http://purl.obolibrary.org/obo/FOODON_03530218).'
+ bs-food_quality_date:
+ - text: 'The date recommended for the use of the product while at peak quality. This date is not a reflection of safety unless used on infant formula, and is typically labeled on a food product as "best if used by," "best by," "use by," or "freeze by." Must use ISO date format, YYYY-MM-DD.'
+ bs-food_prod_synonym:
+ - text: 'Other names by which the food product is known (e.g., regional or non-English names).'
SRA:
- sra-file_location:
+ sra-file_location?:
- text: 'Location of raw reads files. Options: "local" or "cloud"'
- sra-file_name:
+ sra-file_name*:
- text: 'Name of the raw read files. All file names must be unique and not contain any sensitive information. Files can be compressed using gzip or bzip2, and may be submitted in a tar archive but archiving and/or compressing your files is not required. Do not use zip! If there are multiple files, concatenate them with a commas (","), e.g. "sample1_R1.fastq.gz, sample1_R2.fastq.gz". Store files in /seqsender/data/raw_reads/ or provide full path to the raw read files.'
- sra-library_name:
+ sra-library_name*:
- text: 'Short unique identifier for sequencing library. Each name must be unique!'
- sra-instrument_model:
+ sra-instrument_model:
- text: 'Type of instrument model used for sequencing. See a list of options here.'
- sra-library_strategy:
+ sra-library_strategy:
- text: 'The sequencing technique intended for the library. See a list of options here.'
- sra-library_source:
+ sra-library_source:
- text: 'The type of source material that is being sequenced. See a list of options here.'
- sra-library_selection:
+ sra-library_selection:
- text: 'The method used to select and/or enrich the material being sequenced. See a list of options here.'
- sra-library_layout:
+ sra-library_layout:
- text: 'Whether to expect SINGLE or PAIRED end reads. Options: "single" or "paired"'
GENBANK:
- gb-seq_id*#:
+ gb-seq_id*#:
- text: 'Identification to be used for the sequence in the FASTA.'
- gb-subm_lab:
+ gb-subm_lab?:
- text: 'Full name of organization, institute, or laboratory, etc., who is submitting this record'
- gb-subm_lab_division:
+ gb-subm_lab_division:
- text: 'The division of organization, institute, or laboratory, etc., who is submitting this record'
- gb-subm_lab_addr:
+ gb-subm_lab_addr?:
- text: 'The address of organization, institute, or laboratory, etc., who is submitting this record'
- gb-publication_title:
+ gb-publication_title:
- text: 'The title and relevant publication details (volume, issue, etc.) of a paper that discusses the submission. If left empty, the program will used the name of the submission as title.'
- gb-publication_status:
+ gb-publication_status:
- text: 'Options: "unpublished" or "in-press" or "published"'
- src-isolate:
+ src-isolate?:
- text: 'Identification or description of the specific individual from which this sample was obtained'
- src-country:
+ src-country?:
- text: 'Geographical origin of the sample; use the appropriate name from this list. Use a colon to separate the country or ocean from more detailed information about the location, eg "Canada: Vancouver" or "Germany: halfway down Zugspitze, Alps". Entering multiple localities in one attribute is not allowed.'
- src-host:
+ src-host?:
- text: 'The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens'
- src-serotype:
+ src-serotype:
- text: 'For Influenza A only; must be in format HxNx, Hx, Nx or mixed; where x is a numeral'
- src-isolation_source:
+ src-isolation_source?:
- text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.'
- cmt-StructuredCommentPrefix:
+ cmt-StructuredCommentPrefix:
- text: 'Structured comment keyword. For FLU use "FluData", HIV use "HIV-DataBaseData", and for COV and other organisms use "Assembly-Data".'
- cmt-StructuredCommentSuffix:
+ cmt-StructuredCommentSuffix:
- text: 'Structured comment keyword. For FLU use "FluData", HIV use "HIV-DataBaseData", and for COV and other organisms use "Assembly-Data".'
GISAID:
DATABASE:
FLU:
- gs-seq_id*#:
+ gs-seq_id*#:
- text: 'Identification to be used for the sequence in the FASTA.'
- gs-Isolate_Name:
+ gs-Isolate_Name*:
- text: 'E.g. "A/Brisbane/1444A/2010"'
- gs-segment:
+ gs-segment:
- text: 'Segment name for GISAID. Options are: "HA", "HE", "MP", "NA", "NP", "NS", "P3", "PA", "PB1", "PB2"'
- gs-Subtype:
+ gs-Subtype:
- text: 'E.g. "H5N1"'
- gs-Location:
+ gs-Location?:
- text: 'E.g., "United Kingdom", "Japan", "China", "United States", etc.'
- gs-Host:
+ gs-Host?:
- text: 'Host or source name., E.g. "human", "avian", "chicken", "Anas Acuta", "environment", etc.'
gs-Collection_Month:
- text: 'For incomplete collection dates, use this field instead of "Collection_Date". Month of year: "1" = Jan, "2" = Feb, so forth, "12" = Dec'
gs-Collection_Year:
- text: 'For incomplete collection dates, use this field instead of "Collection_Date". Four digit year as string: e.g. "2023"'
- gs-Originating_Lab_Id:
+ gs-Originating_Lab_Id:
- text: 'The numeric ID of the sample"s originating laboratory, e.g. "2698"'
COV:
- gs-virus_name*#:
+ gs-virus_name*#:
- text: 'For example: hCoV-19/Country/SampleID/YYYY
There are four parts delineated by the forward slash "/" character: