From da43bbd3a26a008e38051a4f0aa157b94f7a569d Mon Sep 17 00:00:00 2001 From: Erik Wolfsohn Date: Tue, 5 Mar 2024 20:01:16 -0800 Subject: [PATCH 1/6] Changed variables from str to list in get_config to correctly loop through database names when querying for submission updates Changed navigation in get_ncbi_process_report to step through parent directories individually before entering test or production - ncbi ftp is configured to hide child directories until you access the parents Added some very preliminary handling for the OneHealth Enteric metadata package in SRA and BioSample --- config/main_config.yaml | 4 ++-- process.py | 6 ++++-- report.py | 4 +++- seqsender.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/config/main_config.yaml b/config/main_config.yaml index 97393ec..0c826b5 100644 --- a/config/main_config.yaml +++ b/config/main_config.yaml @@ -39,7 +39,7 @@ SUBMISSION_PORTAL: - text: 'The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens.' bs-host_disease: - text: 'Name of relevant disease, e.g. Salmonella gastroenteritis. Controlled vocabulary, please see Human Disease Ontology or MeSH' - bs-isolate: + bs-strain: - text: 'Identification or description of the specific individual from which this sample was obtained.' bs-isolation_source: - text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.' @@ -50,7 +50,7 @@ SUBMISSION_PORTAL: - text: 'Location of raw reads files. Options: "local" or "cloud"' sra-file_name: - text: 'Name of the raw read files. All file names must be unique and not contain any sensitive information. Files can be compressed using gzip or bzip2, and may be submitted in a tar archive but archiving and/or compressing your files is not required. Do not use zip! If there are multiple files, concatenate them with a commas (","), e.g. 
"sample1_R1.fastq.gz, sample1_R2.fastq.gz". Store files in /seqsender/data/raw_reads/ or provide full path to the raw read files.' - sra-library_name: + sra-library_ID: - text: 'Short unique identifier for sequencing library. Each name must be unique!' sra-instrument_model: - text: 'Type of instrument model used for sequencing. See a list of options here.' diff --git a/process.py b/process.py index 1f3917e..8bf55c7 100644 --- a/process.py +++ b/process.py @@ -79,6 +79,8 @@ def get_required_colnames(database, organism): # Check the config file def get_config(config_file, database): # Determine which portal is the database belongs to + print(database) + database = database.split(',') submission_portals = ["NCBI" if x in ["BIOSAMPLE", "SRA", "GENBANK"] else "GISAID" for x in database] # Read in config file with open(config_file, "r") as f: @@ -96,7 +98,7 @@ def get_config(config_file, database): print("\n"+"Error: " + database[d] + " is listed as one of the submitting databases.", file=sys.stderr) print("Error: However, there is no " + submission_portals[d] + " submission information provided in the config file.", file=sys.stderr) print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr) - sys.exit(1) + #sys.exit(1) return config_dict else: print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr) @@ -133,7 +135,7 @@ def get_metadata(database, organism, metadata_file): if pd.isna(metadata[name]).any(): print("Error: The required 'collection_date' field in metadata file contains incorrect date format. Date must be in the ISO format: YYYYMMDD/YYYYDDMM/DDMMYYYY/MMDDYYYY. 
For example: 2020-03-25.", file=sys.stderr) sys.exit(1) - metadata[name] = metadata[name].dt.strftime("%Y-%m-%d") + metadata[name] = metadata[name].dt.strftime("%Y-%m") # Make sure specific column fields with empty values are filled with "Unknown" if (name in [re.sub("[*?#&]", "", x) for x in required_unknown_colnames]) and any(metadata[name] == ""): metadata[name] = metadata[name].replace(r'^\s*$', "Unknown", regex=True) diff --git a/report.py b/report.py index 348244b..b255ffa 100644 --- a/report.py +++ b/report.py @@ -40,7 +40,9 @@ def get_ncbi_process_report(database, submission_name, submission_files_dir, con FTP_HOST = process.get_main_config()["PORTAL_NAMES"]["NCBI"]["FTP_HOST"] ftp = ftplib.FTP(FTP_HOST) ftp.login(user=config_dict["Username"], passwd=config_dict["Password"]) - # CD to to test or production folder + # CD to test or production folder + ftp.cwd('/') + ftp.cwd('submit') ftp.cwd(submission_type) # Check if submission name exists if ncbi_submission_name not in ftp.nlst(): diff --git a/seqsender.py b/seqsender.py index a5a643d..d37f221 100755 --- a/seqsender.py +++ b/seqsender.py @@ -27,7 +27,7 @@ STARTTIME = datetime.now() # Define organsim choices -ORGANISM_CHOICES = ["FLU", "COV"] +ORGANISM_CHOICES = ["FLU", "COV", "ENTERIC"] # Define database choices DATABASE_CHOICES = ["BIOSAMPLE", "SRA", "GENBANK", "GISAID"] From d1f50309ea6f1249188c180f5b3b4756c157d1d1 Mon Sep 17 00:00:00 2001 From: Erik Wolfsohn Date: Wed, 6 Mar 2024 00:16:15 -0800 Subject: [PATCH 2/6] changed database_args to prevent being prompted for fasta input unless submitting to Genbank or GISAID Added all mandatory and optional fields for onehealth enteric biosample package to main_config.yaml & added metadata/config templates Added handling to remove empty optional metadata columns if not filled out at submission time --- config/main_config.yaml | 94 +++++++++++++++---- create.py | 5 +- process.py | 22 ++++- seqsender.py | 5 +- template/ENTERIC/onehealth_config.yaml | 29 ++++++ 
...ehealth_enteric_biosample_sra_metadata.csv | 3 + 6 files changed, 133 insertions(+), 25 deletions(-) create mode 100644 template/ENTERIC/onehealth_config.yaml create mode 100644 template/ENTERIC/onehealth_enteric_biosample_sra_metadata.csv diff --git a/config/main_config.yaml b/config/main_config.yaml index 0c826b5..5d64ce4 100644 --- a/config/main_config.yaml +++ b/config/main_config.yaml @@ -1,7 +1,7 @@ OVERVIEW: - "All fields listed in the config file are mandatory." - "Fields marked with * contains unique sample identifiers" - - "Fields marked with ? are required in full. Missing values can be fill with Unkown" + - "Fields marked with ? are required in full. Missing values can be fill with Unknown" - "Fields marked with # contains unique sample identifiers that must match the header of the respective sequences in the fasta file." - "Fields marked with & contains date information." @@ -9,11 +9,11 @@ SUBMISSION_PORTAL: COMMON_FIELDS: sequence_name*: - text: 'Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID.' - organism: + organism?: - text: 'The most descriptive organism name for the samples. If relevant, you can search the organism name in the NCBI Taxonomy database. For FLU, organism must be "Influenza A Virus". For COV, organism must be "Severe acute respiratory syndrome coronavirus 2".' collection_date&: - - text: 'The date on which the sample was collected; must be in the ISO format: YYYY-MM-DD.
For example: 2020-03-25' - authors: + - text: 'Date on which the sample was collected. Populate using ISO 8601 standard: “YYYY-mm-dd”, “YYYY-mm” or “YYYY” (e.g., 1990-10-30, 1990-10, or 1990). Including the month or month/day of collection is extremely valuable for assessing seasonality in the database.' + authors: - text: 'Citing authors. List of Last, First Middle, suffix separated by a semicolon ";" E.g.: "Baker, Howard Henry, Jr.; Powell, Earl Alexander, III.;"' PORTAL_NAMES: NCBI: @@ -25,42 +25,100 @@ SUBMISSION_PORTAL: - text: 'Submitter Provided Unique Identifiers. This is used to report back assigned accessions as well as for cross-linking objects within submission.' ncbi-spuid_namespace: - text: 'If SPUID is used, spuid_namespace has to be provided. The values of spuid_namespace are from controlled vocabulary and need to be coordinated with NCBI prior to submission.' - ncbi-bioproject: + ncbi-bioproject*: - text: 'Associated BioProject accession number. For example: PRJNA217342' DATABASE: BIOSAMPLE: - bs-description: + bs-description: - text: 'A brief description about the sample, e.g. SARS-CoV-2 Sequencing Baseline Constellation.' - bs-collected_by: + bs-collected_by?: - text: 'Name of persons or institute who collected the sample.' - bs-geo_loc_name: + bs-geo_loc_name?: - text: 'Geographical origin of the sample; use the appropriate name from this list. Use a colon to separate the country or ocean from more detailed information about the location, eg "Canada: Vancouver" or "Germany: halfway down Zugspitze, Alps". Entering multiple localities in one attribute is not allowed.' bs-host: - text: 'The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens.' bs-host_disease: - text: 'Name of relevant disease, e.g. Salmonella gastroenteritis. 
Controlled vocabulary, please see Human Disease Ontology or MeSH' - bs-strain: + bs-isolate: - text: 'Identification or description of the specific individual from which this sample was obtained.' - bs-isolation_source: + bs-strain*: + - text: 'Identification or description of the specific individual from which this sample was obtained.' + bs-isolation_source*: - text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.' bs-lat_lon: - text: 'The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format "d[d.dddd] N|S d[dd.dddd] W|E", eg, 38.98 N 77.11 W' + PACKAGES: + ENTERIC: + bs-food_origin?: + - text: 'The geographic origin of the food product. Include the country name if imported, or the "Country: state/territory/province" if domestic. Include multiple locations if necessary, delimited by semi colon. Examples: USA:MD or India.' + bs-intended_consumer?: + - text: 'Food consumer type, human or animal, for which the food product is produced and marketed. Example: human as food consumer.' + bs-isolate_name_alias: + - text: 'Other IDs associated with this isolate or strain. Separate with ; if more than one. Example: ABC123; StateLab567.' + bs-culture_collection: + - text: 'Name of source institute and unique culture identifier. See the description for the proper format and list of allowed institutes, http://www.insdc.org/controlled-vocabulary-culturecollection-qualifier. Example: ATCC:BAA-664.' + bs-reference_material: + - text: 'Describes a laboratory reference or control strain. Leave blank if not applicable. Example: proficiency testing Isolate.' + bs-cult_isol_date: + - text: 'A culture isolation date is a date-time entity marking the end of a process in which a sample yields a positive result for the target microbial analyte(s) in the form of an isolated colony or colonies. Example: 2020-05-02.' 
+ bs-samp_collect_device: + - text: 'Device used to collect the sample. Choose a term provided in the pick-list, or provide your own.' + bs-IFSAC_category: + - text: 'The IFSAC food categorization scheme has five distinct levels to which foods can be assigned, depending upon the type of food. First, foods are assigned to one of four food groups (aquatic animals, land animals, plants, and other). Food groups include increasingly specific food categories; dairy, eggs, meat and poultry, and game are in the land animal food group, and the category meat and poultry is further subdivided into more specific categories of meat (beef, pork, other meat) and poultry (chicken, turkey, other poultry). Finally, foods are differentiated by differences in food processing (such as pasteurized fluid dairy products, unpasteurized fluid dairy products, pasteurized solid and semi-solid dairy products, and unpasteurized solid and semi-solid dairy products. An IFSAC food category chart is available from [CDC Food Safety](https://www.cdc.gov/foodsafety/ifsac/projects/food-categorization-scheme.html) PMID: 28926300.' + bs-serotype: + - text: 'Follow the OHE custom guidance for "organism" and include any serotype/serovar information in the organism name. We do not recommend populating this attribute.' + bs-serovar: + - text: 'Follow the OHE custom guidance for "organism" and include any serotype/serovar information in the organism name. We do not recommend populating this attribute.' + bs-spec_intended_cons: + - text: 'Specific food consumer type for which the food product is produced and marketed. This field accepts terms listed under food consumer group (http://purl.obolibrary.org/obo/FOODON_03510136).' + bs-coll_site_geo_feat: + - text: 'Text or terms that describe the geographic feature where the food sample was obtained by the researcher. 
This field accepts selected terms listed under the following ontologies: anthropogenic geographic feature (http://purl.obolibrary.org/obo/ENVO_00000002), for example agricultural fairground [ENVO:01000986]; garden [ENVO:00000011] or any of its subclasses; market [ENVO:01000987]; water well [ENVO:01000002]; or human construction (http://purl.obolibrary.org/obo/ENVO_00000070).' + bs-food_prod: + - text: 'Descriptors of the food production system or of the agricultural environment and growing conditions related to the farm production system. This field accepts terms listed under food production (http://purl.obolibrary.org/obo/FOODON_03530206). Multiple terms may apply and can be separated by semicolons.' + bs-label_claims: + - text: 'Labeling claims containing descriptors such as wild caught, free-range, organic, industrial, hormone-free, antibiotic free, cage free. Can include more than one term, separated by ";".' + bs-food_product_type: + - text: 'A food product type is a class of food products that is differentiated by its food composition (e.g., single- or multi-ingredient), processing and/or consumption characteristics. This does not include brand name products but it may include generic food dish categories. This field accepts terms under food product type (http://purl.obolibrary.org/obo/FOODON_03400361). For terms related to food product for an animal, consult food product for animal (http://purl.obolibrary.org/obo/FOODON_03309997). If the proper descriptor is not listed please use text to describe the food type. Multiple terms can be separated by one or more semicolons.' + bs-food_industry_code: + - text: 'The US-FDA Industry Code is the first of five elements that comprise an FDA product code. An Industry code determines the broadest area into which a product falls.' + bs-food_industry_class: + - text: 'The US FDA-Class is the second of five elements that comprise a FDA product code. 
This element is directly related to an Industry and designates the food group, source, product, use, pharmacological action, category or animal species of the product. A class code is more specific than an Industry; for example, the Fishery/Seafood products Industry may contain Classes such as Smoked, Breaded and such.' + bs-food_source: + - text: 'Individual organism or category of organisms from which the food product or its major ingredient is derived. Choose from a broad taxonomic category in the provided picklist, or provide a more specific taxonomic term from the Food Product by Organism branch [FOODON:00002381].' + bs-food_processing_method: + - text: 'Methods for processing food for culinary purposes, or prepared prior to packaging. Choose one or multiple terms, separated by "; ".' + bs-food_preserv_proc: + - text: 'The methods contributing to the prevention or retardation of microbial, enzymatic or oxidative spoilage and thus to the extension of shelf life. This field accepts terms listed under food preservation process (http://purl.obolibrary.org/obo/FOODON_03470107).' + bs-food_additive: + - text: 'A substance or substances added to food to maintain or improve safety and freshness, to improve or maintain nutritional value, or improve taste, texture and appearance. This field accepts terms listed under food additive (http://purl.obolibrary.org/obo/FOODON_03412972). Multiple terms can be separated by one or more semicolons, but please consider limiting this list to the top 5 ingredients listed in order as on the food label. See also, https://www.fda.gov/food/food-ingredients-packaging/overview-food-ingredients-additives-colors.' + bs-food_contact_surf: + - text: 'The specific container or coating materials in direct contact with the food. Multiple values can be assigned. This field accepts terms listed under food contact surface (http://purl.obolibrary.org/obo/FOODON_03500010).' 
+ bs-food_contain_wrap: + - text: 'Type of container or wrapping defined by the main container material, the container form, and the material of the liner lids or ends. Also type of container or wrapping by form; prefer description by material first, then by form. This field accepts terms listed under food container or wrapping (http://purl.obolibrary.org/obo/FOODON_03490100).' + bs-food_pack_medium: + - text: 'The medium in which the food is packed for preservation and handling or the medium surrounding homemade foods, e.g., peaches cooked in sugar syrup. The packing medium may provide a controlled environment for the food. It may also serve to improve palatability and consumer appeal. This includes edible packing media (e.g. fruit juice), gas other than air (e.g. carbon dioxide), vacuum packed, or packed with aerosol propellant. This field accepts terms under food packing medium (http://purl.obolibrary.org/obo/FOODON_03480020). Multiple terms may apply and can be separated by semicolons.' + bs-food_pack_integrity: + - text: 'A term label and term id to describe the state of the packing material and text to explain the exact condition. This field accepts terms listed under food packing medium integrity (http://purl.obolibrary.org/obo/FOODON_03530218).' + bs-food_quality_date: + - text: 'The date recommended for the use of the product while at peak quality. This date is not a reflection of safety unless used on infant formula, and is typically labeled on a food product as "best if used by," "best by," "use by," or "freeze by." Must use ISO date format, YYYY-MM-DD.' 
+ bs-food_prod_synonym: + - text: 'Other names by which the food product is known by (e.g., regional or non-English names).' SRA: - sra-file_location: + sra-file_location?: - text: 'Location of raw reads files. Options: "local" or "cloud"' - sra-file_name: + sra-file_name*: - text: 'Name of the raw read files. All file names must be unique and not contain any sensitive information. Files can be compressed using gzip or bzip2, and may be submitted in a tar archive but archiving and/or compressing your files is not required. Do not use zip! If there are multiple files, concatenate them with a commas (","), e.g. "sample1_R1.fastq.gz, sample1_R2.fastq.gz". Store files in /seqsender/data/raw_reads/ or provide full path to the raw read files.' - sra-library_ID: + sra-library_ID*: - text: 'Short unique identifier for sequencing library. Each name must be unique!' - sra-instrument_model: + sra-instrument_model: - text: 'Type of instrument model used for sequencing. See a list of options here.' - sra-library_strategy: + sra-library_strategy: - text: 'The sequencing technique intended for the library. See a list of options here.' - sra-library_source: + sra-library_source: - text: 'The type of source material that is being sequenced. See a list of options here.' - sra-library_selection: + sra-library_selection: - text: 'The method used to select and/or enrich the material being sequenced. See a list of options here.' - sra-library_layout: + sra-library_layout: - text: 'Whether to expect SINGLE or PAIRED end reads. 
Options: "single" or "paired"' GENBANK: gb-seq_id*#: diff --git a/create.py b/create.py index f5f2a62..e101c03 100644 --- a/create.py +++ b/create.py @@ -158,7 +158,10 @@ def create_submission_xml(organism, database, submission_name, config_dict, meta spuid.text = row["ncbi-spuid"] descriptor = etree.SubElement(biosample, "Descriptor") title = etree.SubElement(descriptor, "Title") - title.text = row["bs-description"] + if "bs-description" in row: + title.text = row["bs-description"] + else: + title.text = "" organism = etree.SubElement(biosample, "Organism") organismname = etree.SubElement(organism, "OrganismName") organismname.text = row["organism"] diff --git a/process.py b/process.py index 8bf55c7..c232c70 100644 --- a/process.py +++ b/process.py @@ -69,18 +69,24 @@ def get_required_colnames(database, organism): # Get required fields for given organism if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()): all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][organism].keys()) + # Get required fields for a biosample package + if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"]["BIOSAMPLE"]["PACKAGES"].keys()): + all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"]["BIOSAMPLE"]["PACKAGES"][organism].keys()) # Get required fields for each given database for database_name in database_list: if database_name in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()): all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][database_name].keys()) # Extract the unique metadata fields + # remove the PACKAGES key that snuck in there, probably a better way to do this + all_required_colnames = [header for header in all_required_colnames if header != 'PACKAGES'] return set(all_required_colnames) # Check the config file def get_config(config_file, database): # Determine which portal is the database belongs to - print(database) - database = database.split(',') + if 
isinstance(database, str): + database = database.split(',') + submission_portals = ["NCBI" if x in ["BIOSAMPLE", "SRA", "GENBANK"] else "GISAID" for x in database] # Read in config file with open(config_file, "r") as f: @@ -98,7 +104,8 @@ def get_config(config_file, database): print("\n"+"Error: " + database[d] + " is listed as one of the submitting databases.", file=sys.stderr) print("Error: However, there is no " + submission_portals[d] + " submission information provided in the config file.", file=sys.stderr) print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr) - #sys.exit(1) + sys.exit(1) + return config_dict else: print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr) @@ -110,6 +117,9 @@ def get_metadata(database, organism, metadata_file): metadata = pd.read_csv(metadata_file, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False, na_filter=False) # Remove rows if entirely empty metadata = metadata.dropna(how="all") + # Drop empty optional columns that were not filled out before submission + metadata = metadata.applymap(lambda x: pd.NA if x == '' else x) + metadata = metadata.dropna(axis=1, how="all") # Remove extra spaces from column names metadata.columns = metadata.columns.str.strip() # Extract the required fields for specified database @@ -120,8 +130,10 @@ def get_metadata(database, organism, metadata_file): required_unknown_colnames = list(filter(lambda x: ("?" in x)==True, db_required_colnames)) # Obtain the column fields with & sign. Those fields contain date values. 
required_date_colnames = list(filter(lambda x: ("&" in x)==True, db_required_colnames)) + # merge all three required lists into one + required_colnames = required_sample_colnames + required_unknown_colnames + required_date_colnames # Obtain the real required column names without the asterisks and & signs - required_colnames = [re.sub("[*?#&]", "", x) for x in db_required_colnames] + required_colnames = [re.sub("[*?#&]", "", x) for x in required_colnames] # Check if required column names are existed in metadata file if not set(required_colnames).issubset(set(metadata.columns)): failed_required_colnames = list(filter(lambda x: (x in metadata.columns)==False, required_colnames)) @@ -133,7 +145,7 @@ def get_metadata(database, organism, metadata_file): if name in [re.sub("[*?#&]", "", x) for x in required_date_colnames]: metadata[name] = pd.to_datetime(metadata[name], errors="coerce") if pd.isna(metadata[name]).any(): - print("Error: The required 'collection_date' field in metadata file contains incorrect date format. Date must be in the ISO format: YYYYMMDD/YYYYDDMM/DDMMYYYY/MMDDYYYY. For example: 2020-03-25.", file=sys.stderr) + print("Error: The required 'collection_date' field in metadata file contains incorrect date format. 
Populate using ISO 8601 standard: “YYYY-mm-dd”, “YYYY-mm” or “YYYY” (e.g., 1990–10–30, 1990–10, or 1990).", file=sys.stderr) sys.exit(1) metadata[name] = metadata[name].dt.strftime("%Y-%m") # Make sure specific column fields with empty values are filled with "Unknown" diff --git a/seqsender.py b/seqsender.py index d37f221..19bac10 100755 --- a/seqsender.py +++ b/seqsender.py @@ -26,7 +26,7 @@ # Define current time STARTTIME = datetime.now() -# Define organsim choices +# Define organism choices ORGANISM_CHOICES = ["FLU", "COV", "ENTERIC"] # Define database choices @@ -212,9 +212,12 @@ def args_parser(): # Parse the database argument database_args = database_parser.parse_known_args()[0] + # Change namespace to list - value names in the namespace were being evaluated as true and requiring fasta input for BIOSAMPLE/SRA only submissions + database_args = [x for x in vars(database_args).values() if x] # If genbank and/or gisaid in the database list, must provide fasta file if any(x in database_args for x in ["genbank", "gisaid"]): + print(database_args) file_parser.add_argument("--fasta_file", help="Fasta file stored in submission directory", required=True) diff --git a/template/ENTERIC/onehealth_config.yaml b/template/ENTERIC/onehealth_config.yaml new file mode 100644 index 0000000..a3b51c4 --- /dev/null +++ b/template/ENTERIC/onehealth_config.yaml @@ -0,0 +1,29 @@ +Submission: + NCBI: + Username: username + Password: password + Submission_Position: 1 + Description: + Title: onehealth_test_submission + Comment: This is a test submission + Organization: + '@role': owner + '@type': institute + '@org_id': 12345 + Name: CCPHL + Address: + Affil: Contra Costa Health Services + Div: Public Health Laboratory + Street: 2500 Alhambra Ave + City: Martinez + Sub: CA + Postal_code: 94553 + Country: USA + Email: email@myemail.com + Phone: "" + Submitter: + '@email': email@myemail.com + '@alt_email': "" + Name: + First: Jane + Last: Doe \ No newline at end of file diff --git 
a/template/ENTERIC/onehealth_enteric_biosample_sra_metadata.csv b/template/ENTERIC/onehealth_enteric_biosample_sra_metadata.csv new file mode 100644 index 0000000..9567a48 --- /dev/null +++ b/template/ENTERIC/onehealth_enteric_biosample_sra_metadata.csv @@ -0,0 +1,3 @@ +sequence_name,organism,collection_date,authors,ncbi-spuid,ncbi-spuid_namespace,ncbi-bioproject,bs-package,bs-sample_name,bs-sample_title,bs-collected_by,bs-geo_loc_name,bs-isolation_source,bs-purpose_of_sampling,bs-source_type,bs-strain,bs-animal_env,bs-animal_intrusion,bs-biocide_used,bs-building_setting,bs-coll_site_geo_feat,bs-cult_isol_date,bs-culture_collection,bs-env_broad_scale,bs-env_local_scale,bs-env_medium,bs-env_monitoring_zone,bs-extr_weather_event,bs-facility_type,bs-farm_equip,bs-farm_water_source,bs-fertilizer_admin,bs-food_additive,bs-food_clean_proc,bs-food_contact_surf,bs-food_contain_wrap,bs-food_industry_class,bs-food_industry_code,bs-food_origin,bs-food_pack_integrity,bs-food_pack_medium,bs-food_preserv_proc,bs-food_processing_method,bs-food_prod,bs-food_prod_synonym,bs-food_product_type,bs-food_quality_date,bs-food_source,bs-food_type_processed,bs-host,bs-host_age,bs-host_am,bs-host_animal_breed,bs-host_body_product,bs-host_disease,bs-host_group_size,bs-host_housing,bs-host_sex,bs-host_subject_id,bs-host_tissue_sampled,bs-host_variety,bs-ifsac_category,bs-indoor_surf,bs-indoor_surf_subpart,bs-intended_consumer,bs-isolate_name_alias,bs-label_claims,bs-lat_lon,bs-location_in_facility,bs-material_condition,bs-mechanical_damage,bs-plant_growth_med,bs-plant_water_method,bs-project_name,bs-reference_material,bs-rel_location,bs-samp_collect_device,bs-sanitizer_used_postharvest,bs-sequenced_by,bs-serotype,bs-serovar,bs-soil_type,bs-spec_intended_cons,bs-surf_material,bs-surf_temp,bs-surface_orientation,bs-upstream_intervention,bs-description,sra-library_ID,sra-library_strategy,sra-library_source,sra-library_selection,sra-library_layout,sra-file_name,sra-instrument_model,sra-file_locati
on +23CA08CL01CC-S1,Salmonella enterica,2023-08,Jane Doe,23CA08CL01CC-S1,ca-contracosta-phl,PRJNA292661,OneHealthEnteric.1.0,23CA08CL01CC-S1,WGS of Salmonella enterica: 2023 NARMS CHICKEN LIVER 23CA08CL01CC-S1,Contra Costa Public Health Laboratory,USA:CA,CHICKEN LIVER,baseline surveillance [GENEPIO:0100005],food,23CA08CL01CC-S1,,,,,,,,,,,,,,,,,,,,,,,USA:CA,,,,,,,,,,,,,,,,,,,,,,,,,,human as food consumer,,,,,,,,,NARMS Retail Meat,,,,,,,,,,,,,,,23CA08CL01CC-S1,WGS,GENOMIC,RANDOM,paired,"23CA08CL01CC-S1_1.clean.fastq.gz,23CA08CL01CC-S1_2.clean.fastq.gz",Illumina MiSeq,local +23CA09CG01CC-S1,Salmonella enterica,2023-09,Jane Doe,23CA09CG01CC-S1,ca-contracosta-phl,PRJNA292664,OneHealthEnteric.1.0,23CA09CG01CC-S1,WGS of Salmonella enterica: 2023 NARMS CHICKEN GIZZARD 23CA09CG01CC-S1,Contra Costa Public Health Laboratory,USA:CA,CHICKEN GIZZARD,baseline surveillance [GENEPIO:0100005],food,23CA09CG01CC-S1,,,,,,,,,,,,,,,,,,,,,,,USA:CA,,,,,,,,,,,,,,,,,,,,,,,,,,human as food consumer,,,,,,,,,NARMS Retail Meat,,,,,,,,,,,,,,,23CA09CG01CC-S1,WGS,GENOMIC,RANDOM,paired,"23CA09CG01CC-S1_1.clean.fastq.gz,23CA09CG01CC-S1_2.clean.fastq.gz",Illumina MiSeq,local \ No newline at end of file From d811cf16e733919d19dc2337c87e6d31b446f0ce Mon Sep 17 00:00:00 2001 From: Erik Wolfsohn Date: Wed, 6 Mar 2024 12:39:26 -0800 Subject: [PATCH 3/6] Updated main_config.yaml - only fields tagged with *?&# are mandatory - fields with * also must be unique. Probably have to revisit this for GISAID/GENBANK --- config/main_config.yaml | 70 ++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/config/main_config.yaml b/config/main_config.yaml index 5d64ce4..0c8b22c 100644 --- a/config/main_config.yaml +++ b/config/main_config.yaml @@ -1,5 +1,5 @@ OVERVIEW: - - "All fields listed in the config file are mandatory." + - "Fields marked with *?&# are mandatory." - "Fields marked with * contains unique sample identifiers" - "Fields marked with ? 
are required in full. Missing values can be fill with Unknown" - "Fields marked with # contains unique sample identifiers that must match the header of the respective sequences in the fasta file." @@ -43,7 +43,7 @@ SUBMISSION_PORTAL: - text: 'Identification or description of the specific individual from which this sample was obtained.' bs-strain*: - text: 'Identification or description of the specific individual from which this sample was obtained.' - bs-isolation_source*: + bs-isolation_source?: - text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.' bs-lat_lon: - text: 'The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format "d[d.dddd] N|S d[dd.dddd] W|E", eg, 38.98 N 77.11 W' @@ -121,77 +121,77 @@ SUBMISSION_PORTAL: sra-library_layout: - text: 'Whether to expect SINGLE or PAIRED end reads. Options: "single" or "paired"' GENBANK: - gb-seq_id*#: + gb-seq_id*#: - text: 'Identification to be used for the sequence in the FASTA.' - gb-subm_lab: + gb-subm_lab?: - text: 'Full name of organization, institute, or laboratory, etc., who is submitting this record' - gb-subm_lab_division: + gb-subm_lab_division: - text: 'The division of organization, institute, or laboratory, etc., who is submitting this record' - gb-subm_lab_addr: + gb-subm_lab_addr?: - text: 'The address of organization, institute, or laboratory, etc., who is submitting this record' - gb-publication_title: + gb-publication_title: - text: 'The title and relevant publication details (volume, issue, etc.) of a paper that discusses the submission. If left empty, the program will used the name of the submission as title.' 
- gb-publication_status: + gb-publication_status: - text: 'Options: "unpublished" or "in-press" or "published"' - src-isolate: + src-isolate?: - text: 'Identification or description of the specific individual from which this sample was obtained' - src-country: + src-country?: - text: 'Geographical origin of the sample; use the appropriate name from this list. Use a colon to separate the country or ocean from more detailed information about the location, eg "Canada: Vancouver" or "Germany: halfway down Zugspitze, Alps". Entering multiple localities in one attribute is not allowed.' - src-host: + src-host?: - text: 'The natural (as opposed to laboratory) host to the organism from which the sample was obtained. Use the full taxonomic name, eg, Homo sapiens' - src-serotype: + src-serotype: - text: 'For Influenza A only; must be in format HxNx, Hx, Nx or mixed; where x is a numeral' - src-isolation_source: + src-isolation_source?: - text: 'Describes the physical, environmental and/or local geographical source of the biological sample from which the sample was derived.' - cmt-StructuredCommentPrefix: + cmt-StructuredCommentPrefix: - text: 'Structured comment keyword. For FLU use "FluData", HIV use "HIV-DataBaseData", and for COV and other organisms use "Assembly-Data".' - cmt-StructuredCommentSuffix: + cmt-StructuredCommentSuffix: - text: 'Structured comment keyword. For FLU use "FluData", HIV use "HIV-DataBaseData", and for COV and other organisms use "Assembly-Data".' GISAID: DATABASE: FLU: - gs-seq_id*#: + gs-seq_id*#: - text: 'Identification to be used for the sequence in the FASTA.' - gs-Isolate_Name: + gs-Isolate_Name*: - text: 'E.g. "A/Brisbane/1444A/2010"' - gs-segment: + gs-segment: - text: 'Segment name for GISAID. Options are: "HA", "HE", "MP", "NA", "NP", "NS", "P3", "PA", "PB1", "PB2"' - gs-Subtype: + gs-Subtype: - text: 'E.g. "H5N1"' - gs-Location: + gs-Location?: - text: 'E.g., "United Kingdom", "Japan", "China", "United States", etc.' 
- gs-Host: + gs-Host?: - text: 'Host or source name., E.g. "human", "avian", "chicken", "Anas Acuta", "environment", etc.' gs-Collection_Month: - text: 'For incomplete collection dates, use this field instead of "Collection_Date". Month of year: "1" = Jan, "2" = Feb, so forth, "12" = Dec' gs-Collection_Year: - text: 'For incomplete collection dates, use this field instead of "Collection_Date". Four digit year as string: e.g. "2023"' - gs-Originating_Lab_Id: + gs-Originating_Lab_Id: - text: 'The numeric ID of the sample"s originating laboratory, e.g. "2698"' COV: - gs-virus_name*#: + gs-virus_name*#: - text: 'For example: hCoV-19/Country/SampleID/YYYY
There are four parts delineated by the forward slash "/" character:
  • "hCoV-19": despite common usage of virus synonyms such as SARS-CoV-2 or nCoV-19, this first part must remain "hCoV-19" verbatim (to ensure backwards compatibility with EpiCoV db).
  • "Country" is full name of country of sample collection (e.g., Australia), including spaces. For backwards compatibility, the exception being to use "USA" for United States of America.
  • "SampleID" is recommended to be of the format, Loc-Lab-Number, where:
    • Loc is location abbreviation (use abbreviated state or province for location, such as "VIC" for Victoria, Australia, or "CA" for California, USA)
    • Lab is lab name abbreviation (e.g., "CDC" for Centres for Disease Control)
    • Number is sample number or lab code (e.g., 02978, or S47y)
  • "YYYY" is four digit year of sample collection. Note, this must be the same as the YYYY provided in the collection_date value, else a "date inplausible" error will occur
In this example, the virus_name could be:
  • hCoV-19/Australia/VIC-CDC-02978/2022, or
  • hCoV-19/USA/CA-CDC-S47y/2022, respectively.
NOTE: virus_name field must match exactly the header of the respective sequence in the fasta file.
' - gs-type: + gs-type?: - text: 'For hCoV-19, this will always be "betacoronavirus".' - gs-passage: + gs-passage?: - text: '"Original" if the sample was sequenced directly from swabs, otherwise add the name of the cell line (e.g., "Vero") used to culture the specimen.' - gs-location: + gs-location?: - text: 'Format as "Continent / Country / Region / Sub-region"' - gs-host: + gs-host?: - text: 'For clinical samples, this is "Human". Otherwise add the species name of the organism from which the sample was originally sourced.' - gs-gender: + gs-gender?: - text: 'Synonym for "Biological sex". Should be "Female", "Male", or "Other"' - gs-patient_age: + gs-patient_age?: - text: 'Age in years of the person from whom the specimen was collected. May take format other than integer years, for example, "0.5" (i.e., 6 months), "5 days", "7 months". If units are not given, they are assumed in years.' - gs-patient_status: + gs-patient_status?: - text: 'E.g., "Hospitalized", "Released", "Live", "Deceased"' - gs-seq_technology: + gs-seq_technology: - text: 'Add the sequencer brand and model. See a list of options here.' - gs-orig_lab: + gs-orig_lab?: - text: 'Full name of laboratory from where sample originated.' - gs-orig_lab_addr: + gs-orig_lab_addr?: - text: 'Complete building address of laboratory from where sample originated.' - gs-subm_lab: + gs-subm_lab?: - text: 'Full name of laboratory submitting this record to GISAID. See a list of options here.' - gs-subm_lab_addr: + gs-subm_lab_addr?: - text: 'Complete building address of the submitting laboratory.' From 5d18b17efcfaf0c169fee0ac3f25855be1d2e598 Mon Sep 17 00:00:00 2001 From: Dakota Howard <58985143+dthoward96@users.noreply.github.com> Date: Fri, 8 Mar 2024 13:57:59 -0500 Subject: [PATCH 4/6] Update report.py Correcting this, not all FTP accounts have the "submit" folder, adjusting it to automatically detect the folder and correctly step into it if it exists. 
--- report.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/report.py b/report.py index b255ffa..4ac1869 100644 --- a/report.py +++ b/report.py @@ -40,9 +40,18 @@ def get_ncbi_process_report(database, submission_name, submission_files_dir, con FTP_HOST = process.get_main_config()["PORTAL_NAMES"]["NCBI"]["FTP_HOST"] ftp = ftplib.FTP(FTP_HOST) ftp.login(user=config_dict["Username"], passwd=config_dict["Password"]) - # CD to test or production folder - ftp.cwd('/') - ftp.cwd('submit') + # Check FTP folder structure either /submit/Production/ or /Production/ + if submission_type not in ftp.nlst(): + # Check if submit folder exists + if "submit" in ftp.nlst(): + ftp.cwd("submit") + # If submit folder exists check if Production/Test folder exists + if submission_type not in ftp.nlst(): + print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr) + sys.exit(1) + else: + print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr) + sys.exit(1) ftp.cwd(submission_type) # Check if submission name exists if ncbi_submission_name not in ftp.nlst(): From 054ac4daf972975ed89cc2063f790bbad65c671a Mon Sep 17 00:00:00 2001 From: Dakota Howard <58985143+dthoward96@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:18:35 -0500 Subject: [PATCH 5/6] Update main_config.yaml library_name is the correct attribute value to be used not library_ID based on their examples https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/sra/samples/sra.submission.run.xml?revision=71838&view=markup --- config/main_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/main_config.yaml b/config/main_config.yaml index 0c8b22c..b200d5f 100644 --- a/config/main_config.yaml +++ b/config/main_config.yaml @@ -108,7 +108,7 @@ SUBMISSION_PORTAL: - text: 'Location of raw reads files. Options: "local" or "cloud"' sra-file_name*: - text: 'Name of the raw read files. 
All file names must be unique and not contain any sensitive information. Files can be compressed using gzip or bzip2, and may be submitted in a tar archive but archiving and/or compressing your files is not required. Do not use zip! If there are multiple files, concatenate them with a commas (","), e.g. "sample1_R1.fastq.gz, sample1_R2.fastq.gz". Store files in /seqsender/data/raw_reads/ or provide full path to the raw read files.' - sra-library_ID*: + sra-library_name*: - text: 'Short unique identifier for sequencing library. Each name must be unique!' sra-instrument_model: - text: 'Type of instrument model used for sequencing. See a list of options here.' From f79e4fcffd2984e9284adffa52b7b6584eb85aa1 Mon Sep 17 00:00:00 2001 From: Dakota Howard <58985143+dthoward96@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:28:06 -0500 Subject: [PATCH 6/6] Update create.py If bs-description is empty, don't build a Descriptor element with an empty Title. Omit the element so that NCBI generates it automatically. --- create.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/create.py b/create.py index e101c03..068b08d 100644 --- a/create.py +++ b/create.py @@ -156,12 +156,10 @@ def create_submission_xml(organism, database, submission_name, config_dict, meta sampleid = etree.SubElement(biosample, "SampleId") spuid = etree.SubElement(sampleid, "SPUID", spuid_namespace=row["ncbi-spuid_namespace"]) spuid.text = row["ncbi-spuid"] - descriptor = etree.SubElement(biosample, "Descriptor") - title = etree.SubElement(descriptor, "Title") - if "bs-description" in row: + if "bs-description" in row and row["bs-description"] is not None and row["bs-description"] != "": + descriptor = etree.SubElement(biosample, "Descriptor") + title = etree.SubElement(descriptor, "Title") title.text = row["bs-description"] - else: - title.text = "" organism = etree.SubElement(biosample, "Organism") organismname = etree.SubElement(organism, "OrganismName") organismname.text = row["organism"]