Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add One Health Enteric BioSample Package + misc bugfixes #38

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 110 additions & 52 deletions config/main_config.yaml

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions create.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,10 @@ def create_submission_xml(organism, database, submission_name, config_dict, meta
sampleid = etree.SubElement(biosample, "SampleId")
spuid = etree.SubElement(sampleid, "SPUID", spuid_namespace=row["ncbi-spuid_namespace"])
spuid.text = row["ncbi-spuid"]
descriptor = etree.SubElement(biosample, "Descriptor")
title = etree.SubElement(descriptor, "Title")
title.text = row["bs-description"]
if "bs-description" in row and row["bs-description"] is not None and row["bs-description"] != "":
descriptor = etree.SubElement(biosample, "Descriptor")
title = etree.SubElement(descriptor, "Title")
title.text = row["bs-description"]
organism = etree.SubElement(biosample, "Organism")
organismname = etree.SubElement(organism, "OrganismName")
organismname.text = row["organism"]
Expand Down
20 changes: 17 additions & 3 deletions process.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,24 @@ def get_required_colnames(database, organism):
# Get required fields for given organism
if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()):
all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][organism].keys())
# Get required fields for a biosample package
if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"]["BIOSAMPLE"]["PACKAGES"].keys()):
all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"]["BIOSAMPLE"]["PACKAGES"][organism].keys())
# Get required fields for each given database
for database_name in database_list:
if database_name in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()):
all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][database_name].keys())
# Extract the unique metadata fields
# remove the PACKAGES key that snuck in there, probably a better way to do this
all_required_colnames = [header for header in all_required_colnames if header != 'PACKAGES']
return set(all_required_colnames)

# Check the config file
def get_config(config_file, database):
# Determine which portal each database belongs to
if isinstance(database, str):
database = database.split(',')

submission_portals = ["NCBI" if x in ["BIOSAMPLE", "SRA", "GENBANK"] else "GISAID" for x in database]
# Read in config file
with open(config_file, "r") as f:
Expand All @@ -97,6 +105,7 @@ def get_config(config_file, database):
print("Error: However, there is no " + submission_portals[d] + " submission information provided in the config file.", file=sys.stderr)
print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr)
sys.exit(1)

return config_dict
else:
print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr)
Expand All @@ -108,6 +117,9 @@ def get_metadata(database, organism, metadata_file):
metadata = pd.read_csv(metadata_file, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False, na_filter=False)
# Remove rows if entirely empty
metadata = metadata.dropna(how="all")
# Drop empty optional columns that were not filled out before submission
metadata = metadata.applymap(lambda x: pd.NA if x == '' else x)
metadata = metadata.dropna(axis=1, how="all")
# Remove extra spaces from column names
metadata.columns = metadata.columns.str.strip()
# Extract the required fields for specified database
Expand All @@ -118,8 +130,10 @@ def get_metadata(database, organism, metadata_file):
required_unknown_colnames = list(filter(lambda x: ("?" in x)==True, db_required_colnames))
# Obtain the column fields with & sign. Those fields contain date values.
required_date_colnames = list(filter(lambda x: ("&" in x)==True, db_required_colnames))
# merge all three required lists into one
required_colnames = required_sample_colnames + required_unknown_colnames + required_date_colnames
# Obtain the real required column names without the marker characters (*, ?, #, &)
required_colnames = [re.sub("[*?#&]", "", x) for x in db_required_colnames]
required_colnames = [re.sub("[*?#&]", "", x) for x in required_colnames]
# Check that the required column names exist in the metadata file
if not set(required_colnames).issubset(set(metadata.columns)):
failed_required_colnames = list(filter(lambda x: (x in metadata.columns)==False, required_colnames))
Expand All @@ -131,9 +145,9 @@ def get_metadata(database, organism, metadata_file):
if name in [re.sub("[*?#&]", "", x) for x in required_date_colnames]:
metadata[name] = pd.to_datetime(metadata[name], errors="coerce")
if pd.isna(metadata[name]).any():
print("Error: The required 'collection_date' field in metadata file contains incorrect date format. Date must be in the ISO format: YYYYMMDD/YYYYDDMM/DDMMYYYY/MMDDYYYY. For example: 2020-03-25.", file=sys.stderr)
print("Error: The required 'collection_date' field in metadata file contains incorrect date format. Populate using ISO 8601 standard: “YYYY-mm-dd”, “YYYY-mm” or “YYYY” (e.g., 1990–10–30, 1990–10, or 1990).", file=sys.stderr)
sys.exit(1)
metadata[name] = metadata[name].dt.strftime("%Y-%m-%d")
metadata[name] = metadata[name].dt.strftime("%Y-%m")
# Make sure specific column fields with empty values are filled with "Unknown"
if (name in [re.sub("[*?#&]", "", x) for x in required_unknown_colnames]) and any(metadata[name] == ""):
metadata[name] = metadata[name].replace(r'^\s*$', "Unknown", regex=True)
Expand Down
13 changes: 12 additions & 1 deletion report.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,18 @@ def get_ncbi_process_report(database, submission_name, submission_files_dir, con
FTP_HOST = process.get_main_config()["PORTAL_NAMES"]["NCBI"]["FTP_HOST"]
ftp = ftplib.FTP(FTP_HOST)
ftp.login(user=config_dict["Username"], passwd=config_dict["Password"])
# CD to the test or production folder
# Check FTP folder structure either /submit/Production/ or /Production/
if submission_type not in ftp.nlst():
# Check if submit folder exists
if "submit" in ftp.nlst():
ftp.cwd("submit")
# If submit folder exists check if Production/Test folder exists
if submission_type not in ftp.nlst():
print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
sys.exit(1)
else:
print("Error: Cannot find submission folder on NCBI FTP site.", file=sys.stderr)
sys.exit(1)
ftp.cwd(submission_type)
# Check if submission name exists
if ncbi_submission_name not in ftp.nlst():
Expand Down
7 changes: 5 additions & 2 deletions seqsender.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
# Define current time
STARTTIME = datetime.now()

# Define organsim choices
ORGANISM_CHOICES = ["FLU", "COV"]
# Define organism choices
ORGANISM_CHOICES = ["FLU", "COV", "ENTERIC"]

# Define database choices
DATABASE_CHOICES = ["BIOSAMPLE", "SRA", "GENBANK", "GISAID"]
Expand Down Expand Up @@ -212,9 +212,12 @@ def args_parser():

# Parse the database argument
database_args = database_parser.parse_known_args()[0]
# Change namespace to list - value names in the namespace were being evaluated as true and requiring fasta input for BIOSAMPLE/SRA only submissions
database_args = [x for x in vars(database_args).values() if x]

# If genbank and/or gisaid in the database list, must provide fasta file
if any(x in database_args for x in ["genbank", "gisaid"]):
print(database_args)
file_parser.add_argument("--fasta_file",
help="Fasta file stored in submission directory",
required=True)
Expand Down
29 changes: 29 additions & 0 deletions template/ENTERIC/onehealth_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Submission:
NCBI:
Username: username
Password: password
Submission_Position: 1
Description:
Title: onehealth_test_submission
Comment: This is a test submission
Organization:
'@role': owner
'@type': institute
'@org_id': 12345
Name: CCPHL
Address:
Affil: Contra Costa Health Services
Div: Public Health Laboratory
Street: 2500 Alhambra Ave
City: Martinez
Sub: CA
Postal_code: 94553
Country: USA
Email: [email protected]
Phone: ""
Submitter:
'@email': [email protected]
'@alt_email': ""
Name:
First: Jane
Last: Doe
3 changes: 3 additions & 0 deletions template/ENTERIC/onehealth_enteric_biosample_sra_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sequence_name,organism,collection_date,authors,ncbi-spuid,ncbi-spuid_namespace,ncbi-bioproject,bs-package,bs-sample_name,bs-sample_title,bs-collected_by,bs-geo_loc_name,bs-isolation_source,bs-purpose_of_sampling,bs-source_type,bs-strain,bs-animal_env,bs-animal_intrusion,bs-biocide_used,bs-building_setting,bs-coll_site_geo_feat,bs-cult_isol_date,bs-culture_collection,bs-env_broad_scale,bs-env_local_scale,bs-env_medium,bs-env_monitoring_zone,bs-extr_weather_event,bs-facility_type,bs-farm_equip,bs-farm_water_source,bs-fertilizer_admin,bs-food_additive,bs-food_clean_proc,bs-food_contact_surf,bs-food_contain_wrap,bs-food_industry_class,bs-food_industry_code,bs-food_origin,bs-food_pack_integrity,bs-food_pack_medium,bs-food_preserv_proc,bs-food_processing_method,bs-food_prod,bs-food_prod_synonym,bs-food_product_type,bs-food_quality_date,bs-food_source,bs-food_type_processed,bs-host,bs-host_age,bs-host_am,bs-host_animal_breed,bs-host_body_product,bs-host_disease,bs-host_group_size,bs-host_housing,bs-host_sex,bs-host_subject_id,bs-host_tissue_sampled,bs-host_variety,bs-ifsac_category,bs-indoor_surf,bs-indoor_surf_subpart,bs-intended_consumer,bs-isolate_name_alias,bs-label_claims,bs-lat_lon,bs-location_in_facility,bs-material_condition,bs-mechanical_damage,bs-plant_growth_med,bs-plant_water_method,bs-project_name,bs-reference_material,bs-rel_location,bs-samp_collect_device,bs-sanitizer_used_postharvest,bs-sequenced_by,bs-serotype,bs-serovar,bs-soil_type,bs-spec_intended_cons,bs-surf_material,bs-surf_temp,bs-surface_orientation,bs-upstream_intervention,bs-description,sra-library_ID,sra-library_strategy,sra-library_source,sra-library_selection,sra-library_layout,sra-file_name,sra-instrument_model,sra-file_location
23CA08CL01CC-S1,Salmonella enterica,2023-08,Jane Doe,23CA08CL01CC-S1,ca-contracosta-phl,PRJNA292661,OneHealthEnteric.1.0,23CA08CL01CC-S1,WGS of Salmonella enterica: 2023 NARMS CHICKEN LIVER 23CA08CL01CC-S1,Contra Costa Public Health Laboratory,USA:CA,CHICKEN LIVER,baseline surveillance [GENEPIO:0100005],food,23CA08CL01CC-S1,,,,,,,,,,,,,,,,,,,,,,,USA:CA,,,,,,,,,,,,,,,,,,,,,,,,,,human as food consumer,,,,,,,,,NARMS Retail Meat,,,,,,,,,,,,,,,23CA08CL01CC-S1,WGS,GENOMIC,RANDOM,paired,"23CA08CL01CC-S1_1.clean.fastq.gz,23CA08CL01CC-S1_2.clean.fastq.gz",Illumina MiSeq,local
23CA09CG01CC-S1,Salmonella enterica,2023-09,Jane Doe,23CA09CG01CC-S1,ca-contracosta-phl,PRJNA292664,OneHealthEnteric.1.0,23CA09CG01CC-S1,WGS of Salmonella enterica: 2023 NARMS CHICKEN GIZZARD 23CA09CG01CC-S1,Contra Costa Public Health Laboratory,USA:CA,CHICKEN GIZZARD,baseline surveillance [GENEPIO:0100005],food,23CA09CG01CC-S1,,,,,,,,,,,,,,,,,,,,,,,USA:CA,,,,,,,,,,,,,,,,,,,,,,,,,,human as food consumer,,,,,,,,,NARMS Retail Meat,,,,,,,,,,,,,,,23CA09CG01CC-S1,WGS,GENOMIC,RANDOM,paired,"23CA09CG01CC-S1_1.clean.fastq.gz,23CA09CG01CC-S1_2.clean.fastq.gz",Illumina MiSeq,local
Loading