From 07a3a78a3d7c320ab0c8bb846782471fa856a61a Mon Sep 17 00:00:00 2001 From: Dakota Howard <58985143+dthoward96@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:54:42 -0400 Subject: [PATCH] Add files via upload Changes requirement from only isolate to require either strain or isolate for BioSample and GenBank --- process.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/process.py b/process.py index 22467c6..9ac66c9 100644 --- a/process.py +++ b/process.py @@ -65,7 +65,7 @@ def get_required_colnames(database, organism): if len(database_list) > 0: # Get all common fields across all databases in a portal if "COMMON_FIELDS" in list(main_config["PORTAL_NAMES"][portal].keys()): - all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["COMMON_FIELDS"].keys()) + all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["COMMON_FIELDS"].keys()) # Get required fields for given organism if organism in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()): all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][organism].keys()) @@ -74,7 +74,7 @@ def get_required_colnames(database, organism): if database_name in list(main_config["PORTAL_NAMES"][portal]["DATABASE"].keys()): all_required_colnames += list(main_config["PORTAL_NAMES"][portal]["DATABASE"][database_name].keys()) # Extract the unique metadata fields - return set(all_required_colnames) + return set(all_required_colnames) # Check the config file def get_config(config_file, database): @@ -98,7 +98,7 @@ def get_config(config_file, database): print("Error: Either remove " + database[d] + " from the submitting databases or update your config file."+"\n", file=sys.stderr) sys.exit(1) return config_dict - else: + else: print("Error: Config file is incorrect. File must has a valid yaml format.", file=sys.stderr) sys.exit(1) @@ -120,11 +120,23 @@ def get_metadata(database, organism, metadata_file): required_date_colnames = list(filter(lambda x: ("&" in x)==True, db_required_colnames)) # Obtain the real required column names without the asterisks and & signs required_colnames = [re.sub("[*?#&]", "", x) for x in db_required_colnames] + # Remove ISOLATE FROM REQUIRED COLNAMES FOR TEMP FIX + required_colnames = [x for x in required_colnames if "-isolate" not in x] # Check if required column names are existed in metadata file if not set(required_colnames).issubset(set(metadata.columns)): failed_required_colnames = list(filter(lambda x: (x in metadata.columns)==False, required_colnames)) print("Error: Metadata file must have the following required column names: " + ", ".join(failed_required_colnames), file=sys.stderr) sys.exit(1) + ################# TEMPORARY FIX ################### + # Temporary fix to require either isolate or strain field not both + if "BIOSAMPLE" in database: + if "bs-isolate" not in metadata and "bs-strain" not in metadata: + print("Error: Metadata file must have one of these required columns: \"bs-isolate\" or \"bs-strain\".", file=sys.stderr) + sys.exit(1) + if "GENBANK" in database: + if "src-isolate" not in metadata and "src-strain" not in metadata: + print("Error: Metadata file must have one of these required columns: \"src-isolate\" or \"src-strain\".", file=sys.stderr) + sys.exit(1) # Run some checks to make sure the required column fields are populated correctly for name in required_colnames: # Make sure specific fields have a correct date format @@ -171,12 +183,12 @@ def read_gisaid_log(log_file, submission_status_file): if "epi_isl".upper() in line.upper(): column_name = "gs-sample_name" sample_name = list(set(filter(lambda x: (x.upper() in line.upper())==True, submission_status[column_name]))) - accession_id = "epi_isl_id" + accession_id = "epi_isl_id" accession = re.search("EPI_ISL_[1-9]+", line) elif "epi_id".upper() in line.upper(): column_name = "gs-sequence_name" sample_name = list(set(filter(lambda x: (x.upper() in line.upper())==True, submission_status[column_name]))) - accession_id = "epi_id" + accession_id = "epi_id" accession = re.search("EPI[1-9]+", line) else: continue @@ -327,10 +339,10 @@ def update_submission_status(submission_dir, submission_name, organism, test): for database_name in database: print("\n" + "Submission database: " + database_name, file=sys.stdout) df = pd.read_csv(submission_log_file, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False).sort_values('Submission_Position', ascending=True) - df_processing = df[(df["Organism"] == organism) & (df["Database"] == database_name) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Name"] == submission_name) & (df["Submission_Type"] == submission_type)] + df_processing = df[(df["Organism"] == organism) & (df["Database"] == database_name) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Name"] == submission_name) & (df["Submission_Type"] == submission_type)] df_processing = df_processing.reset_index(drop=True) - submission_dir = df_processing["Submission_Directory"][0] - submission_position = df_processing["Submission_Position"][0] + submission_dir = df_processing["Submission_Directory"][0] + submission_position = df_processing["Submission_Position"][0] submission_id, submission_status = df_processing["Submission_Status"][0].strip().split(";") config_file = df_processing["Config_File"][0] table2asn = df_processing["Table2asn"][0] @@ -358,11 +370,11 @@ def update_submission_status(submission_dir, submission_name, organism, test): print("There is no GISAID CLI package for " + organism + " located at "+ gisaid_cli, file=sys.stderr) print("Please download the CLI package from GISAID platform", file=sys.stderr) print("Then place a copy of the CLI binary at "+ gisaid_cli, file=sys.stderr) - sys.exit(1) + sys.exit(1) # Check the status of the submission if "processed-ok" in submission_status: print("Submission status: " + submission_status, file=sys.stdout) - else: + else: # Pull download submission report and update its status if database_name in ["BIOSAMPLE", "SRA", "GENBANK"]: # If report exists, processing the report and output status of the submission @@ -384,8 +396,8 @@ def update_submission_status(submission_dir, submission_name, organism, test): for db in other_submitting_db: db_df = df.loc[df["Database"] == db] db_df = db_df.reset_index(drop=True) - db_status = db_df["Submission_Status"][0] - # If the status of biosample or sra is processed-ok, then go ahead and submit to Genbank + db_status = db_df["Submission_Status"][0] + # If the status of biosample or sra is processed-ok, then go ahead and submit to Genbank if "processed-ok" in db_status: all_status += [1] report.update_genbank_files(database=database, organism=organism, submission_files_dir=submission_files_dir, submission_status_file=submission_status_file) @@ -419,9 +431,8 @@ def update_submission_status(submission_dir, submission_name, organism, test): if "processed-ok" in db_status: report.update_gisaid_files(organism=organism, submission_files_dir=submission_files_dir, submission_status_file=submission_status_file) submission_status = submit.submit_gisaid(organism=organism, database=database_name, submission_dir=submission_dir, submission_name=submission_name, config_dict=config_dict["GISAID"], gisaid_cli=gisaid_cli, submission_status_file=submission_status_file, submission_type=submission_type) - submission_id = "" + submission_id = "" # Update status in the submission log create.create_submission_log(database=database_name, submission_position=submission_position, organism=organism, submission_name=submission_name, submission_dir=submission_dir, config_file=config_file, submission_status=submission_status, submission_id=submission_id, table2asn=table2asn, gff_file=gff_file, submission_type=submission_type) # Print out the submission status print("Submission status: " + submission_status, file=sys.stdout) -