Skip to content

Commit

Permalink
Bug fix for gisaid name overwriting genbank name for fasta file (#53)
Browse files Browse the repository at this point in the history
* Bug fix for gisaid name overwriting genbank name for fasta file
  • Loading branch information
dthoward96 committed Apr 15, 2024
1 parent 67d14c9 commit ec6fc19
Show file tree
Hide file tree
Showing 11 changed files with 65 additions and 3,560 deletions.
29 changes: 13 additions & 16 deletions create.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import submit

# Create directory and files for NCBI database submissions
def create_ncbi_submission(organism, database, submission_name, submission_dir, config_dict, metadata, fasta_file=None, table2asn=False, gff_file=None):
def create_ncbi_submission(organism, database, submission_name, submission_dir, config_dict, metadata, table2asn=False, gff_file=None):
# Create a database subfolder within the submission directory to dump all submission files
submission_files_dir = os.path.join(submission_dir, submission_name, "submission_files", database)
# Create submission files directory
Expand All @@ -47,11 +47,11 @@ def create_ncbi_submission(organism, database, submission_name, submission_dir,
elif "GENBANK" in database:
sequence_names = metadata["gb-seq_id"]
# Create genbank specific files
create_genbank_files(organism=organism, submission_name=submission_name, submission_files_dir=submission_files_dir, config_dict=config_dict, metadata=metadata, fasta_file=fasta_file)
create_genbank_files(organism=organism, submission_name=submission_name, submission_files_dir=submission_files_dir, config_dict=config_dict, metadata=metadata)
# If using Table2asn do not generate extra genbank files
if table2asn == True:
create_genbank_table2asn(submission_name=submission_name, submission_files_dir=submission_files_dir, gff_file=gff_file)
return
return
else:
# If FTP upload for Genbank, create ZIP file for upload if table2asn is set to False
create_genbank_zip(submission_name=submission_name, submission_files_dir=submission_files_dir)
Expand All @@ -62,7 +62,7 @@ def create_ncbi_submission(organism, database, submission_name, submission_dir,
print("Files are stored at: " + os.path.join(submission_files_dir), file=sys.stdout)

# Create directory and files for GISAID submission
def create_gisaid_submission(organism, database, submission_name, submission_dir, config_dict, metadata, fasta_file):
def create_gisaid_submission(organism, database, submission_name, submission_dir, config_dict, metadata):
# Create a database subfolder within the submission directory to dump all submission files
submission_files_dir = os.path.join(submission_dir, submission_name, "submission_files", database)
# Create submission files directory
Expand Down Expand Up @@ -96,11 +96,11 @@ def create_gisaid_submission(organism, database, submission_name, submission_dir
# Create submission files
gisaid_df.to_csv(os.path.join(submission_files_dir, "metadata.csv"), index=False, sep=",")
shutil.copy(os.path.join(submission_files_dir, "metadata.csv"), os.path.join(submission_files_dir, "orig_metadata.csv"))
create_fasta(organism=organism, database="GISAID", metadata=metadata, submission_files_dir=submission_files_dir, fasta_file=fasta_file)
create_fasta(organism=organism, database="GISAID", metadata=metadata, submission_files_dir=submission_files_dir)
shutil.copy(os.path.join(submission_files_dir, "sequence.fsa"), os.path.join(submission_files_dir, "orig_sequence.fsa"))
print("\n"+"Creating submission files for " + database, file=sys.stdout)
print("Files are stored at: " + os.path.join(submission_files_dir), file=sys.stdout)

def create_submission_xml(organism, database, submission_name, config_dict, metadata, failed_seqs_auto_removed=True):
# Submission XML header
root = etree.Element("Submission")
Expand Down Expand Up @@ -376,26 +376,23 @@ def create_authorset(config_dict, metadata, submission_name, submission_files_di
f.write("}\n")

# Create fasta file based on database
def create_fasta(organism, database, metadata, fasta_file, submission_files_dir):
# Make sure sequence name is found in fasta file header
fasta_df = process.process_fasta_samples(metadata=metadata, fasta_file=fasta_file)
# Now replace fasta header with appropriate sequence ids
def create_fasta(organism, database, metadata, submission_files_dir):
# Extract the required fields for specified database
db_required_colnames = process.get_required_colnames(database=database, organism=organism)
db_required_colnames = process.get_required_colnames(database=[database], organism=organism)
# Get the sample names with "#" symbol
sample_colname = list(filter(lambda x: ("#" in x)==True, db_required_colnames))[0].replace("#","").replace("*","")
# Create fasta file
records = []
for index, row in fasta_df.iterrows():
for index, row in metadata.iterrows():
records.append(SeqRecord(row["fasta_sequence_orig"], id = row[sample_colname], description = ""))
with open(os.path.join(submission_files_dir, "sequence.fsa"), "w+") as f:
SeqIO.write(records, f, "fasta")

# Create a zip file for genbank submission
def create_genbank_files(organism, config_dict, metadata, fasta_file, submission_name, submission_files_dir):
def create_genbank_files(organism, config_dict, metadata, submission_name, submission_files_dir):
# Create authorset file
create_authorset(config_dict=config_dict, metadata=metadata, submission_name=submission_name, submission_files_dir=submission_files_dir)
create_fasta(organism=organism, database="GENBANK", metadata=metadata, fasta_file=fasta_file, submission_files_dir=submission_files_dir)
create_fasta(organism=organism, database="GENBANK", metadata=metadata, submission_files_dir=submission_files_dir)
# Retrieve the source df
source_df = metadata.filter(regex="^gb-seq_id$|^src-|^ncbi-spuid$|^ncbi-bioproject$|^organism$|^collection_date$").copy()
source_df.columns = source_df.columns.str.replace("src-","").str.strip()
Expand Down Expand Up @@ -465,9 +462,9 @@ def create_submission_log(database, submission_position, organism, submission_na
df = pd.DataFrame(columns = ["Submission_Name", "Organism", "Database", "Submission_Position", "Submission_Type", "Submission_Date", "Submission_Status", "Submission_Directory", "Config_File", "Table2asn", "GFF_File", "Update_Date"])
# Fill in the log field if it exists, otherwise create new
df_partial = df.loc[(df["Organism"] == organism) & (df["Database"] == database) & (df["Submission_Directory"] == submission_dir) & (df["Submission_Name"] == submission_name) & (df["Submission_Type"] == submission_type)]
# Update existing field
# Update existing field
if df_partial.shape[0] > 0:
df.loc[df_partial.index.values, "Submission_Position"] = submission_position
df.loc[df_partial.index.values, "Submission_Position"] = submission_position
df.loc[df_partial.index.values, "Submission_Status"] = submission_id + ";" + submission_status
df.loc[df_partial.index.values, "Table2asn"] = table2asn
df.loc[df_partial.index.values, "GFF_File"] = gff_file
Expand Down
6 changes: 6 additions & 0 deletions process.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ def get_metadata(database, organism, metadata_file):
required_colnames = [re.sub("[*?#&]", "", x) for x in db_required_colnames]
# Temporary fix: remove the isolate columns from the required column names
required_colnames = [x for x in required_colnames if "-isolate" not in x]
# Bug fix: drop fields from the COV required columns that are only required for FLU submissions
if organism == "COV":
if "bs-lat_lon" in required_colnames:
required_colnames.remove("bs-lat_lon")
if "src-serotype" in required_colnames:
required_colnames.remove("src-serotype")
# Check that all required column names exist in the metadata file
if not set(required_colnames).issubset(set(metadata.columns)):
failed_required_colnames = list(filter(lambda x: (x in metadata.columns)==False, required_colnames))
Expand Down
37 changes: 19 additions & 18 deletions seqsender.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def start(command, database, organism, submission_dir, submission_name, config_f
print("There is no GISAID CLI package for " + organism + " located at "+ gisaid_cli, file=sys.stderr)
print("Please download the GISAID " + organism + " CLI package from the GISAID platform", file=sys.stderr)
print("Extract the zip file and place a copy of the CLI binary at "+ gisaid_cli, file=sys.stderr)
sys.exit(1)
sys.exit(1)
# Determine whether this is a test or production submission
if test is True:
submission_type = "Test"
Expand All @@ -82,13 +82,14 @@ def start(command, database, organism, submission_dir, submission_name, config_f
config_dict = process.get_config(config_file=config_file, database=database)
# Check metadata file
metadata = process.get_metadata(database=database, organism=organism, metadata_file=metadata_file)
metadata = process.process_fasta_samples(metadata = metadata, fasta_file = fasta_file)
# Create identifier for each database to store submitting samples in submission status worksheet
identifier_columns = dict()
# Prepping submission files for each given database
for database_name in database:
if database_name in ["BIOSAMPLE", "SRA", "GENBANK"]:
identifier_columns.update({"ncbi-spuid": "ncbi-sample_name"})
create.create_ncbi_submission(organism=organism, database=database_name, submission_name=submission_name, submission_dir=submission_dir, config_dict=config_dict["NCBI"], metadata=metadata, fasta_file=fasta_file, table2asn=table2asn, gff_file=gff_file)
create.create_ncbi_submission(organism=organism, database=database_name, submission_name=submission_name, submission_dir=submission_dir, config_dict=config_dict["NCBI"], metadata=metadata, table2asn=table2asn, gff_file=gff_file)
if "GENBANK" in database_name:
identifier_columns.update({"gb-seq_id": "ncbi-sequence_name"})
elif "GISAID" in database_name:
Expand All @@ -97,7 +98,7 @@ def start(command, database, organism, submission_dir, submission_name, config_f
identifier_columns.update({"gs-seq_id": "gs-sequence_name"})
elif "COV" in organism:
identifier_columns.update({"gs-virus_name": "gs-sample_name"})
create.create_gisaid_submission(organism=organism, database=database_name, submission_name=submission_name, submission_dir=submission_dir, config_dict=config_dict["GISAID"], metadata=metadata, fasta_file=fasta_file)
create.create_gisaid_submission(organism=organism, database=database_name, submission_name=submission_name, submission_dir=submission_dir, config_dict=config_dict["GISAID"], metadata=metadata)
else:
print("Error: Database " + database_name + " is not a valid database selection.", file=sys.stderr)
sys.exit(1)
Expand All @@ -109,7 +110,7 @@ def start(command, database, organism, submission_dir, submission_name, config_f
if command == "submit":
for database_name in database:
# BioSample and SRA can be submitted together but to add accessions to GenBank they must be fully processed
if database_name in ["BIOSAMPLE", "SRA"]:
if database_name in ["BIOSAMPLE", "SRA"]:
if ("GISAID" in database) and (int(config_dict["GISAID"]["Submission_Position"]) == 1):
submission_position = 2
else:
Expand Down Expand Up @@ -193,10 +194,10 @@ def args_parser():
required=True)
submission_name_parser.add_argument("--submission_name",
help='Name of the submission',
required=True)
required=True)
submission_dir_parser.add_argument("--submission_dir",
help='Directory to where all required files (such as metadata, fasta, etc.) are stored',
required=True)
required=True)
config_file_parser.add_argument("--config_file",
help="Config file stored in submission directory",
required=True)
Expand All @@ -221,16 +222,16 @@ def args_parser():
else:
file_parser.add_argument("--fasta_file",
help="Fasta file stored in submission directory",
required=False)
# If genbank in the database list, determine whether to prepare table2asn submission
required=False)

# If genbank in the database list, determine whether to prepare table2asn submission
if any(x in database_args for x in ["genbank"]):
table2asn_parser.add_argument("--table2asn",
help="Whether to prepare a Table2asn submission.",
required=False,
action="store_const",
default=False,
const=True)
const=True)
# Optional: add annotation to table2asn submission
gff_parser.add_argument("--gff_file",
help="An annotation file to add to a Table2asn submission",
Expand Down Expand Up @@ -259,7 +260,7 @@ def args_parser():
update_module = subparser_modules.add_parser(
'check_submission_status',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Check existing process of a submission',
description='Check existing process of a submission',
parents=[submission_dir_parser, submission_name_parser, organism_parser, test_parser]
)

Expand Down Expand Up @@ -291,13 +292,13 @@ def main():
# Determine which databases and required files were given in the command
database = []
if "biosample" in args:
database += [args.biosample]
database += [args.biosample]
if "sra" in args:
database += [args.sra]
database += [args.sra]
if "genbank" in args:
database += [args.genbank]
database += [args.genbank]
if "gisaid" in args:
database += [args.gisaid]
database += [args.gisaid]
if "organism" in args:
organism = args.organism
if "submission_name" in args:
Expand Down Expand Up @@ -331,9 +332,9 @@ def main():
else:
test = False

# Get database list
# Get database list
database = [x for x in database if x]

# Execute the command
if command in ["prep", "submit"]:
# If database is not given, display help
Expand All @@ -359,7 +360,7 @@ def main():
parser.print_help()
sys.exit(0)

# Print out the execution time
# Print out the execution time
get_execution_time()

if __name__ == "__main__":
Expand Down
27 changes: 13 additions & 14 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,26 @@

# Create example templates for testing
def create_zip_template(organism, database, submission_dir, submission_name):
# Create output directory
# Create output directory
submission_dir = os.path.abspath(submission_dir)
out_dir = os.path.join(submission_dir, submission_name)
os.makedirs(out_dir, exist_ok = True)
# Create sra directory
out_sra_dir = os.path.join(out_dir, "raw_reads")
# Create a list of files to output
out_metadata_file = os.path.join(out_dir, "metadata.csv")
out_config_file = os.path.join(out_dir, "config.yaml")
out_sequence_file = os.path.join(out_dir, "sequence.fasta")
out_fastq_1_r1_file = os.path.join(out_sra_dir, "fastq_1_R1.fastq.gz")
out_fastq_1_r2_file = os.path.join(out_sra_dir, "fastq_1_R2.fastq.gz")
out_fastq_2_r1_file = os.path.join(out_sra_dir, "fastq_2_R1.fastq.gz")
out_fastq_2_r2_file = os.path.join(out_sra_dir, "fastq_2_R2.fastq.gz")
# Create a list of files to output
out_metadata_file = os.path.join(out_dir, "metadata.csv")
out_config_file = os.path.join(out_dir, "config.yaml")
out_sequence_file = os.path.join(out_dir, "sequence.fasta")
out_fastq_1_r1_file = os.path.join(out_sra_dir, "fastq_1_R1.fastq.gz")
out_fastq_1_r2_file = os.path.join(out_sra_dir, "fastq_1_R2.fastq.gz")
out_fastq_2_r1_file = os.path.join(out_sra_dir, "fastq_2_R1.fastq.gz")
out_fastq_2_r2_file = os.path.join(out_sra_dir, "fastq_2_R2.fastq.gz")
# Create a list of template files to output
temp_config_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_config.yaml")
temp_sequence_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_sequence.fasta")
temp_fastq_1_r1_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_1_R1.fastq.gz")
temp_config_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_config.yaml")
temp_sequence_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_sequence.fasta")
temp_fastq_1_r1_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_1_R1.fastq.gz")
temp_fastq_1_r2_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_1_R2.fastq.gz")
temp_fastq_2_r1_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_2_R1.fastq.gz")
temp_fastq_2_r1_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_2_R1.fastq.gz")
temp_fastq_2_r2_file = os.path.join(PROG_DIR, "template", organism, organism.lower()+"_fastq_2_R2.fastq.gz")
# Print generating message
print("\n"+"Generating submission template", file=sys.stdout)
Expand Down Expand Up @@ -100,4 +100,3 @@ def download_table2asn(table2asn_dir):
print("Downloading table2asn error", file=sys.stderr)
print(error, file=sys.stderr)
sys.exit(1)

6 changes: 3 additions & 3 deletions template/COV/cov_biosample_metadata.csv
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
spuid,spuid_namespace,description,organism,bioproject,bs-isolate,bs-collected_by, bs-collection_date, bs-host, bs-host_disease,bs-isolation_source,bs-geo_loc_name,bs-host_sex,bs-host_age
seq1,CDC-OAMD,CDC Sars CoV2 Sequencing Baseline Constellation,Severe acute respiratory syndrome coronavirus 2,PRJNA512913,SARS-CoV-2/human/USA/GA_2741/2020,Helix,2020-03-28,Homo sapiens,COVID-19,nasal swab,United States: Georgia,Male,28
seq2,CDC-OAMD,CDC Sars CoV2 Sequencing Baseline Constellation,Severe acute respiratory syndrome coronavirus 2,PRJNA512962,SARS-CoV-2/human/USA/GA_3742/2020,Helix,2020-04-29,Homo sapiens,COVID-20,nasal swab,United States: Georgia,Male,45
organism,collection_date,authors,ncbi-spuid,ncbi-spuid_namespace,ncbi-bioproject,bs-isolate,bs-package,bs-description,bs-collected_by, bs-host, bs-host_disease,bs-isolation_source,bs-geo_loc_name,bs-host_sex,bs-host_age
Severe acute respiratory syndrome coronavirus 2,3/28/2020,"Doe, John, R.; Doe, Jane;",SARS-CoV-2/human/USA/GA_2741/2020,CDC-OAMD,PRJNA512913,SARS-CoV-2/human/USA/GA_2741/2020,SARS-CoV-2.cl.1.0,Sars CoV2 Sequencing Baseline Constellation,Helix,Homo sapiens,COVID-19,nasal swab,United States: Georgia,Male,28
Severe acute respiratory syndrome coronavirus 2,4/29/2020,"Doe, John; Doe, Jane;",SARS-CoV-2/human/USA/GA_3742/2020,CDC-OAMD,PRJNA512962,SARS-CoV-2/human/USA/GA_3742/2020,SARS-CoV-2.cl.1.0,Sars CoV2 Sequencing Baseline Constellation,Helix,Homo sapiens,COVID-20,nasal swab,United States: Georgia,Male,45
Binary file added template/COV/cov_fastq_2_R1.fastq.gz
Binary file not shown.
Binary file added template/COV/cov_fastq_2_R2.fastq.gz
Binary file not shown.
6 changes: 3 additions & 3 deletions template/COV/cov_genbank_metadata.csv
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
spuid,spuid_namespace,subm_lab,subm_lab_division,subm_lab_addr,authors,publication_status,publication_title,src-organism,src-isolate,src-collection_date,src-country,src-host,src-isolation_source,src-BioProject,src-BioSample,cmt-StructuredCommentPrefix,cmt-Assembly Method,cmt-Coverage,cmt-Sequencing Technology,cmt-StructuredCommentSuffix
seq1,CDC-OAMD,NIH,NCBI,"10 Center Dr, Bethesda, MD, USA 20895","Doe, John, R.; Doe, Jane;",unpublished,,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/GA_2741/2020,3/28/2020,USA: GA,Homo sapiens,nasal swab,PRJNA512913,SAMN02224951,Assembly-Data,Newbler v. 2.3,100x,Illumina ,Assembly-Data
seq2,CDC-OAMD,NIH,NCBI,"10 Center Dr, Bethesda, MD, USA 20895","Doe, John; Doe, Jane;",unpublished,,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/GA_3742/2020,4/29/2020,USA: GA,Homo sapiens,nasal swab,PRJNA512962,SAMN02224986,Assembly-Data,Newbler v. 2.3,100x,Illumina,Assembly-Data
sequence_name,organism,collection_date,authors,ncbi-spuid,ncbi-spuid_namespace,ncbi-bioproject,gb-seq_id,gb-subm_lab,gb-subm_lab_division,gb-subm_lab_addr,gb-publication_status,gb-publication_title,src-isolate,src-country,src-host,src-isolation_source,cmt-StructuredCommentPrefix,cmt-Assembly Method,cmt-Coverage,cmt-Sequencing Technology,cmt-StructuredCommentSuffix
GA_2741,Severe acute respiratory syndrome coronavirus 2,3/28/2020,"Doe, John, R.; Doe, Jane;",SARS-CoV-2/human/USA/GA_2741/2020,CDC-OAMD,PRJNA512913,SARS-CoV-2/human/USA/GA_2741/2020,NIH,NCBI,"10 Center Dr, Bethesda, MD, USA 20895",unpublished,,SARS-CoV-2/human/USA/GA_2741/2020,USA: GA,Homo sapiens,nasal swab,Assembly-Data,Newbler v. 2.3,100x,Illumina ,Assembly-Data
GA_2742,Severe acute respiratory syndrome coronavirus 2,4/29/2020,"Doe, John; Doe, Jane;",SARS-CoV-2/human/USA/GA_3742/2020,CDC-OAMD,PRJNA512962,SARS-CoV-2/human/USA/GA_3742/2020,NIH,NCBI,"10 Center Dr, Bethesda, MD, USA 20895",unpublished,,SARS-CoV-2/human/USA/GA_3742/2020,USA: GA,Homo sapiens,nasal swab,Assembly-Data,Newbler v. 2.3,100x,Illumina,Assembly-Data
Loading

0 comments on commit ec6fc19

Please sign in to comment.