Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V1.2.1. Release #69

Merged
merged 17 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/shiny/*
/vignettes/*
/docs/*
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ submit.ready
*report.xml
test_input/test_metadata.tsv
upload_log.csv
submission_log.csv
*.vscode
*.Rproj
*.Rhistory
Expand All @@ -19,3 +20,7 @@ docker-compose-*.yaml

# ignore folders
**/.Rproj.user
**/test_data/*
**/gisaid_cli/*
**/COV_TEST_DATA/*
**/FLU_TEST_DATA/*
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES

<p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>

**Beta Version**: v1.2.0. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!
**Beta Version**: v1.2.1. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!

**General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health-related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm). GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

</p>

**Beta Version**: 1.2.0. This pipeline is currently in Beta testing, and
**Beta Version**: 1.2.1. This pipeline is currently in Beta testing, and
issues could appear during submission. Please use it at your own risk.
Feedback and suggestions are welcome\!

Expand Down
2 changes: 1 addition & 1 deletion argument_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def args_parser():
required=True)
file_parser.add_argument("--fasta_file",
help="Fasta file used to generate submission files; fasta header should match the column 'sequence_name' stored in your metadata. Input either full file path or if just file name it must be stored at '<submission_dir>/<submission_name>/<fasta_file>'.",
required=True)
default = None)
file_parser.add_argument("--table2asn",
help="Perform a table2asn submission instead of GenBank FTP submission for organism choices 'FLU' or 'COV'.",
required=False,
Expand Down
70 changes: 47 additions & 23 deletions biosample_sra_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
column_ordered = ["sample_name","library_ID"]
prefix = "sra-"
# Create SRA specific fields
metadata["sra-title"] = config_dict["Description"]["Title"]
filename_cols = [col for col in metadata.columns.tolist() if re.match("sra-file_[1-9]\d*", col)]
# Correct index for filename column
for col in filename_cols:
Expand All @@ -69,8 +68,8 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
rename_columns[col] = col.replace("sra-file_", "sra-filename")
elif "BIOSAMPLE" in database:
metadata_regex = "^bs-|^organism$|^collection_date$"
rename_columns = {"bs-description":"sample_title","bioproject":"bioproject_accession"}
drop_columns = ["bs-package"]
rename_columns = {"bioproject":"bioproject_accession"}
drop_columns = ["bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"]
column_ordered = ["sample_name"]
prefix = "bs-"
else:
Expand All @@ -92,14 +91,31 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
file_handler.save_csv(df=database_df, file_path=submission_dir, file_name="metadata.tsv", sep="\t")

# Create submission XML
def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame, failed_seqs_auto_removed: bool = True) -> bytes:
def create_submission_xml(organism: str, database: str, submission_name: str, config_dict: Dict[str, Any], metadata: pd.DataFrame) -> bytes:
# Submission XML header
root = etree.Element("Submission")
description = etree.SubElement(root, "Description")
title = etree.SubElement(description, "Title")
title.text = config_dict["Description"]["Title"]
comment = etree.SubElement(description, "Comment")
comment.text = config_dict["Description"]["Comment"]
if "BIOSAMPLE" in database:
if "bs-title" in metadata and pd.notnull(metadata["bs-title"].iloc[0]) and metadata["bs-title"].iloc[0].strip() != 0:
title.text = metadata["bs-title"].iloc[0]
else:
title.text = submission_name + "-BS"
comment = etree.SubElement(description, "Comment")
if "bs-comment" in metadata and pd.notnull(metadata["bs-comment"].iloc[0]) and metadata["bs-comment"].iloc[0].strip() != 0:
comment.text = metadata["bs-comment"].iloc[0]
else:
comment.text = "BioSample Submission"
elif "SRA" in database:
if "sra-title" in metadata and pd.notnull(metadata["sra-title"].iloc[0]) and metadata["sra-title"].iloc[0].strip() != 0:
title.text = metadata["sra-title"].iloc[0]
else:
title.text = submission_name + "-SRA"
comment = etree.SubElement(description, "Comment")
if "sra-comment" in metadata and pd.notnull(metadata["sra-comment"].iloc[0]) and metadata["sra-comment"].iloc[0].strip() != 0:
comment.text = metadata["sra-comment"].iloc[0]
else:
comment.text = "SRA Submission"
# Description info including organization and contact info
organization = etree.SubElement(description, "Organization", type=config_dict["Description"]["Organization"]["Type"], role=config_dict["Description"]["Organization"]["Role"])
org_name = etree.SubElement(organization, "Name")
Expand All @@ -125,13 +141,18 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
sampleid = etree.SubElement(biosample, "SampleId")
spuid = etree.SubElement(sampleid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = row["bs-sample_name"]
descriptor = etree.SubElement(biosample, "Descriptor")
title = etree.SubElement(descriptor, "Title")
title.text = row["bs-description"]
if ("bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "") or ("bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != ""):
descriptor = etree.SubElement(biosample, "Descriptor")
if "bs-sample_title" in metadata and pd.notnull(row["bs-sample_title"]) and row["bs-sample_title"].strip != "":
sample_title = etree.SubElement(descriptor, "Title")
sample_title.text = row["bs-sample_title"]
if "bs-sample_description" in metadata and pd.notnull(row["bs-sample_description"]) and row["bs-sample_description"].strip != "":
sample_description = etree.SubElement(descriptor, "Description")
sample_description.text = row["bs-sample_description"]
organismxml = etree.SubElement(biosample, "Organism")
organismname = etree.SubElement(organismxml, "OrganismName")
organismname.text = row["organism"]
if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
if "bioproject" in metadata and pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
bioproject = etree.SubElement(biosample, "BioProject")
primaryid = etree.SubElement(bioproject, "PrimaryId", db="BioProject")
primaryid.text = row["bioproject"]
Expand All @@ -140,10 +161,12 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
# Attributes
attributes = etree.SubElement(biosample, "Attributes")
# Remove columns with bs-prefix that are not attributes
biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-description"])]
biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-title", "bs-comment", "bs-sample_title", "bs-sample_description"])]
for col in biosample_cols:
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
# Add collection date to Attributes
attribute = etree.SubElement(attributes, "Attribute", attribute_name="collection_date")
attribute.text = row["collection_date"]
Expand Down Expand Up @@ -174,20 +197,21 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
datatype = etree.SubElement(file, "DataType")
datatype.text = "generic-data"
# Remove columns with sra- prefix that are not attributes
sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-file_location|sra-file_\d*)", col)]
sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-title|sra-comment|sra-file_location|sra-file_\d*)", col)]
for col in sra_cols:
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioProject")
refid = etree.SubElement(attribute_ref_id, "RefId")
primaryid = etree.SubElement(refid, "PrimaryId")
primaryid.text = row["bioproject"]
if config_dict["Link_Sample_Between_NCBI_Databases"] and metadata.columns.str.contains("bs-sample_name").any():
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
refid = etree.SubElement(attribute_ref_id, "RefId")
spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioSample")
refid = etree.SubElement(attribute_ref_id, "RefId")
spuid = etree.SubElement(refid, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = metadata.loc[metadata["sra-sample_name"] == row["sra-sample_name"], "bs-sample_name"].iloc[0]
identifier = etree.SubElement(addfiles, "Identifier")
spuid = etree.SubElement(identifier, "SPUID", spuid_namespace=config_dict["Spuid_Namespace"])
spuid.text = row["sra-sample_name"]
Expand All @@ -209,7 +233,7 @@ def create_biosample_sra_submission(organism: str, database: str, submission_nam
create_raw_reads_list(submission_dir=submission_dir, raw_files_list=raw_files_list)
manual_df = metadata.copy()
create_manual_submission_files(database=database, submission_dir=submission_dir, metadata=manual_df, config_dict=config_dict)
xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict, failed_seqs_auto_removed=True)
xml_str = create_submission_xml(organism=organism, database=database, submission_name=submission_name, metadata=metadata, config_dict=config_dict)
file_handler.save_xml(xml_str, submission_dir)

# Read xml report and get status of the submission
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Beta-lactamase_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-strain": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -124,6 +144,30 @@
description="The geographical coordinates of the location where the sample was collected. Specify as degrees latitude and longitude in format \"d[d.dddd] N|S d[dd.dddd] W|E\", eg, 38.98 N 77.11 W",
title="latitude and longitude",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=[
Check(lambda df: ~(df["bs-strain"].isnull() & df["bs-isolate"].isnull()), ignore_na = False),
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Human_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-isolate": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -220,6 +240,30 @@
required=False,
title="treatment",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=None,
index=None,
Expand Down
44 changes: 44 additions & 0 deletions config/biosample/Invertebrate_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,26 @@
description="Identifier name used for BioSample. Max length is 50 characters.",
title="sample_name",
),
"bs-sample_title": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional additional title for sample. Will be autogenerated by NCBI if not provided.",
title="sample title",
),
"bs-sample_description": Column(
dtype="object",
checks=None,
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional description for sample.",
title="sample description",
),
"bs-isolate": Column(
dtype="object",
checks=None,
Expand Down Expand Up @@ -204,6 +224,30 @@
description="temperature of the sample at time of sampling",
title="temperature",
),
"bs-title": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field for how the BioSample submission should be named when viewed from the NCBI submission portal. If not provided, when performing submissions <--submission_name> with the suffix \"-BS\" will be used instead.",
title="biosample submission portal name",
),
"bs-comment": Column(
dtype="object",
checks=[
Check(lambda s: s.nunique() == 1),
],
nullable=True,
unique=False,
coerce=False,
required=False,
description="Optional internal field explaining the purpose of the submission for when interacting and resolving submission issues with NCBI.",
title="biosample submission portal description",
)
},
checks=[
Check(lambda df: ~(df["bs-isolate"].isnull() & df["bs-breed"].isnull()), ignore_na = False),
Expand Down
Loading
Loading