Skip to content

Commit

Permalink
feat(ena-submission, ingest): Submit more geolocation information, filter out all versions of an ENA accession (#3183)
Browse files Browse the repository at this point in the history

* Filter out all versions of a submitted accession

* Also submit geoLocAdmin2 and geoLocCity in biosamples

* Add tests

* Fix exception handling and add more logging
  • Loading branch information
anna-parker authored Nov 6, 2024
1 parent cbbbc97 commit 17cfd80
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 9 deletions.
2 changes: 1 addition & 1 deletion ena-submission/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ metadata_mapping:
'geographic location (country and/or sea)':
loculus_fields: [geoLocCountry]
'geographic location (region and locality)':
loculus_fields: [geoLocAdmin1]
loculus_fields: [geoLocAdmin1, geoLocAdmin2, geoLocCity]
'sample capture status':
loculus_fields: [purposeOfSampling]
'host disease outcome':
Expand Down
9 changes: 7 additions & 2 deletions ena-submission/src/ena_deposition/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,22 +119,26 @@ def create_manifest_object(
address.get("country"),
]
address_string = ", ".join([x for x in address_list if x is not None])
logging.debug("Created address from group_info")
except Exception as e:
logging.error(f"Was unable to create address, setting address to center_name due to {e}")

metadata = submission_table_entry["metadata"]
unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"]
organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
logging.debug("Created chromosome list object")
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
logging.debug("Created chromosome list file")
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
except ValueError as err:
logging.debug("Reformatted authors")
except Exception as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
logger.error(msg)
logging.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
Expand Down Expand Up @@ -163,6 +167,7 @@ def create_manifest_object(
organism=organism,
dir=dir,
)
logging.debug("Created flatfile")
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
)
Expand Down
2 changes: 1 addition & 1 deletion ena-submission/src/ena_deposition/create_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row:
else:
continue
else:
value = ";".join(
value = "; ".join(
[str(metadata) for metadata in loculus_metadata_field_values if metadata]
)
if value:
Expand Down
6 changes: 3 additions & 3 deletions ena-submission/test/approved_ena_submission_list_test.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"bodyProduct": null,
"displayName": "Pakistan/LOC_0001TLY.1/2023-08-26",
"foodProduct": null,
"geoLocCity": null,
"geoLocCity": "Rawalpindi",
"geoLocSite": null,
"hostAgeBin": null,
"hostDisease": null,
Expand All @@ -36,8 +36,8 @@
"passageNumber": null,
"travelHistory": null,
"anatomicalPart": null,
"geoLocAdmin1": null,
"geoLocAdmin2": null,
"geoLocAdmin1": "Punjab",
"geoLocAdmin2": "Rawalpindi",
"geoLocLatitude": null,
"geoLocLongitude": null,
"geoLocCountry": "Pakistan",
Expand Down
4 changes: 4 additions & 0 deletions ena-submission/test/test_sample_request.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
<TAG>geographic location (country and/or sea)</TAG>
<VALUE>Pakistan</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>geographic location (region and locality)</TAG>
<VALUE>Punjab; Rawalpindi; Rawalpindi</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>host health state</TAG>
<VALUE>Hospital care required</VALUE>
Expand Down
6 changes: 4 additions & 2 deletions ingest/scripts/filter_out_depositions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,14 @@ def filter_out_depositions(
df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False)
original_count = len(df)
with open(exclude_insdc_accessions, encoding="utf-8") as f:
loculus_insdc_accessions = [line.strip() for line in f]
loculus_insdc_accessions: set = {line.strip().split(".")[0] for line in f} # Remove version

with open(exclude_biosample_accessions, encoding="utf-8") as f:
loculus_biosample_accessions = [line.strip() for line in f]

filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)]
filtered_df = df[
~df["genbankAccession"].str.split(".").str[0].isin(loculus_insdc_accessions)
] # Filter out all versions of an accession
filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)]
logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.")
filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False)
Expand Down
Binary file modified ingest/tests/test_data_cchf/ncbi_dataset.zip
Binary file not shown.

0 comments on commit 17cfd80

Please sign in to comment.