From 17cfd8019d8a68cf2681aa476b16441c60afb8ce Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Wed, 6 Nov 2024 13:28:44 +0100
Subject: [PATCH] feat(ena-submission, ingest): Submit more geolocation
 information, filter out all versions of an ENA accession (#3183)

* Filter out all versions of a submitted accession
* Also submit geoLocAdmin2 and geoLocCity in biosamples
* Add tests
* Fix exception handling and add more logging
---
ena-submission/config/defaults.yaml | 2 +-
.../src/ena_deposition/create_assembly.py | 9 +++++++--
.../src/ena_deposition/create_sample.py | 2 +-
.../approved_ena_submission_list_test.json | 6 +++---
ena-submission/test/test_sample_request.xml | 4 ++++
ingest/scripts/filter_out_depositions.py | 6 ++++--
ingest/tests/test_data_cchf/ncbi_dataset.zip | Bin 3057 -> 2127 bytes
7 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml
index 16d959bfe..b1acab7ae 100644
--- a/ena-submission/config/defaults.yaml
+++ b/ena-submission/config/defaults.yaml
@@ -30,7 +30,7 @@ metadata_mapping:
'geographic location (country and/or sea)':
loculus_fields: [geoLocCountry]
'geographic location (region and locality)':
- loculus_fields: [geoLocAdmin1]
+ loculus_fields: [geoLocAdmin1, geoLocAdmin2, geoLocCity]
'sample capture status':
loculus_fields: [purposeOfSampling]
'host disease outcome':
diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py
index c12a3aa57..ccf61e67f 100644
--- a/ena-submission/src/ena_deposition/create_assembly.py
+++ b/ena-submission/src/ena_deposition/create_assembly.py
@@ -119,6 +119,7 @@ def create_manifest_object(
address.get("country"),
]
address_string = ", ".join([x for x in address_list if x is not None])
+ logging.debug("Created address from group_info")
except Exception as e:
logging.error(f"Was unable to create address, setting address to center_name due to {e}")
@@ -126,15 +127,18 @@ def create_manifest_object(
unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"]
organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
+ logging.debug("Created chromosome list object")
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
+ logging.debug("Created chromosome list file")
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
- except ValueError as err:
+ logging.debug("Reformatted authors")
+ except Exception as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
- logger.error(msg)
+ logging.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
@@ -163,6 +167,7 @@ def create_manifest_object(
organism=organism,
dir=dir,
)
+ logging.debug("Created flatfile")
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
)
diff --git a/ena-submission/src/ena_deposition/create_sample.py b/ena-submission/src/ena_deposition/create_sample.py
index f685ea472..9b3e1f12f 100644
--- a/ena-submission/src/ena_deposition/create_sample.py
+++ b/ena-submission/src/ena_deposition/create_sample.py
@@ -67,7 +67,7 @@ def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row:
else:
continue
else:
- value = ";".join(
+ value = "; ".join(
[str(metadata) for metadata in loculus_metadata_field_values if metadata]
)
if value:
diff --git a/ena-submission/test/approved_ena_submission_list_test.json b/ena-submission/test/approved_ena_submission_list_test.json
index 4139efce8..17da088e3 100644
--- a/ena-submission/test/approved_ena_submission_list_test.json
+++ b/ena-submission/test/approved_ena_submission_list_test.json
@@ -15,7 +15,7 @@
"bodyProduct": null,
"displayName": "Pakistan/LOC_0001TLY.1/2023-08-26",
"foodProduct": null,
- "geoLocCity": null,
+ "geoLocCity": "Rawalpindi",
"geoLocSite": null,
"hostAgeBin": null,
"hostDisease": null,
@@ -36,8 +36,8 @@
"passageNumber": null,
"travelHistory": null,
"anatomicalPart": null,
- "geoLocAdmin1": null,
- "geoLocAdmin2": null,
+ "geoLocAdmin1": "Punjab",
+ "geoLocAdmin2": "Rawalpindi",
"geoLocLatitude": null,
"geoLocLongitude": null,
"geoLocCountry": "Pakistan",
diff --git a/ena-submission/test/test_sample_request.xml b/ena-submission/test/test_sample_request.xml
index 7c1c63456..4911524a4 100644
--- a/ena-submission/test/test_sample_request.xml
+++ b/ena-submission/test/test_sample_request.xml
@@ -27,6 +27,10 @@
geographic location (country and/or sea)
Pakistan
+
+ geographic location (region and locality)
+ Punjab; Rawalpindi; Rawalpindi
+
host health state
Hospital care required
diff --git a/ingest/scripts/filter_out_depositions.py b/ingest/scripts/filter_out_depositions.py
index 04e288152..b8c49f643 100644
--- a/ingest/scripts/filter_out_depositions.py
+++ b/ingest/scripts/filter_out_depositions.py
@@ -72,12 +72,14 @@ def filter_out_depositions(
df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False)
original_count = len(df)
with open(exclude_insdc_accessions, encoding="utf-8") as f:
- loculus_insdc_accessions = [line.strip() for line in f]
+ loculus_insdc_accessions: set = {line.strip().split(".")[0] for line in f} # Remove version
with open(exclude_biosample_accessions, encoding="utf-8") as f:
loculus_biosample_accessions = [line.strip() for line in f]
- filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)]
+ filtered_df = df[
+ ~df["genbankAccession"].str.split(".").str[0].isin(loculus_insdc_accessions)
+ ] # Filter out all versions of an accession
filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)]
logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.")
filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False)
diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip
index a94f72b44a34d240c665de30ef55def558eb7a2a..a1e627611b04248e2f9c289d3342dfee394452b7 100644
GIT binary patch
literal 2127
zcmai#X*kqt8^Hg~LPmtcVUQ&wLiVwYn1(|Qbxf!zdzkFY*tatF-Jp`KF~*vb$TFcJ
z&TB7QXe1mLS*FI}l~OsT(}(wR)p?)ix|ip}{d?|jKb#pD0t48mH^t26Tk}2f0OEkp
zRcChx7pFicf@`441uHH9Ty4mOXGc&dH*f&5eEjLLq{a)jlTJCB4!Yt&anR1?9)
z$E(nlS}l*?DvtX62c}q(X6F*JmUjCd@l2N-TgNaG
zBN@4a6G9Ug4Ilhsl1zKU?`GX&%M!Y)NY}|YdOD^|Pt{2z4(5WEgAOLqt0OumKWJ*`
z^uxz%y7G5N4sllx94ij?QSvjEZCjuqZgNe%BDpJzB$N_k0d
z95d*_&XVxfjYxm+@{lm7HXy&q%Y^CAgN*%9(~u2PAq>2tHFIY4&i|@aDeXqJxiM~o
zX^takYAOB|a?aX>*Z-zAiSP-7N+NPJ69I>#cK|6$6^oXlumQa9$?(mrZ)~lx>
z4}-fsNl1~CV5M2!gR4%BUgaw`SlI=_0rzQ%mp6%zs9}41=6lU&Al3^yFO`vj?k=l;6NT#$6j5*RptxsdZZoz8fFB
zf(VYJZrzbCr##lVWFDBx-IzJ_j{e_Uy@38-BjB+YT@tfUyoSwNVAm?g&oJLJOi>m3
z6;(fT4`QT3(qz`<7Ue@#aEl!`iwz!jy`C)!i`pE;@{}2*5ZmW8pSdTDMgtYHP78Ud
z`Ew1|Q%5DHr1JTZlDremv#wn6wT_Y6n{C@_raU>4v<_Z~rk_$tDL$wQq691IdVStQ
zC2aK`R`?Tvb&8hlX6D%)=JbWv!t#kZ<+iO{<~owkM^=qdf9!>zpZj1*9GSE4wRoY{
z*rqtxXp|Vf?r^pvW4Djc4u)o~z(c1{h~_L^+!_VwWLjUf;8%KNTQ+9E?|HH`edLVm
z&1dSAQO_#DdXFgmXY@wjf+Y$5h5ScD#v7_N##1ms@{GF%>y}l2;col{fcE52zEVqH@y_H+wl^=)oAI$u2HD~@%O&Y`
z#W$)L#ivI9RAkY^F+#R$9>#Af6v>hxZIvuuaRL4ZvK}0vbrnLJo4nPiF&^7nge1Fg^J&<+SnHjr%W%@uxyMeg!SWhu265$K{-sUx*;Z2I94*f#p77sE~*D
zdRAs6Qbz_O9jbs>M=}GGO2#p=mgU=w)Xv|``ngUl%Y;EvTP~Y2jq<$;OhK1kKbP*h
z>zzz}`TIkj5r%^R>Z?p#Imwo@8ue1mR^#DgNRvV-;nK$OjcEY%h^k&
z_ejaarX5mO@981eW@B(NXe_jBXGi|5T`4gQrmpZd;vTtlj{5YyPUWhokc(B;d5j4Z
zzSq2{eLzIrike9whsoU+gF6S3hK=8prw&X=m!_sEtJ8Yk9xY9c4zv3<;&2hOyJxop
zAPMl#G%503n0x>e!*2IU6Z(%d_j%~wkOIK}B2Vu}ei!J!aet=If4F`g-xmrlJ@scY!}0&Cuio(d6`mq=XN?
zK4BmD!#V;Qm?lUts|!do7Aguf|LB(NVCqv;bPQ1QIL@Q+%q{p(hk&^6x${@Obu>D%NvT0i~tQROxVDGm1~ZX=CfrqfEw8
zQ$_daetiDea?i_!3O)Nu%pN4}J(6@*weHA{zzG^1pGDWEoSQypr2;?CPp(%z9yNze
z8(tr|<@D4{r9>&*@k>Uds$htDlMQQ;;*OT)&q3^!9@ar-@!>JioICv_RA1HU8yt3e
z*0lad4CApv5lb$eyUm=36+#sL&Ji(DW?KD@Gx`4IGY@pqLpBR#|MNE1Sz$E0;?~ZM
z2SlfS>FS;oBIRRR^k?SzudYJ+H)cIA@w<|sK9Rlf*Nu(xn^M2XY)ihZmC94(CL}R+
zk=j{-{6*K7q#WyRN)almzR-6=qf>Z8(f&^1>%RY7I`f+sHf?A1YeJFJ9;*B*Y
z?wck`S{@3$=Wrzprb}`*_f^s49|6#_m?H2Z(O#!hId-lhP{*m*#X=dS+)vl=!qAMa*
zy0tf?{KD#+yY8khoNLu7E5EWftz~6~?zyYNcON#rUA(!vU~j6CgLPb0OO&OWs<_f|
z?p5m^S~f~AsFi)7&0F*HZJGS)%C@UKtN+?Z>VBwszkKqo*9u?$Y-z7k4QkF>S5&lD
zack4G7SB2$J&Ehs`j2f_@;)%_VP)oDqITkWQwQ(Lsjm-Bn5DQiU(NJBi2kp>a^VGc
z`?*&FUM<#;knl5}lV~?Z9TATubEmjzt(KW4I5*CF>5S0VvnAQK)n2#x
z-l1SVrBrWrd!X*AumucX{bo0oFNu2gM}H67k4obL&+gR2yEC)7EuzD8eGgw2{25Ys
zO|ty9>GUb4{(F~eR9shYYvN_KkJCa0dhsuQQ)i<3oD{WifRcudG`VFfTCW3^ZEx)PwoLq5eiO
zZc{b84mnZ7v)i4x#hC80MsrLMf3>kHM!>Hu;KX+&_Lt{!b}(L=s?Vi$CGU}<{Licc
zJ(ku7D<>SezU8uDTNwLPpP7&H^|^#~b=9{lvHrv%R=jn}E9Y4`v%Q$Y@17StGC#L-
z$u++Xyq9jr3t9c+bGq!YjyY8JwZuBf8&OTsqH%$Tw0K3{1-tq;?|;bcB4FROP%!S2
zf6YPjUyR4nSf3aF-GAIqXupHx-bUZ;mizo!Vi}!n^_%0=FB-`Hu>GuhsHW4v=mq=9
z1qV(jC^z@;7#W>7v0(iq#{gw}Pu8V>?YM%kuTWUr)Wh}bp4N*B6Ui^K?2@7>Ark@}
z-B{s_zpu1j$b;(Qi|gj8aI-NmfNB^-GXlBp0oBraIO|Sur3|S+aaThF*t%e35@E*O
z$^x}JK;W$-h(u`=pfm?S?Jl%>G{75TG_2Ky+^7Myx?tdKqc@OAW*ZD<3CJ*P?IXA)
zj%