From 17cfd8019d8a68cf2681aa476b16441c60afb8ce Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:28:44 +0100 Subject: [PATCH] feat(ena-submission, ingest): Submit more geolocation information, filter out all versions of an ENA accession (#3183) * Filter out all versions of a submitted accession * Also submit geoLocAdmin2 and geoLocCity in biosamples * Add tests * Fix exception handling and add more logging --- ena-submission/config/defaults.yaml | 2 +- .../src/ena_deposition/create_assembly.py | 9 +++++++-- .../src/ena_deposition/create_sample.py | 2 +- .../approved_ena_submission_list_test.json | 6 +++--- ena-submission/test/test_sample_request.xml | 4 ++++ ingest/scripts/filter_out_depositions.py | 6 ++++-- ingest/tests/test_data_cchf/ncbi_dataset.zip | Bin 3057 -> 2127 bytes 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml index 16d959bfe..b1acab7ae 100644 --- a/ena-submission/config/defaults.yaml +++ b/ena-submission/config/defaults.yaml @@ -30,7 +30,7 @@ metadata_mapping: 'geographic location (country and/or sea)': loculus_fields: [geoLocCountry] 'geographic location (region and locality)': - loculus_fields: [geoLocAdmin1] + loculus_fields: [geoLocAdmin1, geoLocAdmin2, geoLocCity] 'sample capture status': loculus_fields: [purposeOfSampling] 'host disease outcome': diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py index c12a3aa57..ccf61e67f 100644 --- a/ena-submission/src/ena_deposition/create_assembly.py +++ b/ena-submission/src/ena_deposition/create_assembly.py @@ -119,6 +119,7 @@ def create_manifest_object( address.get("country"), ] address_string = ", ".join([x for x in address_list if x is not None]) + logging.debug("Created address from group_info") except Exception as e: logging.error(f"Was unable to create address, setting address to center_name due to {e}") @@ -126,15 +127,18 @@ def create_manifest_object( unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"] organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"] chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) + logging.debug("Created chromosome list object") chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir) + logging.debug("Created chromosome list file") authors = ( metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown") ) try: authors = reformat_authors_from_loculus_to_embl_style(authors) - except ValueError as err: + logging.debug("Reformatted authors") + except Exception as err: msg = f"Was unable to format authors: {authors} as ENA expects" - logger.error(msg) + logging.error(msg) raise ValueError(msg) from err collection_date = metadata.get("sampleCollectionDate", "Unknown") country = metadata.get("geoLocCountry", "Unknown") @@ -163,6 +167,7 @@ def create_manifest_object( organism=organism, dir=dir, ) + logging.debug("Created flatfile") program = ( metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown" ) diff --git a/ena-submission/src/ena_deposition/create_sample.py b/ena-submission/src/ena_deposition/create_sample.py index f685ea472..9b3e1f12f 100644 --- a/ena-submission/src/ena_deposition/create_sample.py +++ b/ena-submission/src/ena_deposition/create_sample.py @@ -67,7 +67,7 @@ def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row: else: continue else: - value = ";".join( + value = "; ".join( [str(metadata) for metadata in loculus_metadata_field_values if metadata] ) if value: diff --git a/ena-submission/test/approved_ena_submission_list_test.json b/ena-submission/test/approved_ena_submission_list_test.json index 4139efce8..17da088e3 100644 --- a/ena-submission/test/approved_ena_submission_list_test.json +++ b/ena-submission/test/approved_ena_submission_list_test.json @@ -15,7 +15,7 @@ "bodyProduct": null, "displayName": "Pakistan/LOC_0001TLY.1/2023-08-26", "foodProduct": null, - "geoLocCity": null, + "geoLocCity": "Rawalpindi", "geoLocSite": null, "hostAgeBin": null, "hostDisease": null, @@ -36,8 +36,8 @@ "passageNumber": null, "travelHistory": null, "anatomicalPart": null, - "geoLocAdmin1": null, - "geoLocAdmin2": null, + "geoLocAdmin1": "Punjab", + "geoLocAdmin2": "Rawalpindi", "geoLocLatitude": null, "geoLocLongitude": null, "geoLocCountry": "Pakistan", diff --git a/ena-submission/test/test_sample_request.xml b/ena-submission/test/test_sample_request.xml index 7c1c63456..4911524a4 100644 --- a/ena-submission/test/test_sample_request.xml +++ b/ena-submission/test/test_sample_request.xml @@ -27,6 +27,10 @@ geographic location (country and/or sea) Pakistan + + geographic location (region and locality) + Punjab; Rawalpindi; Rawalpindi + host health state Hospital care required diff --git a/ingest/scripts/filter_out_depositions.py b/ingest/scripts/filter_out_depositions.py index 04e288152..b8c49f643 100644 --- a/ingest/scripts/filter_out_depositions.py +++ b/ingest/scripts/filter_out_depositions.py @@ -72,12 +72,14 @@ def filter_out_depositions( df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False) original_count = len(df) with open(exclude_insdc_accessions, encoding="utf-8") as f: - loculus_insdc_accessions = [line.strip() for line in f] + loculus_insdc_accessions: set = {line.strip().split(".")[0] for line in f} # Remove version with open(exclude_biosample_accessions, encoding="utf-8") as f: loculus_biosample_accessions = [line.strip() for line in f] - filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)] + filtered_df = df[ + ~df["genbankAccession"].str.split(".").str[0].isin(loculus_insdc_accessions) + ] # Filter out all versions of an accession filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)] logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.") filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False) diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip index a94f72b44a34d240c665de30ef55def558eb7a2a..a1e627611b04248e2f9c289d3342dfee394452b7 100644 GIT binary patch literal 2127 zcmai#X*kqt8^Hg~LPmtcVUQ&wLiVwYn1(|Qbxf!zdzkFY*tatF-Jp`KF~*vb$TFcJ z&TB7QXe1mLS*FI}l~OsT(}(wR)p?)ix|ip}{d?|jKb#pD0t48mH^t26Tk}2f0OEkp zRcChx7pFicf@`441uHH9Ty4mOXGc&dH*f&5eEjLLq{a)jlTJCB4!Yt&anR1?9) z$E(nlS}l*?DvtX62c}q(X6F*JmUjCd@l2N-TgNaG zBN@4a6G9Ug4Ilhsl1zKU?`GX&%M!Y)NY}|YdOD^|Pt{2z4(5WEgAOLqt0OumKWJ*` z^uxz%y7G5N4sllx94ij?QSvjEZCjuqZgNe%BDpJzB$N_k0d z95d*_&XVxfjYxm+@{lm7HXy&q%Y^CAgN*%9(~u2PAq>2tHFIY4&i|@aDeXqJxiM~o zX^takYAOB|a?aX>*Z-zAiSP-7N+NPJ69I>#cK|6$6^oXlumQa9$?(mrZ)~lx> z4}-fsNl1~CV5M2!gR4%BUgaw`SlI=_0rzQ%mp6%zs9}41=6lU&Al3^yFO`vj?k=l;6NT#$6j5*RptxsdZZoz8fFB zf(VYJZrzbCr##lVWFDBx-IzJ_j{e_Uy@38-BjB+YT@tfUyoSwNVAm?g&oJLJOi>m3 z6;(fT4`QT3(qz`<7Ue@#aEl!`iwz!jy`C)!i`pE;@{}2*5ZmW8pSdTDMgtYHP78Ud z`Ew1|Q%5DHr1JTZlDremv#wn6wT_Y6n{C@_raU>4v<_Z~rk_$tDL$wQq691IdVStQ zC2aK`R`?Tvb&8hlX6D%)=JbWv!t#kZ<+iO{<~owkM^=qdf9!>zpZj1*9GSE4wRoY{ z*rqtxXp|Vf?r^pvW4Djc4u)o~z(c1{h~_L^+!_VwWLjUf;8%KNTQ+9E?|HH`edLVm z&1dSAQO_#DdXFgmXY@wjf+Y$5h5ScD#v7_N##1ms@{GF%>y}l2;col{fcE52zEVqH@y_H+wl^=)oAI$u2HD~@%O&Y` z#W$)L#ivI9RAkY^F+#R$9>#Af6v>hxZIvuuaRL4ZvK}0vbrnLJo4nPiF&^7nge1Fg^J&<+SnHjr%W%@uxyMeg!SWhu265$K{-sUx*;Z2I94*f#p77sE~*D zdRAs6Qbz_O9jbs>M=}GGO2#p=mgU=w)Xv|``ngUl%Y;EvTP~Y2jq<$;OhK1kKbP*h z>zzz}`TIkj5r%^R>Z?p#Imwo@8ue1mR^#DgNRvV-;nK$OjcEY%h^k& z_ejaarX5mO@981eW@B(NXe_jBXGi|5T`4gQrmpZd;vTtlj{5YyPUWhokc(B;d5j4Z zzSq2{eLzIrike9whsoU+gF6S3hK=8prw&X=m!_sEtJ8Yk9xY9c4zv3<;&2hOyJxop zAPMl#G%503n0x>e!*2IU6Z(%d_j%~wkOIK}B2Vu}ei!J!aet=If4F`g-xmrlJ@scY!}0&Cuio(d6`mq=XN? zK4BmD!#V;Qm?lUts|!do7Aguf|LB(NVCqv;bPQ1QIL@Q+%q{p(hk&^6x${@Obu>D%NvT0i~tQROxVDGm1~ZX=CfrqfEw8 zQ$_daetiDea?i_!3O)Nu%pN4}J(6@*weHA{zzG^1pGDWEoSQypr2;?CPp(%z9yNze z8(tr|<@D4{r9>&*@k>Uds$htDlMQQ;;*OT)&q3^!9@ar-@!>JioICv_RA1HU8yt3e z*0lad4CApv5lb$eyUm=36+#sL&Ji(DW?KD@Gx`4IGY@pqLpBR#|MNE1Sz$E0;?~ZM z2SlfS>FS;oBIRRR^k?SzudYJ+H)cIA@w<|sK9Rlf*Nu(xn^M2XY)ihZmC94(CL}R+ zk=j{-{6*K7q#WyRN)almzR-6=qf>Z8(f&^1>%RY7I`f+sHf?A1YeJFJ9;*B*Y z?wck`S{@3$=Wrzprb}`*_f^s49|6#_m?H2Z(O#!hId-lhP{*m*#X=dS+)vl=!qAMa* zy0tf?{KD#+yY8khoNLu7E5EWftz~6~?zyYNcON#rUA(!vU~j6CgLPb0OO&OWs<_f| z?p5m^S~f~AsFi)7&0F*HZJGS)%C@UKtN+?Z>VBwszkKqo*9u?$Y-z7k4QkF>S5&lD zack4G7SB2$J&Ehs`j2f_@;)%_VP)oDqITkWQwQ(Lsjm-Bn5DQiU(NJBi2kp>a^VGc z`?*&FUM<#;knl5}lV~?Z9TATubEmjzt(KW4I5*CF>5S0VvnAQK)n2#x z-l1SVrBrWrd!X*AumucX{bo0oFNu2gM}H67k4obL&+gR2yEC)7EuzD8eGgw2{25Ys zO|ty9>GUb4{(F~eR9shYYvN_KkJCa0dhsuQQ)i<3oD{WifRcudG`VFfTCW3^ZEx)PwoLq5eiO zZc{b84mnZ7v)i4x#hC80MsrLMf3>kHM!>Hu;KX+&_Lt{!b}(L=s?Vi$CGU}<{Licc zJ(ku7D<>SezU8uDTNwLPpP7&H^|^#~b=9{lvHrv%R=jn}E9Y4`v%Q$Y@17StGC#L- z$u++Xyq9jr3t9c+bGq!YjyY8JwZuBf8&OTsqH%$Tw0K3{1-tq;?|;bcB4FROP%!S2 zf6YPjUyR4nSf3aF-GAIqXupHx-bUZ;mizo!Vi}!n^_%0=FB-`Hu>GuhsHW4v=mq=9 z1qV(jC^z@;7#W>7v0(iq#{gw}Pu8V>?YM%kuTWUr)Wh}bp4N*B6Ui^K?2@7>Ark@} z-B{s_zpu1j$b;(Qi|gj8aI-NmfNB^-GXlBp0oBraIO|Sur3|S+aaThF*t%e35@E*O z$^x}JK;W$-h(u`=pfm?S?Jl%>G{75TG_2Ky+^7Myx?tdKqc@OAW*ZD<3CJ*P?IXA) zj%