diff --git a/data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx b/data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx similarity index 80% rename from data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx rename to data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx index 2080089..8ad7f0d 100644 Binary files a/data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx and b/data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx differ diff --git a/data/bioscan_partners.tsv b/data/bioscan_partners.tsv index e05f564..542e20c 100644 --- a/data/bioscan_partners.tsv +++ b/data/bioscan_partners.tsv @@ -54,4 +54,29 @@ YWTP YORKSHIRE WILDLIFE TRUST (POTTERIC CARR) CZRC CHESTER ZOO RECORD CHESTER CEHG UK CENTRE FOR ECOLOGY AND HYDROLOGY (GAIT BARROWS) UPBM UNIVERSITY OF PLYMOUTH -LJMU LIVERPOOL JOHN MOORES UNIVERSITY \ No newline at end of file +LJMU LIVERPOOL JOHN MOORES UNIVERSITY +LUME LOUGHBOROUGH UNIVERSITY MET STATION +LUHW LOUGHBOROUGH UNIVERSITY HOLYWELL WOODS +EPEY ESCRICK PARK ESTATE +EWTA ESSEX WILDLIFE TRUST (ABBOTTS HALL) +DCTC DURELL CONSERVATION TRUST (CAIRNGORMS) +CEFW CORROUR ESTATE FOURT WILLIAM +LWTC LANCASHIRE WILDLIFE TRUST CUTACRE +SNST SANGER NON STANDARD +JARO JARON SANGER +NTMD NATIONAL TRUST (MORTEHOE DEVON) +NTWE NATIONAL TRUST (WIMPOLE ESTATE) +CUPH CAMBRIDGE UNIVERSITY (PETERHOUSE) +NTSS NATIONAL TRUST (SHROPSHIRE & STAFFORDSHIRE) +NTHH NATIONAL TRUST (HOLT HEATH) +HAAU HARPER ADAMS UNIVERSITY +QMRL QUEEN MARY UL RIVERY LABORATORY +NTDB NATIONAL TRUST (DANBURY AND BLAKES) +NTML NATIONAL TRUST (MELFORD) +NTHF NATIONAL TRUST (HATFIELD FOREST) +NTBF NATIONAL TRUST (BRIDGES FARM) +WREN WILD WRENDALE +KNEP KNEPP ESTATE +BCLT BARNES COMMON LIMITED +GCEB GLENLIVET CROWN ESTATE BALLANTRUAN WOOD +HELG HELIGAN ESTATE \ No newline at end of file diff --git a/work/env.yml b/work/env.yml index ced2570..a29bc4b 100644 --- a/work/env.yml +++ b/work/env.yml @@ -1,8 +1,7 @@ name: bioscan_metadata_dev channels: - - bioconda - conda-forge - - defaults + - bioconda dependencies: - python =3.7.6 - geopy =2.1.0 diff --git a/work/fix_biosc.ipynb b/work/fix_biosc.ipynb new file mode 100644 index 0000000..d8252a3 --- /dev/null +++ b/work/fix_biosc.ipynb @@ -0,0 +1,495 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "049f231a", + "metadata": {}, + "outputs": [], + "source": [ + "%run validate_partner_manifest_dev.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "306337d0", + "metadata": {}, + "outputs": [], + "source": [ + "fn = '../results/20241029_bge/BGKU_2024_BIOSCAN_Manifest_V2.0_am60.xlsx'\n", + "df = get_data(fn, sheet='TAB 2 Metadata Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099288c4", + "metadata": {}, + "outputs": [], + "source": [ + "df = fix_date_formats(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c2436ea", + "metadata": {}, + "outputs": [], + "source": [ + "v = infer_bioscan_version(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e704b5e6", + "metadata": {}, + "outputs": [], + "source": [ + "df = validate_series(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14390a88", + "metadata": {}, + "outputs": [], + "source": [ + "df = remove_nonbreaking_spaces(df)\n", + "df = remove_trailing_spaces(df, title='sample')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb63516", + "metadata": {}, + "outputs": [], + "source": [ + "template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx'\n", + "template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc37c20", + "metadata": {}, + "outputs": [], + "source": [ + "check_columns(df, template_df, bioscan_version=v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3abff9dd", + "metadata": {}, + "outputs": [], + "source": [ + "valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ff4b9f7", + "metadata": {}, + "outputs": [], + "source": [ + "df['CATCH_LOT'] = df['CATCH_LOT'].replace('','NOT_APPLICABLE')\n", + "validate_regex('CATCH_LOT', df, na_values=[])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61a4aa37", + "metadata": {}, + "outputs": [], + "source": [ + "contrib_sheet='TAB 1 Contributors'\n", + "contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15631787", + "metadata": {}, + "outputs": [], + "source": [ + "df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].apply(lambda x: x[0] + str(int(x[1:])))\n", + "df['TUBE_OR_WELL_ID'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "610512d4", + "metadata": {}, + "outputs": [], + "source": [ + "df, gal, partner_code = validate_plates_wells(\n", + " df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b57ab15c", + "metadata": {}, + "outputs": [], + "source": [ + "df['ORGANISM_PART'].replace('', 'NOT_APPLICABLE', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c34d73d", + "metadata": {}, + "outputs": [], + "source": [ + "df, is_blank = check_blanks(df, bioscan=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f82ae7c3", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[~is_blank, 'OTHER_INFORMATION'] = df['ORGANISM_PART']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ee8e00d", + "metadata": {}, + "outputs": [], + "source": [ + "df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper().str.replace(',','|')\n", + "df['ORGANISM_PART'].replace({\n", + " 'ENTIRE INDIVIDUAL':'WHOLE_ORGANISM',\n", + " 'ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'IMAGO LEG':'LEG',\n", + " 'MARGINAL PIECE':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'ORGANISM':'WHOLE_ORGANISM',\n", + " 'EXUVIUM':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'IMAGO ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'IMAGO':'WHOLE_ORGANISM',\n", + " 'BODY PART':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'PUPA':'WHOLE_ORGANISM',\n", + " 'PART PUPA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'LEG| ANTENA':'LEG| **OTHER_SOMATIC_ANIMAL_TISSUE**'\n", + "}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8291a4", + "metadata": {}, + "outputs": [], + "source": [ + "validate_values('ORGANISM_PART', df, valid_dict, sep='|')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0593d6e0", + "metadata": {}, + "outputs": [], + "source": [ + "validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56f8a0d1", + "metadata": {}, + "outputs": [], + "source": [ + "validate_values('BOTTLE_DIRECTION', \n", + " df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples\n", + " valid_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d146be4b", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ba7297f", + "metadata": {}, + "outputs": [], + "source": [ + "check_catch_lot_dates(df[~is_blank])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49ce32eb", + "metadata": {}, + "outputs": [], + "source": [ + "df['DECIMAL_LATITUDE'] = df['DECIMAL_LATITUDE'].str.rstrip(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f11c2c18", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4eeec734", + "metadata": {}, + "outputs": [], + "source": [ + "df['DECIMAL_LONGITUDE'] = df['DECIMAL_LONGITUDE'].str.rstrip(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47e4396b", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bac2c168", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6222f03", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eba6116b", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1118e76", + "metadata": {}, + "outputs": [], + "source": [ + "validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d94b301", + "metadata": {}, + "outputs": [], + "source": [ + "validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7d7e222", + "metadata": {}, + "outputs": [], + "source": [ + "compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9210d1e6", + "metadata": {}, + "outputs": [], + "source": [ + "df['PREDICTED_ORDER_OR_GROUP'].replace({\n", + " 'Neotaenioglossa':'Neotaenioglossa',\n", + " 'Basommatophora':'Basommatophora',\n", + " 'Odonáta':'Odonata',\n", + " 'Oligochaeta gen. sp.':'Oligochaeta'\n", + "}, inplace=True)\n", + "df['PREDICTED_FAMILY'].replace({\n", + " 'none':'',\n", + " 'fam.':'',\n", + " 'Glossiphoniidae)':'Glossiphoniidae'\n", + "}, inplace=True)\n", + "df['PREDICTED_GENUS'].replace({\n", + " 'none':'',\n", + " 'gen.':'',\n", + " 'genus':''\n", + "}, inplace=True)\n", + "df['PREDICTED_SCIENTIFIC_NAME'].replace({\n", + " 'sp':'',\n", + " 'sp.':'',\n", + "}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d0292de", + "metadata": {}, + "outputs": [], + "source": [ + "df['PREDICTED_SCIENTIFIC_NAME'] = df['PREDICTED_SCIENTIFIC_NAME'].str.replace('sp ','sp_')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e3a8169", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[\n", + " (df['PREDICTED_SCIENTIFIC_NAME'] != '') & ~df['PREDICTED_SCIENTIFIC_NAME'].str.contains(' '),\n", + " 'PREDICTED_SCIENTIFIC_NAME'\n", + "] = df['PREDICTED_GENUS'] + ' ' + df['PREDICTED_SCIENTIFIC_NAME']\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb9dab13", + "metadata": {}, + "outputs": [], + "source": [ + "df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6408bf17", + "metadata": {}, + "outputs": [], + "source": [ + "df['SEX'] = df['SEX'].str.upper()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bb2db80", + "metadata": {}, + "outputs": [], + "source": [ + "validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])\n", + "validate_specimen_id_risk(df)\n", + "validate_values('LIFESTAGE', df[~is_blank], valid_dict, na_values=[''])\n", + "validate_values('SEX', df[~is_blank], valid_dict, na_values=[''])\n", + "validate_values('SORTING_SOLUTION_USED', df[~is_blank], valid_dict, na_values=[''])\n", + "validate_values('CATCH_BOTTLE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n", + "validate_values('PLATE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n", + "# white cols - validated for all samples\n", + "validate_freetext('MORPHOSPECIES_DESCRIPTION', df)\n", + "validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df)\n", + "validate_freetext('HABITAT', df)\n", + "validate_freetext('PRESERVATION_APPROACH', df)\n", + "# TODO check if STS will need something here\n", + "validate_freetext('COLLECTOR_SAMPLE_ID', df)\n", + "validate_freetext('VOUCHER_ID', df)\n", + "validate_regex('ELEVATION', df, na_values=[''])\n", + "validate_freetext('OTHER_INFORMATION', df)\n", + "# validate_freetext('MISC_METADATA', df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7fdc8a7", + "metadata": {}, + "outputs": [], + "source": [ + "validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1cec8e8", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_excel('../results/20241029_bge/BGKU_2024_patched.xlsx', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d823de22", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/work/sts_concat.ipynb b/work/sts_concat.ipynb new file mode 100644 index 0000000..e68ad4a --- /dev/null +++ b/work/sts_concat.ipynb @@ -0,0 +1,1145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "437e7b79", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import glob\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "296aaf90", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', 500)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3581d1d0", + "metadata": {}, + "outputs": [], + "source": [ + "sts_dir = '/Users/am60/Downloads/STS manifests/Uploaded to STS'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "440a59d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "112" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_fns = glob.glob(sts_dir + '/*.xlsx')\n", + "sts_dfs = []\n", + "for sts_fn in sts_fns:\n", + " try:\n", + " sdf = pd.read_excel(sts_fn, sheet_name='Metadata Entry')\n", + " sdf['FILENAME'] = sts_fn.split('/')[-1]\n", + " sts_dfs.append(sdf)\n", + " except:\n", + " print(f'{sts_fn} is not an STS manifest, skipping')\n", + " \n", + "len(sts_dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "16919f21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(190752, 53)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_df = pd.concat(sts_dfs)\n", + "sts_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6d5470ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['SERIES', 'CATCH_LOT', 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID',\n", + " 'ORGANISM_PART', 'PRESERVATIVE_SOLUTION', 'CATCH_SOLUTION',\n", + " 'BOTTLE_DIRECTION', 'DATE_OF_COLLECTION', 'COUNTRY_OF_COLLECTION',\n", + " 'COLLECTION_LOCATION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE',\n", + " 'WHAT_3_WORDS', 'TIME_OF_COLLECTION', 'DURATION_OF_COLLECTION',\n", + " 'COLLECTION_METHOD', 'DATE_OF_PLATING', 'PREDICTED_ORDER_OR_GROUP',\n", + " 'PREDICTED_FAMILY', 'PREDICTED_GENUS', 'PREDICTED_SCIENTIFIC_NAME',\n", + " 'SPECIMEN_IDENTITY_RISK', 'LIFESTAGE', 'SEX', 'SORTING_SOLUTION_USED',\n", + " 'CATCH_BOTTLE_TEMPERATURE_STORAGE', 'PLATE_TEMPERATURE_STORAGE',\n", + " 'AMOUNT_OF_CATCH_PLATED', 'MORPHOSPECIES_DESCRIPTION',\n", + " 'DESCRIPTION_OF_COLLECTION_METHOD', 'HABITAT', 'PRESERVATION_APPROACH',\n", + " 'COLLECTOR_SAMPLE_ID', 'VOUCHER_ID', 'ELEVATION', 'OTHER_INFORMATION',\n", + " 'IDENTIFIED_BY', 'SPECIMEN_ID', 'SCIENTIFIC_NAME', 'TAXON_ID', 'GAL',\n", + " 'SYMBIONT', 'REGULATORY_COMPLIANCE', 'HAZARD_GROUP', 'CONTRIBUTORS',\n", + " 'FILENAME', 'WEATHER', 'IDENTIFIER_AFFILIATION', 'MISC_METADATA',\n", + " 'SPECIMEN_ID.1', 'PREDICTED_TAXON_ID', 'IDENTIFICATION_RISK'],\n", + " dtype='object')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c1511c58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_df.SPECIMEN_ID.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2aefdd26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_df.SPECIMEN_ID.isna().any()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d439223a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SERIESCATCH_LOTRACK_OR_PLATE_IDTUBE_OR_WELL_IDORGANISM_PARTPRESERVATIVE_SOLUTIONCATCH_SOLUTIONBOTTLE_DIRECTIONDATE_OF_COLLECTIONCOUNTRY_OF_COLLECTIONCOLLECTION_LOCATIONDECIMAL_LATITUDEDECIMAL_LONGITUDEWHAT_3_WORDSTIME_OF_COLLECTIONDURATION_OF_COLLECTIONCOLLECTION_METHODDATE_OF_PLATINGPREDICTED_ORDER_OR_GROUPPREDICTED_FAMILYPREDICTED_GENUSPREDICTED_SCIENTIFIC_NAMESPECIMEN_IDENTITY_RISKLIFESTAGESEXSORTING_SOLUTION_USEDCATCH_BOTTLE_TEMPERATURE_STORAGEPLATE_TEMPERATURE_STORAGEAMOUNT_OF_CATCH_PLATEDMORPHOSPECIES_DESCRIPTIONDESCRIPTION_OF_COLLECTION_METHODHABITATPRESERVATION_APPROACHCOLLECTOR_SAMPLE_IDVOUCHER_IDELEVATIONOTHER_INFORMATIONIDENTIFIED_BYSPECIMEN_IDSCIENTIFIC_NAMETAXON_IDGALSYMBIONTREGULATORY_COMPLIANCEHAZARD_GROUPCONTRIBUTORSFILENAMEWEATHERIDENTIFIER_AFFILIATIONMISC_METADATASPECIMEN_ID.1PREDICTED_TAXON_IDIDENTIFICATION_RISK
01C001FMOZZ00000609AA1LEG100%_ETHANOL100%_ETHANOLN2021-05-27 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:04:00P1DTMALAISE_TRAP2021-06-09 00:00:00HymenopteraApidaeBombusBombus pascuorumNaNADULTFEMALENROOM_TEMPERATUREFRIDGENaNNaNdirectionalHorticulturalNaNC001-DWG-N|NBGW-001-ANaNNaNNaNNaNMOZZ00000609A_A1unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNMOZZ00000609A_A1NaNNaN
12C001FMOZZ00000609AB1WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLN2021-05-27 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:04:00P1DTMALAISE_TRAP2021-06-09 00:00:00DipteraSyrphidaeNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHorticulturalNaNC001-DWG-N|NBGW-001-ANaNNaNNaNNaNMOZZ00000609A_B1unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNMOZZ00000609A_B1NaNNaN
23C001FMOZZ00000609AC1WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLN2021-05-27 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:04:00P1DTMALAISE_TRAP2021-06-09 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHorticulturalNaNC001-DWG-N|NBGW-001-ANaNNaNNaNNaNMOZZ00000609A_C1unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNMOZZ00000609A_C1NaNNaN
34C001FMOZZ00000609AD1WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLN2021-05-27 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:04:00P1DTMALAISE_TRAP2021-06-09 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHorticulturalNaNC001-DWG-N|NBGW-001-ANaNNaNNaNNaNMOZZ00000609A_D1unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNMOZZ00000609A_D1NaNNaN
45C001FMOZZ00000609AE1WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLN2021-05-27 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:04:00P1DTMALAISE_TRAP2021-06-09 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHorticulturalNaNC001-DWG-N|NBGW-001-ANaNNaNNaNNaNMOZZ00000609A_E1unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNMOZZ00000609A_E1NaNNaN
..................................................................................................................................................................
29712972C052FNBGW_011D12WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLS2022-04-29 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:05:00P1DTMALAISE_TRAP2022-05-06 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHORTICULTURALNaNC012-DWG-SNaNNaNNaNNaNNBGW_011_D12unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNNBGW_011_D12NaNNaN
29722973C052FNBGW_011E12WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLS2022-04-29 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:05:00P1DTMALAISE_TRAP2022-05-06 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHORTICULTURALNaNC012-DWG-SNaNNaNNaNNaNNBGW_011_E12unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNNBGW_011_E12NaNNaN
29732974C052FNBGW_011F12WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLS2022-04-29 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:05:00P1DTMALAISE_TRAP2022-05-06 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHORTICULTURALNaNC012-DWG-SNaNNaNNaNNaNNBGW_011_F12unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNNBGW_011_F12NaNNaN
29742975C052FNBGW_011G12WHOLE_ORGANISM100%_ETHANOL100%_ETHANOLS2022-04-29 00:00:00UNITED KINGDOMUNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W...51.842248-4.150253///inclined.lists.crossings09:05:00P1DTMALAISE_TRAP2022-05-06 00:00:00DipteraNaNNaNNaNNaNADULTNaNNROOM_TEMPERATUREFRIDGENaNNaNdirectionalHORTICULTURALNaNC012-DWG-SNaNNaNNaNNaNNBGW_011_G12unidentified32644National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNNBGW_011_G12NaNNaN
29752976NOT_APPLICABLENBGW_011H12NOT_APPLICABLE100%_ETHANOL100%_ETHANOLNaNNaTNaNNaNNaNNaNNaNNaNNaNNaNNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNBGW_011_H12blank sample2582415National Botanic Garden of WalesTARGETYHG1NaNNBGW-2206-Manifest-V2.0.xlsxNaNNaNNaNNBGW_011_H12NaNNaN
\n", + "

2976 rows × 53 columns

\n", + "
" + ], + "text/plain": [ + " SERIES CATCH_LOT RACK_OR_PLATE_ID TUBE_OR_WELL_ID ORGANISM_PART \\\n", + "0 1 C001F MOZZ00000609A A1 LEG \n", + "1 2 C001F MOZZ00000609A B1 WHOLE_ORGANISM \n", + "2 3 C001F MOZZ00000609A C1 WHOLE_ORGANISM \n", + "3 4 C001F MOZZ00000609A D1 WHOLE_ORGANISM \n", + "4 5 C001F MOZZ00000609A E1 WHOLE_ORGANISM \n", + "... ... ... ... ... ... \n", + "2971 2972 C052F NBGW_011 D12 WHOLE_ORGANISM \n", + "2972 2973 C052F NBGW_011 E12 WHOLE_ORGANISM \n", + "2973 2974 C052F NBGW_011 F12 WHOLE_ORGANISM \n", + "2974 2975 C052F NBGW_011 G12 WHOLE_ORGANISM \n", + "2975 2976 NOT_APPLICABLE NBGW_011 H12 NOT_APPLICABLE \n", + "\n", + " PRESERVATIVE_SOLUTION CATCH_SOLUTION BOTTLE_DIRECTION \\\n", + "0 100%_ETHANOL 100%_ETHANOL N \n", + "1 100%_ETHANOL 100%_ETHANOL N \n", + "2 100%_ETHANOL 100%_ETHANOL N \n", + "3 100%_ETHANOL 100%_ETHANOL N \n", + "4 100%_ETHANOL 100%_ETHANOL N \n", + "... ... ... ... \n", + "2971 100%_ETHANOL 100%_ETHANOL S \n", + "2972 100%_ETHANOL 100%_ETHANOL S \n", + "2973 100%_ETHANOL 100%_ETHANOL S \n", + "2974 100%_ETHANOL 100%_ETHANOL S \n", + "2975 100%_ETHANOL 100%_ETHANOL NaN \n", + "\n", + " DATE_OF_COLLECTION COUNTRY_OF_COLLECTION \\\n", + "0 2021-05-27 00:00:00 UNITED KINGDOM \n", + "1 2021-05-27 00:00:00 UNITED KINGDOM \n", + "2 2021-05-27 00:00:00 UNITED KINGDOM \n", + "3 2021-05-27 00:00:00 UNITED KINGDOM \n", + "4 2021-05-27 00:00:00 UNITED KINGDOM \n", + "... ... ... \n", + "2971 2022-04-29 00:00:00 UNITED KINGDOM \n", + "2972 2022-04-29 00:00:00 UNITED KINGDOM \n", + "2973 2022-04-29 00:00:00 UNITED KINGDOM \n", + "2974 2022-04-29 00:00:00 UNITED KINGDOM \n", + "2975 NaT NaN \n", + "\n", + " COLLECTION_LOCATION DECIMAL_LATITUDE \\\n", + "0 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "1 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "2 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "3 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "4 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "... ... ... \n", + "2971 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "2972 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "2973 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "2974 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n", + "2975 NaN NaN \n", + "\n", + " DECIMAL_LONGITUDE WHAT_3_WORDS TIME_OF_COLLECTION \\\n", + "0 -4.150253 ///inclined.lists.crossings 09:04:00 \n", + "1 -4.150253 ///inclined.lists.crossings 09:04:00 \n", + "2 -4.150253 ///inclined.lists.crossings 09:04:00 \n", + "3 -4.150253 ///inclined.lists.crossings 09:04:00 \n", + "4 -4.150253 ///inclined.lists.crossings 09:04:00 \n", + "... ... ... ... \n", + "2971 -4.150253 ///inclined.lists.crossings 09:05:00 \n", + "2972 -4.150253 ///inclined.lists.crossings 09:05:00 \n", + "2973 -4.150253 ///inclined.lists.crossings 09:05:00 \n", + "2974 -4.150253 ///inclined.lists.crossings 09:05:00 \n", + "2975 NaN NaN NaN \n", + "\n", + " DURATION_OF_COLLECTION COLLECTION_METHOD DATE_OF_PLATING \\\n", + "0 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n", + "1 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n", + "2 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n", + "3 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n", + "4 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n", + "... ... ... ... \n", + "2971 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n", + "2972 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n", + "2973 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n", + "2974 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n", + "2975 NaN NaN NaT \n", + "\n", + " PREDICTED_ORDER_OR_GROUP PREDICTED_FAMILY PREDICTED_GENUS \\\n", + "0 Hymenoptera Apidae Bombus \n", + "1 Diptera Syrphidae NaN \n", + "2 Diptera NaN NaN \n", + "3 Diptera NaN NaN \n", + "4 Diptera NaN NaN \n", + "... ... ... ... \n", + "2971 Diptera NaN NaN \n", + "2972 Diptera NaN NaN \n", + "2973 Diptera NaN NaN \n", + "2974 Diptera NaN NaN \n", + "2975 NaN NaN NaN \n", + "\n", + " PREDICTED_SCIENTIFIC_NAME SPECIMEN_IDENTITY_RISK LIFESTAGE SEX \\\n", + "0 Bombus pascuorum NaN ADULT FEMALE \n", + "1 NaN NaN ADULT NaN \n", + "2 NaN NaN ADULT NaN \n", + "3 NaN NaN ADULT NaN \n", + "4 NaN NaN ADULT NaN \n", + "... ... ... ... ... \n", + "2971 NaN NaN ADULT NaN \n", + "2972 NaN NaN ADULT NaN \n", + "2973 NaN NaN ADULT NaN \n", + "2974 NaN NaN ADULT NaN \n", + "2975 NaN NaN NaN NaN \n", + "\n", + " SORTING_SOLUTION_USED CATCH_BOTTLE_TEMPERATURE_STORAGE \\\n", + "0 N ROOM_TEMPERATURE \n", + "1 N ROOM_TEMPERATURE \n", + "2 N ROOM_TEMPERATURE \n", + "3 N ROOM_TEMPERATURE \n", + "4 N ROOM_TEMPERATURE \n", + "... ... ... \n", + "2971 N ROOM_TEMPERATURE \n", + "2972 N ROOM_TEMPERATURE \n", + "2973 N ROOM_TEMPERATURE \n", + "2974 N ROOM_TEMPERATURE \n", + "2975 NaN NaN \n", + "\n", + " PLATE_TEMPERATURE_STORAGE AMOUNT_OF_CATCH_PLATED \\\n", + "0 FRIDGE NaN \n", + "1 FRIDGE NaN \n", + "2 FRIDGE NaN \n", + "3 FRIDGE NaN \n", + "4 FRIDGE NaN \n", + "... ... ... \n", + "2971 FRIDGE NaN \n", + "2972 FRIDGE NaN \n", + "2973 FRIDGE NaN \n", + "2974 FRIDGE NaN \n", + "2975 NaN NaN \n", + "\n", + " MORPHOSPECIES_DESCRIPTION DESCRIPTION_OF_COLLECTION_METHOD \\\n", + "0 NaN directional \n", + "1 NaN directional \n", + "2 NaN directional \n", + "3 NaN directional \n", + "4 NaN directional \n", + "... ... ... \n", + "2971 NaN directional \n", + "2972 NaN directional \n", + "2973 NaN directional \n", + "2974 NaN directional \n", + "2975 NaN NaN \n", + "\n", + " HABITAT PRESERVATION_APPROACH COLLECTOR_SAMPLE_ID VOUCHER_ID \\\n", + "0 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n", + "1 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n", + "2 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n", + "3 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n", + "4 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n", + "... ... ... ... ... \n", + "2971 HORTICULTURAL NaN C012-DWG-S NaN \n", + "2972 HORTICULTURAL NaN C012-DWG-S NaN \n", + "2973 HORTICULTURAL NaN C012-DWG-S NaN \n", + "2974 HORTICULTURAL NaN C012-DWG-S NaN \n", + "2975 NaN NaN NaN NaN \n", + "\n", + " ELEVATION OTHER_INFORMATION IDENTIFIED_BY SPECIMEN_ID \\\n", + "0 NaN NaN NaN MOZZ00000609A_A1 \n", + "1 NaN NaN NaN MOZZ00000609A_B1 \n", + "2 NaN NaN NaN MOZZ00000609A_C1 \n", + "3 NaN NaN NaN MOZZ00000609A_D1 \n", + "4 NaN NaN NaN MOZZ00000609A_E1 \n", + "... ... ... ... ... \n", + "2971 NaN NaN NaN NBGW_011_D12 \n", + "2972 NaN NaN NaN NBGW_011_E12 \n", + "2973 NaN NaN NaN NBGW_011_F12 \n", + "2974 NaN NaN NaN NBGW_011_G12 \n", + "2975 NaN NaN NaN NBGW_011_H12 \n", + "\n", + " SCIENTIFIC_NAME TAXON_ID GAL SYMBIONT \\\n", + "0 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "1 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "2 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "3 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "4 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "... ... ... ... ... \n", + "2971 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "2972 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "2973 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "2974 unidentified 32644 National Botanic Garden of Wales TARGET \n", + "2975 blank sample 2582415 National Botanic Garden of Wales TARGET \n", + "\n", + " REGULATORY_COMPLIANCE HAZARD_GROUP CONTRIBUTORS \\\n", + "0 Y HG1 NaN \n", + "1 Y HG1 NaN \n", + "2 Y HG1 NaN \n", + "3 Y HG1 NaN \n", + "4 Y HG1 NaN \n", + "... ... ... ... \n", + "2971 Y HG1 NaN \n", + "2972 Y HG1 NaN \n", + "2973 Y HG1 NaN \n", + "2974 Y HG1 NaN \n", + "2975 Y HG1 NaN \n", + "\n", + " FILENAME WEATHER IDENTIFIER_AFFILIATION \\\n", + "0 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "1 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "2 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "3 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "4 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "... ... ... ... \n", + "2971 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "2972 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "2973 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "2974 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "2975 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n", + "\n", + " MISC_METADATA SPECIMEN_ID.1 PREDICTED_TAXON_ID IDENTIFICATION_RISK \n", + "0 NaN MOZZ00000609A_A1 NaN NaN \n", + "1 NaN MOZZ00000609A_B1 NaN NaN \n", + "2 NaN MOZZ00000609A_C1 NaN NaN \n", + "3 NaN MOZZ00000609A_D1 NaN NaN \n", + "4 NaN MOZZ00000609A_E1 NaN NaN \n", + "... ... ... ... ... \n", + "2971 NaN NBGW_011_D12 NaN NaN \n", + "2972 NaN NBGW_011_E12 NaN NaN \n", + "2973 NaN NBGW_011_F12 NaN NaN \n", + "2974 NaN NBGW_011_G12 NaN NaN \n", + "2975 NaN NBGW_011_H12 NaN NaN \n", + "\n", + "[2976 rows x 53 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sts_df[~sts_df[\"SPECIMEN_ID.1\"].isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c4b421d5", + "metadata": {}, + "outputs": [], + "source": [ + "sts_df.to_csv('sts_concat.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fc50384", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/work/validate_anospp.ipynb b/work/validate_anospp.ipynb index 6ee15b3..49d6c36 100644 --- a/work/validate_anospp.ipynb +++ b/work/validate_anospp.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx', \n", + "def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx', \n", " verbose=False, samples_sheet='TAB 2 Metadata Entry',\n", " contrib_sheet='TAB 1 Contributors', write_sts=True):\n", " '''\n", @@ -46,7 +46,7 @@ " df = validate_series(df)\n", " # clean up data\n", " df = remove_nonbreaking_spaces(df)\n", - " df = remove_trailing_spaces(df)\n", + " df = remove_trailing_spaces(df, title='sample')\n", " \n", " # read NCBI taxonomy\n", " ncbi = ete3.NCBITaxa()\n", @@ -121,10 +121,7 @@ " \n", " print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n", " \n", - " return df\n", - "\n", - "fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n", - "template_fn = fn" + " return df" ] }, { @@ -134,6 +131,8 @@ "metadata": {}, "outputs": [], "source": [ + "fn = '../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx'\n", + "template_fn = fn\n", "df = validate_anospp(fn, template_fn, verbose=True, samples_sheet='TAB 3 TEST Metadata Entry')" ] }, @@ -162,7 +161,7 @@ "metadata": {}, "outputs": [], "source": [ - "fn = '../results/20231019_a_poal_adad/Anopheles_Metadata_Manifest_V4.0_POAL_101623_am60.xlsx'\n", + "fn = '../results/20240819_mg/MANIFEST_MOSQUITOES_MADAGASCAR_am60_2.xlsx'\n", "df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors')" ] }, diff --git a/work/validate_bioscan.ipynb b/work/validate_bioscan.ipynb index 77c35df..a89a133 100644 --- a/work/validate_bioscan.ipynb +++ b/work/validate_bioscan.ipynb @@ -44,7 +44,7 @@ " df = validate_series(df)\n", " # clean up data\n", " df = remove_nonbreaking_spaces(df)\n", - " df = remove_trailing_spaces(df)\n", + " df = remove_trailing_spaces(df, title='sample')\n", " \n", " # read NCBI taxonomy\n", " ncbi = ete3.NCBITaxa()\n", @@ -83,10 +83,18 @@ " if v == 'v3':\n", " validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)\n", " df = strip_asterisks('CATCH_SOLUTION', df)\n", - " validate_values('BOTTLE_DIRECTION', \n", - " df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples\n", - " valid_dict)\n", - " validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n", + " is_malaise = (~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP'))\n", + " is_other = (~is_blank & (df['COLLECTION_METHOD'] != 'MALAISE_TRAP'))\n", + " validate_values('BOTTLE_DIRECTION', df[is_malaise], valid_dict, \n", + " extra_msg=' for malaise trap samples')\n", + " # allow only empty for non-malaise trap - weird capture of bottle direction filled and collection method empty\n", + " validate_values('BOTTLE_DIRECTION', df[is_other], {'BOTTLE_DIRECTION':['']}, \n", + " extra_msg=' for non-malaise trap samples')\n", + " # no missing datex expected for malaise\n", + " validate_regex('DATE_OF_COLLECTION', df[is_malaise], \n", + " extra_msg=' for malaise trap samples')\n", + " validate_regex('DATE_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED'],\n", + " extra_msg=' for non-malaise trap samples')\n", " check_catch_lot_dates(df[~is_blank])\n", " validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])\n", " validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])\n", @@ -96,11 +104,23 @@ " \n", " # purple cols - valiated for non-blank samples\n", " validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n", - " validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", - " validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " # no missing times expected for malaise\n", + " validate_regex('TIME_OF_COLLECTION', df[is_malaise],\n", + " extra_msg=' for malaise trap samples') \n", + " validate_regex('TIME_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],\n", + " extra_msg=' for non-malaise trap samples') \n", + " # no missing durations for malaise\n", + " validate_regex('DURATION_OF_COLLECTION', df[is_malaise],\n", + " extra_msg=' for malaise trap samples')\n", + " validate_regex('DURATION_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],\n", + " extra_msg=' for non-malaise trap samples')\n", " validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])\n", " df = strip_asterisks('COLLECTION_METHOD', df)\n", - " validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n", + " # no missing plating dates for malaise\n", + " validate_regex('DATE_OF_PLATING', df[is_malaise],\n", + " extra_msg=' for malaise trap samples')\n", + " validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''],\n", + " extra_msg=' for non-malaise trap samples')\n", " compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])\n", " # taxonomy validation adds taxid columns to original dataframe - skipping for now\n", " df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])\n", @@ -123,7 +143,7 @@ " validate_freetext('VOUCHER_ID', df)\n", " validate_regex('ELEVATION', df, na_values=[''])\n", " validate_freetext('OTHER_INFORMATION', df)\n", - " validate_freetext('MISC_METADATA', df)\n", + " # validate_freetext('MISC_METADATA', df)\n", " validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])\n", " \n", " df = expand_plate_only(df)\n", @@ -186,7 +206,7 @@ }, "outputs": [], "source": [ - "fn = '../results/20240304_shap_test/SHAP_2401_BIOSCAN_Manifest_V3.xlsx'\n", + "fn = '../results/20241108_dev_033/NEWI_2408_BIOSCAN_Manifest_V3.xlsx'\n", "df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)" ] }, @@ -197,7 +217,8 @@ "metadata": {}, "outputs": [], "source": [ - "fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Yulia_Guglia_Diptera_12_plates.xlsx'\n", + "%run validate_partner_manifest_dev.ipynb\n", + "fn = '../results/20241104_bgeg/BGE_2024_edit.xlsx'\n", "df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n", " verbose=False, bold_input=True)" ] @@ -205,7 +226,7 @@ { "cell_type": "code", "execution_count": null, - "id": "40a9e9aa", + "id": "d3e12c2f", "metadata": {}, "outputs": [], "source": [] diff --git a/work/validate_partner_manifest_dev.ipynb b/work/validate_partner_manifest_dev.ipynb index 351f324..a63d7b5 100644 --- a/work/validate_partner_manifest_dev.ipynb +++ b/work/validate_partner_manifest_dev.ipynb @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "VALIDATION_VERSION = '0.3.2'\n", + "VALIDATION_VERSION = '0.3.3'\n", "ANOSPP_VERSION = '4.0'\n", "# V2.0, but V3 in SOP\n", "BIOSCAN_VERSION = '3'" @@ -70,7 +70,7 @@ "outputs": [], "source": [ "anospp_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n", - "biosc_fn = '../data/BIOSCAN_Manifest_V3_20230818.xlsx'" + "biosc_fn = '../data/BIOSCAN_Manifest_V3_20240301.xlsx'" ] }, { @@ -113,8 +113,15 @@ " \n", " logging.info(f'parsing bold manifest from \"{fn}\" sheet \"{sheet}\"')\n", " \n", + " # second BGE manifest header\n", " df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n", - " sheet_name=sheet, header=3).iloc[3:]\n", + " sheet_name=sheet, header=1).iloc[1:]\n", + " if 'Plate ID' not in df.columns:\n", + " # first BGE manifest header\n", + " df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n", + " sheet_name=sheet, header=3).iloc[3:]\n", + " if ('Plate ID' not in df.columns) or (df['Well Position'].iloc[0] != 'A01'):\n", + " raise ValueError('could not parse bold manifest')\n", " \n", " df.index = (i + 1 for i in range(df.shape[0]))\n", " df.index.name = 'SERIES'\n", @@ -207,8 +214,6 @@ " \n", " df.rename(columns=col_mapping, inplace=True)\n", " \n", - " \n", - " \n", " # reorder columns, add missing, remove \"LEAVE BLANK\" values\n", " for col in template_df.columns:\n", " if col not in df.columns:\n", @@ -219,50 +224,115 @@ " }, inplace=True)\n", " df = df[template_df.columns]\n", " \n", - " # contents adjustments\n", + " # strip leading zeroes from well ids\n", " df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].str.replace(\n", " '([A-H])0', r'\\1', regex=True\n", " )\n", + " \n", + " # blanks handling\n", + " for i, r in df[df['ORGANISM_PART'] == ''].iterrows():\n", + " df.loc[i, 'ORGANISM_PART'] = 'NOT_APPLICABLE'\n", + " if df.loc[i, 'TUBE_OR_WELL_ID'] == '':\n", + " # originally we wanted to backfill well IDs, \n", + " # but BGEP has varying order of wells in the manifest\n", + " # thus not handling for now\n", + " raise ValueError(f'Cannot handle empty well ids for blanks, row {i+1} - please fill')\n", + " elif r['TUBE_OR_WELL_ID'] == 'A1':\n", + " raise ValueError(f'Cannot handle blanks at A1, row {i+1}')\n", + " df.loc[i, 'RACK_OR_PLATE_ID'] = df.loc[i-1, 'RACK_OR_PLATE_ID']\n", + " df.loc[i, 'PRESERVATIVE_SOLUTION'] = df.loc[i-1, 'PRESERVATIVE_SOLUTION']\n", + "\n", + " # sts manifest is ordered as A1,B1\n", + " sts_well_ids = []\n", + " for col in range(1,13):\n", + " for row in 'ABCDEFGH':\n", + " sts_well_ids.append(f'{row}{col}')\n", + " well_order_cat = pd.CategoricalDtype(sts_well_ids, ordered=True)\n", + " df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].astype(well_order_cat)\n", + " plate_order_cat = pd.CategoricalDtype(df.RACK_OR_PLATE_ID.drop_duplicates(), ordered=True)\n", + " df['RACK_OR_PLATE_ID'] = df['RACK_OR_PLATE_ID'].astype(plate_order_cat)\n", + " df = df.sort_values(by=['RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID'])\n", + " for col in ['RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID']:\n", + " df[col] = df[col].astype(str)\n", + " # reset index again\n", + " df.index = (i + 1 for i in range(df.shape[0]))\n", + " df.index.name = 'SERIES'\n", + " \n", + " # other contents adjustments\n", " df['SEX'] = df['SEX'].str.upper()\n", " df['LIFESTAGE'] = df['LIFESTAGE'].str.upper()\n", - " df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper().str.replace(', ', ' | ')\n", + " df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper() \\\n", + " .str.replace(', ', ' | ').str.strip()\n", " df['ORGANISM_PART'].replace({\n", + " 'LEGS':'LEG',\n", + " 'WINGS':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'WING':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'ABDOMEN | LEGS':'ABDOMEN | LEG',\n", + " 'HEAD | LEGS':'HEAD | LEG',\n", + " 'BODY':'WHOLE_ORGANISM',\n", + " 'WHOLE':'WHOLE_ORGANISM',\n", + " 'WHOLE BODY':'WHOLE_ORGANISM',\n", + " 'WHOLE ORGANISM':'WHOLE_ORGANISM',\n", " 'ORGANISM':'WHOLE_ORGANISM',\n", " 'WHOLE SPECIMEN':'WHOLE_ORGANISM',\n", - " 'WING':'OTHER_SOMATIC_TISSUE'\n", + " 'WHOLE_SPECIMEN':'WHOLE_ORGANISM',\n", + " 'SOMATIC PART':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'TAIL':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'ANTENNA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'HALF_SPECIMEN':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'PIECE OF BODY':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", + " 'MUSCLE TISSUE':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n", " }, inplace=True)\n", " df['PRESERVATIVE_SOLUTION'].replace({\n", + " 'Ethanol':'V%_ETHANOL',\n", " 'ethanol':'V%_ETHANOL',\n", - " 'dry':'NONE'\n", + " 'dry':'NONE',\n", + " '':'NONE' # problem with blanks?\n", " }, inplace=True)\n", - " df['DATE_OF_COLLECTION'] = df['DATE_OF_COLLECTION'].str.split('/').str.get(-1)\n", + " df['CATCH_SOLUTION'].replace({\n", + " 'Ethanol':'V%_ETHANOL',\n", + " 'ethanol':'V%_ETHANOL',\n", + " 'dry':'NONE',\n", + " '':'NONE' # problem with blanks?\n", + " }, inplace=True)\n", + " df['DECIMAL_LATITUDE'] = df['DECIMAL_LATITUDE'].replace('', np.nan).astype(float).round(6).astype(str)\n", + " df['DECIMAL_LONGITUDE'] = df['DECIMAL_LONGITUDE'].replace('', np.nan).astype(float).round(6).astype(str)\n", + " # df['DATE_OF_COLLECTION'] = df['DATE_OF_COLLECTION'].str.split('/').str.get(-1)\n", " df['DATE_OF_COLLECTION'] = pd.to_datetime(df['DATE_OF_COLLECTION'], errors='coerce') \\\n", " .dt.strftime('%Y-%m-%d') \\\n", " .fillna('')\n", " species_identified = (df['PREDICTED_SCIENTIFIC_NAME'].str.contains('[a-z]', regex=True))\n", " df.loc[species_identified, 'PREDICTED_SCIENTIFIC_NAME'] = df.loc[species_identified, 'PREDICTED_GENUS'].str.title() + \\\n", " ' ' + df.loc[species_identified, 'PREDICTED_SCIENTIFIC_NAME'].str.lower()\n", - " \n", - " # blanks handling\n", - " for i, r in df[df['ORGANISM_PART'] == ''].iterrows():\n", - " df.loc[i, 'ORGANISM_PART'] = 'NOT_APPLICABLE'\n", - " if r['TUBE_OR_WELL_ID'] == 'A1':\n", - " raise ValueError(f'Cannot handle blanks at A1, row {i+1}')\n", - " df.loc[i, 'RACK_OR_PLATE_ID'] = df.loc[i-1, 'RACK_OR_PLATE_ID']\n", - " df.loc[i, 'PRESERVATIVE_SOLUTION'] = df.loc[i-1, 'PRESERVATIVE_SOLUTION']\n", " \n", " is_blank = (df['ORGANISM_PART'] == 'NOT_APPLICABLE')\n", " \n", " # auto-fill\n", " df['CATCH_LOT'] = 'NOT_APPLICABLE'\n", " df.loc[~is_blank, 'CATCH_SOLUTION'] = df['PRESERVATIVE_SOLUTION']\n", + " df.loc[is_blank, 'CATCH_SOLUTION'] = ''\n", " df.loc[~is_blank, 'BOTTLE_DIRECTION'] = 'NOT_APPLICABLE'\n", - " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.contains('pan trap'), 'COLLECTION_METHOD'] = 'PAN_TRAP'\n", " df.loc[~is_blank, 'AMOUNT_OF_CATCH_PLATED'] = 'NOT_APPLICABLE'\n", " \n", + " # encoding\n", + " \n", + " # encoding for collection method\n", + " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n", + " .str.contains('pan trap'), 'COLLECTION_METHOD'] = 'PAN_TRAP'\n", + " logging.info(f'inferred {df[df.COLLECTION_METHOD == \"PAN_TRAP\"].shape[0]} samples collected with PAN_TRAP')\n", + " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n", + " .str.contains('malaise trap'), 'COLLECTION_METHOD'] = 'MALAISE_TRAP'\n", + " logging.info(f'inferred {df[df.COLLECTION_METHOD == \"MALAISE_TRAP\"].shape[0]} samples collected with MALAISE_TRAP')\n", + " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n", + " .str.contains('sweep net'), 'COLLECTION_METHOD'] = 'AERIAL_NET'\n", + " df.loc[~is_blank & (df.COLLECTION_METHOD == ''), 'COLLECTION_METHOD'] = '**OTHER**'\n", + " \n", + " \n", " return df\n", "# template_df = get_data(biosc_fn, sheet='TAB 2 Metadata Entry')\n", - "# bold_fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Kaliuzhna_Braconidae.xlsx'\n", + "# # bold_fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Kaliuzhna_Braconidae.xlsx'\n", + "# bold_fn = '../results/20240730_bold_bge_poland/BGEP_am60.xlsx'\n", "# df = parse_bold(bold_fn, template_df, 'BGE entry')\n", "# df" ] @@ -402,7 +472,7 @@ "metadata": {}, "outputs": [], "source": [ - "def remove_trailing_spaces(df):\n", + "def remove_trailing_spaces(df, title):\n", " \n", " any_trailing_spaces = False\n", " \n", @@ -415,7 +485,7 @@ " any_trailing_spaces = True\n", " \n", " if any_trailing_spaces:\n", - " logging.info('removed some trailing spaces from the manifest')\n", + " logging.info(f'removed some trailing spaces from the {title} manifest')\n", " \n", " return df\n", "\n", @@ -524,7 +594,7 @@ " df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n", " sheet_name=contrib_sheet)\n", " \n", - " df = remove_trailing_spaces(df)\n", + " df = remove_trailing_spaces(df, title='contributors')\n", " \n", " if 'EMAIL ADDRESS' in df.columns:\n", " logging.warning('replacing \"EMAIL ADDRESS\" with \"EMAIL_ADDRESS\" contributors column name')\n", @@ -575,7 +645,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_regex(col, df, na_values=[]):\n", + "def validate_regex(col, df, na_values=[], extra_msg=''):\n", " \n", " logging.debug(f'validating data format in {col} column')\n", " \n", @@ -596,10 +666,10 @@ " 'CATCH_LOT': (r'^C\\d{3}[A-Z]$|^NOT_APPLICABLE$', \n", " 'like C123A or NOT_APPLICABLE'),\n", " 'DATE_OF_COLLECTION': (date_regex, 'in YYYY-MM-DD format'),\n", - " 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d+)?$', \n", - " 'between -90 and 90'),\n", - " 'DECIMAL_LONGITUDE': (r'^[-+]?(1[0-7]\\d|\\d\\d|\\d)(\\.\\d+)?$', \n", - " 'between -180 and 180'),\n", + " 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d{1,6})?$', \n", + " 'between -90 and 90 with up to 6 decimals'),\n", + " 'DECIMAL_LONGITUDE': (r'^[-+]?(1[0-7]\\d|\\d\\d|\\d)(\\.\\d{1,6})?$', \n", + " 'between -180 and 180 with up to 6 decimals'),\n", " 'WHAT_3_WORDS': (r'^///[a-z]+\\.[a-z]+\\.[a-z]+$', \n", " 'like ///one.two.three'),\n", " 'TIME_OF_COLLECTION': (r'^(?:[01]\\d|2[0-3]):[0-5]\\d$|^(?:[01]\\d|2[0-3]):[0-5]\\d:[0-5]\\d$', \n", @@ -617,7 +687,7 @@ " if not is_valid_regex.all():\n", " offending_values = list(series[~is_valid_regex].unique())\n", " s = index_ranges(series[~is_valid_regex])\n", - " msg = (f'{col} format incorrect for SERIES {s}: found {offending_values} - '\n", + " msg = (f'{col} format{extra_msg} incorrect for SERIES {s}: found {offending_values} - '\n", " f'expected to be {regexs[col][1]}')\n", " if col == 'CATCH_LOT':\n", " logging.warning(msg)\n", @@ -629,7 +699,7 @@ " if not is_recommended_regex.all():\n", " offending_values = list(series[~is_recommended_regex & is_valid_regex].unique())\n", " s = index_ranges(series[~is_recommended_regex & is_valid_regex])\n", - " msg = (f'{col} format suggestion for SERIES {s}: found {offending_values} - '\n", + " msg = (f'{col} format suggestion{extra_msg} for SERIES {s}: found {offending_values} - '\n", " f'we ask to use PT[n]H format (hours only)')\n", " logging.warning(msg)\n", " " @@ -830,6 +900,12 @@ " f'found {set(non_blank_penultimate_well_df.ORGANISM_PART.to_list())}, '\n", " f'these samples will not be sequenced',\n", " )\n", + " \n", + " blank_non_term_df = blank_df[~blank_df.TUBE_OR_WELL_ID.isin(['G12','H12'])]\n", + " if blank_non_term_df.shape[0] > 0:\n", + " logging.warning(\n", + " f'blanks in non-G12/H12 wells at SERIES {index_ranges(blank_non_term_df)}',\n", + " )\n", "\n", " any_excessive_blank_info = False\n", " \n", @@ -865,7 +941,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e'):\n", + "def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e', extra_msg=''):\n", " \n", " logging.debug(f'validating for allowed values in {col} column')\n", " \n", @@ -883,9 +959,10 @@ " col_values = set(series.unique())\n", " # use separator to split values\n", " if sep:\n", + " series = series.apply(lambda v: [x.strip() for x in v.split(sep)])\n", " sep_col_values = list()\n", - " for v in col_values:\n", - " sep_col_values.extend([x.strip() for x in v.split(sep)])\n", + " for v in series:\n", + " sep_col_values.extend(v)\n", " col_values = set(sep_col_values)\n", " \n", " valid_values = set(valid_dict[col])\n", @@ -897,12 +974,11 @@ " if invalid_value == '':\n", " invalid_value_series = index_ranges(series[series == ''])\n", " elif sep:\n", - " # match pipe-separated\n", - " invalid_value_series = index_ranges(series[series.str.contains(\n", - " r'(?:^|\\|)' + invalid_value + r'(?:$|\\|)', regex=True)])\n", + " invalid_mask = series.apply(lambda v: invalid_value in v)\n", + " invalid_value_series = index_ranges(series[invalid_mask])\n", " else:\n", " invalid_value_series = index_ranges(series[series == invalid_value])\n", - " msg = f'invalid value in {col} column, SERIES {invalid_value_series}: \"{invalid_value}\"'\n", + " msg = f'invalid value{extra_msg} in {col} column, SERIES {invalid_value_series}: \"{invalid_value}\"'\n", " if level == 'i':\n", " logging.info(msg)\n", " elif level == 'w':\n", @@ -990,7 +1066,7 @@ "metadata": {}, "outputs": [], "source": [ - "def validate_country_and_coordinates(df, fn, na_values=[], bioscan=False):\n", + "def validate_country_and_coordinates(df, fn, na_values=[''], bioscan=False):\n", " \n", " logging.debug('validating COUNTRY_OF_COLLECTION against DECIMAL_LATITUDE and DECIMAL_LONGITUDE')\n", " \n", @@ -1013,7 +1089,7 @@ " # get location data for coordinates\n", " # use local copy of web query results for re-runs\n", " # this \n", - " loc_fn = fn+'_loc.pkl'\n", + " loc_fn = fn + '_loc.pkl'\n", " if os.path.isfile(loc_fn):\n", " locations = pickle.load(open(loc_fn, \"rb\"))\n", " else:\n", @@ -1027,8 +1103,10 @@ " # pre-fill with unknown country\n", " locations[c] = {'address':{'country':'UNKNOWN'}}\n", " # check coordniate correctness\n", + " coord_list = c.split(', ')\n", + " lat = coord_list[0]\n", + " lon = coord_list[1]\n", " try:\n", - " lat, lon = c.split(', ')\n", " lat, lon = float(lat), float(lon)\n", " except:\n", " unparsed_df = df[(df[lat_col] == str(lat)) & df[lon_col] == str(lon)]\n", @@ -1061,8 +1139,13 @@ " # extract countries from location data\n", " loc_countries = dict()\n", " for coord in locations.keys():\n", - " \n", - " lat, lon = coord.split(', ')\n", + " try:\n", + " lat, lon = coord.split(', ')\n", + " except:\n", + " logging.error(\n", + " f'problem parsing coordinates {coord} in locations results'\n", + " )\n", + " continue\n", " coord_series = index_ranges(df.query(f'({lat_col} == \"{lat}\") & ({lon_col} == \"{lon}\")'))\n", " \n", " coord_country = locations[coord]['address']['country'].upper()\n", @@ -1441,11 +1524,12 @@ " logging.info('dropping IDENTIFIER_AFFILIATION column for bioscan manifest v2')\n", " df = df.drop(columns=['IDENTIFIER_AFFILIATION'])\n", " # add contributors - delimiters checked in validate_contributors\n", - " contrib_series = contrib_df['FULL_NAME'] + ';' + \\\n", - " contrib_df['PRIMARY_AFFILIATION'] + ';' + \\\n", - " contrib_df['EMAIL_ADDRESS'] + ';' + \\\n", - " contrib_df['CONTRIBUTION']\n", - " df['CONTRIBUTORS'] = '|'.join(list(contrib_series))\n", + " if bioscan:\n", + " contrib_series = contrib_df['FULL_NAME'] + ';' + \\\n", + " contrib_df['PRIMARY_AFFILIATION'] + ';' + \\\n", + " contrib_df['EMAIL_ADDRESS'] + ';' + \\\n", + " contrib_df['CONTRIBUTION']\n", + " df['CONTRIBUTORS'] = '|'.join(list(contrib_series))\n", " # add supplier sample name prefixes to control samples\n", "# df['BIOSCAN_SUPPLIER_SAMPLE_NAME'] = df['SPECIMEN_ID']\n", "# df.loc[is_blank, 'BIOSCAN_SUPPLIER_SAMPLE_NAME'] = 'CONTROL_NEG_LYSATE_' + df['SPECIMEN_ID']\n",