diff --git a/data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx b/data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx
similarity index 80%
rename from data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx
rename to data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx
index 2080089..8ad7f0d 100644
Binary files a/data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx and b/data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx differ
diff --git a/data/bioscan_partners.tsv b/data/bioscan_partners.tsv
index e05f564..542e20c 100644
--- a/data/bioscan_partners.tsv
+++ b/data/bioscan_partners.tsv
@@ -54,4 +54,29 @@ YWTP YORKSHIRE WILDLIFE TRUST (POTTERIC CARR)
CZRC CHESTER ZOO RECORD CHESTER
CEHG UK CENTRE FOR ECOLOGY AND HYDROLOGY (GAIT BARROWS)
UPBM UNIVERSITY OF PLYMOUTH
-LJMU LIVERPOOL JOHN MOORES UNIVERSITY
\ No newline at end of file
+LJMU LIVERPOOL JOHN MOORES UNIVERSITY
+LUME LOUGHBOROUGH UNIVERSITY MET STATION
+LUHW LOUGHBOROUGH UNIVERSITY HOLYWELL WOODS
+EPEY ESCRICK PARK ESTATE
+EWTA ESSEX WILDLIFE TRUST (ABBOTTS HALL)
+DCTC DURRELL CONSERVATION TRUST (CAIRNGORMS)
+CEFW CORROUR ESTATE FORT WILLIAM
+LWTC LANCASHIRE WILDLIFE TRUST CUTACRE
+SNST SANGER NON STANDARD
+JARO JARON SANGER
+NTMD NATIONAL TRUST (MORTEHOE DEVON)
+NTWE NATIONAL TRUST (WIMPOLE ESTATE)
+CUPH CAMBRIDGE UNIVERSITY (PETERHOUSE)
+NTSS NATIONAL TRUST (SHROPSHIRE & STAFFORDSHIRE)
+NTHH NATIONAL TRUST (HOLT HEATH)
+HAAU HARPER ADAMS UNIVERSITY
+QMRL QUEEN MARY UL RIVER LABORATORY
+NTDB NATIONAL TRUST (DANBURY AND BLAKES)
+NTML NATIONAL TRUST (MELFORD)
+NTHF NATIONAL TRUST (HATFIELD FOREST)
+NTBF NATIONAL TRUST (BRIDGES FARM)
+WREN WILD WRENDALE
+KNEP KNEPP ESTATE
+BCLT BARNES COMMON LIMITED
+GCEB GLENLIVET CROWN ESTATE BALLANTRUAN WOOD
+HELG HELIGAN ESTATE
\ No newline at end of file
diff --git a/work/env.yml b/work/env.yml
index ced2570..a29bc4b 100644
--- a/work/env.yml
+++ b/work/env.yml
@@ -1,8 +1,7 @@
name: bioscan_metadata_dev
channels:
- - bioconda
- conda-forge
- - defaults
+ - bioconda
dependencies:
- python =3.7.6
- geopy =2.1.0
diff --git a/work/fix_biosc.ipynb b/work/fix_biosc.ipynb
new file mode 100644
index 0000000..d8252a3
--- /dev/null
+++ b/work/fix_biosc.ipynb
@@ -0,0 +1,495 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "049f231a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%run validate_partner_manifest_dev.ipynb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "306337d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fn = '../results/20241029_bge/BGKU_2024_BIOSCAN_Manifest_V2.0_am60.xlsx'\n",
+ "df = get_data(fn, sheet='TAB 2 Metadata Entry')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "099288c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = fix_date_formats(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c2436ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "v = infer_bioscan_version(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e704b5e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = validate_series(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14390a88",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = remove_nonbreaking_spaces(df)\n",
+ "df = remove_trailing_spaces(df, title='sample')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1fb63516",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx'\n",
+ "template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "adc37c20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "check_columns(df, template_df, bioscan_version=v)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3abff9dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ff4b9f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['CATCH_LOT'] = df['CATCH_LOT'].replace('','NOT_APPLICABLE')\n",
+ "validate_regex('CATCH_LOT', df, na_values=[])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61a4aa37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contrib_sheet='TAB 1 Contributors'\n",
+ "contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15631787",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].str.replace(r'^([A-Za-z])0+(\\d)', r'\\1\\2', regex=True)  # strip zero padding (A01 -> A1); blank/malformed IDs pass through unchanged instead of raising\n",
+ "df['TUBE_OR_WELL_ID'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "610512d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df, gal, partner_code = validate_plates_wells(\n",
+ " df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b57ab15c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['ORGANISM_PART'] = df['ORGANISM_PART'].replace('', 'NOT_APPLICABLE')  # assign back: inplace on a column selection is a no-op under pandas Copy-on-Write"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c34d73d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df, is_blank = check_blanks(df, bioscan=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f82ae7c3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.loc[~is_blank, 'OTHER_INFORMATION'] = df['ORGANISM_PART']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ee8e00d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper().str.replace(',', '|', regex=False)  # literal comma -> pipe separator\n",
+ "df['ORGANISM_PART'] = df['ORGANISM_PART'].replace({  # map partner wording onto the controlled vocabulary; assigned back rather than inplace on a column selection\n",
+ "    'ENTIRE INDIVIDUAL':'WHOLE_ORGANISM',\n",
+ "    'ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'IMAGO LEG':'LEG',\n",
+ "    'MARGINAL PIECE':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'ORGANISM':'WHOLE_ORGANISM',\n",
+ "    'EXUVIUM':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'IMAGO ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'IMAGO':'WHOLE_ORGANISM',\n",
+ "    'BODY PART':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'PUPA':'WHOLE_ORGANISM',\n",
+ "    'PART PUPA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ "    'LEG| ANTENA':'LEG| **OTHER_SOMATIC_ANIMAL_TISSUE**'\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f8291a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_values('ORGANISM_PART', df, valid_dict, sep='|')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0593d6e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56f8a0d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_values('BOTTLE_DIRECTION', \n",
+ " df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples\n",
+ " valid_dict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d146be4b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ba7297f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "check_catch_lot_dates(df[~is_blank])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49ce32eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['DECIMAL_LATITUDE'] = df['DECIMAL_LATITUDE'].str.rstrip(',')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f11c2c18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4eeec734",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['DECIMAL_LONGITUDE'] = df['DECIMAL_LONGITUDE'].str.rstrip(',')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47e4396b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bac2c168",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6222f03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eba6116b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1118e76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d94b301",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7d7e222",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9210d1e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['PREDICTED_ORDER_OR_GROUP'].replace({\n",
+ " 'Neotaenioglossa':'Neotaenioglossa',\n",
+ " 'Basommatophora':'Basommatophora',\n",
+ " 'Odonáta':'Odonata',\n",
+ " 'Oligochaeta gen. sp.':'Oligochaeta'\n",
+ "}, inplace=True)\n",
+ "df['PREDICTED_FAMILY'] = df['PREDICTED_FAMILY'].replace({  # placeholders -> empty; assigned back rather than inplace on a column selection\n",
+ "    'none':'',\n",
+ "    'fam.':'',\n",
+ "    'Glossiphoniidae)':'Glossiphoniidae'\n",
+ "})\n",
+ "df['PREDICTED_GENUS'] = df['PREDICTED_GENUS'].replace({\n",
+ "    'none':'',\n",
+ "    'gen.':'',\n",
+ "    'genus':''\n",
+ "})\n",
+ "df['PREDICTED_SCIENTIFIC_NAME'] = df['PREDICTED_SCIENTIFIC_NAME'].replace({\n",
+ "    'sp':'',\n",
+ "    'sp.':'',\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d0292de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['PREDICTED_SCIENTIFIC_NAME'] = df['PREDICTED_SCIENTIFIC_NAME'].str.replace('sp ','sp_')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e3a8169",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.loc[\n",
+ " (df['PREDICTED_SCIENTIFIC_NAME'] != '') & ~df['PREDICTED_SCIENTIFIC_NAME'].str.contains(' '),\n",
+ " 'PREDICTED_SCIENTIFIC_NAME'\n",
+ "] = df['PREDICTED_GENUS'] + ' ' + df['PREDICTED_SCIENTIFIC_NAME']\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb9dab13",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6408bf17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['SEX'] = df['SEX'].str.upper()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8bb2db80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])\n",
+ "validate_specimen_id_risk(df)\n",
+ "validate_values('LIFESTAGE', df[~is_blank], valid_dict, na_values=[''])\n",
+ "validate_values('SEX', df[~is_blank], valid_dict, na_values=[''])\n",
+ "validate_values('SORTING_SOLUTION_USED', df[~is_blank], valid_dict, na_values=[''])\n",
+ "validate_values('CATCH_BOTTLE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n",
+ "validate_values('PLATE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])\n",
+ "# white cols - validated for all samples\n",
+ "validate_freetext('MORPHOSPECIES_DESCRIPTION', df)\n",
+ "validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df)\n",
+ "validate_freetext('HABITAT', df)\n",
+ "validate_freetext('PRESERVATION_APPROACH', df)\n",
+ "# TODO check if STS will need something here\n",
+ "validate_freetext('COLLECTOR_SAMPLE_ID', df)\n",
+ "validate_freetext('VOUCHER_ID', df)\n",
+ "validate_regex('ELEVATION', df, na_values=[''])\n",
+ "validate_freetext('OTHER_INFORMATION', df)\n",
+ "# validate_freetext('MISC_METADATA', df)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7fdc8a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1cec8e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_excel('../results/20241029_bge/BGKU_2024_patched.xlsx', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d823de22",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/work/sts_concat.ipynb b/work/sts_concat.ipynb
new file mode 100644
index 0000000..e68ad4a
--- /dev/null
+++ b/work/sts_concat.ipynb
@@ -0,0 +1,1145 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "437e7b79",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import glob\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "296aaf90",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.max_columns', 500)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "3581d1d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sts_dir = '/Users/am60/Downloads/STS manifests/Uploaded to STS'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "440a59d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "112"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_fns = glob.glob(sts_dir + '/*.xlsx')\n",
+ "sts_dfs = []\n",
+ "for sts_fn in sts_fns:\n",
+ "    try:\n",
+ "        sdf = pd.read_excel(sts_fn, sheet_name='Metadata Entry')\n",
+ "        sdf['FILENAME'] = os.path.basename(sts_fn)  # keep the source file name alongside each row\n",
+ "        sts_dfs.append(sdf)\n",
+ "    except Exception as err:  # not a bare except: Ctrl-C / SystemExit must still interrupt the loop\n",
+ "        print(f'{sts_fn} is not an STS manifest, skipping ({err!r})')\n",
+ "    \n",
+ "len(sts_dfs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "16919f21",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(190752, 53)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_df = pd.concat(sts_dfs)\n",
+ "sts_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "6d5470ca",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['SERIES', 'CATCH_LOT', 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID',\n",
+ " 'ORGANISM_PART', 'PRESERVATIVE_SOLUTION', 'CATCH_SOLUTION',\n",
+ " 'BOTTLE_DIRECTION', 'DATE_OF_COLLECTION', 'COUNTRY_OF_COLLECTION',\n",
+ " 'COLLECTION_LOCATION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE',\n",
+ " 'WHAT_3_WORDS', 'TIME_OF_COLLECTION', 'DURATION_OF_COLLECTION',\n",
+ " 'COLLECTION_METHOD', 'DATE_OF_PLATING', 'PREDICTED_ORDER_OR_GROUP',\n",
+ " 'PREDICTED_FAMILY', 'PREDICTED_GENUS', 'PREDICTED_SCIENTIFIC_NAME',\n",
+ " 'SPECIMEN_IDENTITY_RISK', 'LIFESTAGE', 'SEX', 'SORTING_SOLUTION_USED',\n",
+ " 'CATCH_BOTTLE_TEMPERATURE_STORAGE', 'PLATE_TEMPERATURE_STORAGE',\n",
+ " 'AMOUNT_OF_CATCH_PLATED', 'MORPHOSPECIES_DESCRIPTION',\n",
+ " 'DESCRIPTION_OF_COLLECTION_METHOD', 'HABITAT', 'PRESERVATION_APPROACH',\n",
+ " 'COLLECTOR_SAMPLE_ID', 'VOUCHER_ID', 'ELEVATION', 'OTHER_INFORMATION',\n",
+ " 'IDENTIFIED_BY', 'SPECIMEN_ID', 'SCIENTIFIC_NAME', 'TAXON_ID', 'GAL',\n",
+ " 'SYMBIONT', 'REGULATORY_COMPLIANCE', 'HAZARD_GROUP', 'CONTRIBUTORS',\n",
+ " 'FILENAME', 'WEATHER', 'IDENTIFIER_AFFILIATION', 'MISC_METADATA',\n",
+ " 'SPECIMEN_ID.1', 'PREDICTED_TAXON_ID', 'IDENTIFICATION_RISK'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "c1511c58",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_df.SPECIMEN_ID.is_unique"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "2aefdd26",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_df.SPECIMEN_ID.isna().any()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d439223a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SERIES | \n",
+ " CATCH_LOT | \n",
+ " RACK_OR_PLATE_ID | \n",
+ " TUBE_OR_WELL_ID | \n",
+ " ORGANISM_PART | \n",
+ " PRESERVATIVE_SOLUTION | \n",
+ " CATCH_SOLUTION | \n",
+ " BOTTLE_DIRECTION | \n",
+ " DATE_OF_COLLECTION | \n",
+ " COUNTRY_OF_COLLECTION | \n",
+ " COLLECTION_LOCATION | \n",
+ " DECIMAL_LATITUDE | \n",
+ " DECIMAL_LONGITUDE | \n",
+ " WHAT_3_WORDS | \n",
+ " TIME_OF_COLLECTION | \n",
+ " DURATION_OF_COLLECTION | \n",
+ " COLLECTION_METHOD | \n",
+ " DATE_OF_PLATING | \n",
+ " PREDICTED_ORDER_OR_GROUP | \n",
+ " PREDICTED_FAMILY | \n",
+ " PREDICTED_GENUS | \n",
+ " PREDICTED_SCIENTIFIC_NAME | \n",
+ " SPECIMEN_IDENTITY_RISK | \n",
+ " LIFESTAGE | \n",
+ " SEX | \n",
+ " SORTING_SOLUTION_USED | \n",
+ " CATCH_BOTTLE_TEMPERATURE_STORAGE | \n",
+ " PLATE_TEMPERATURE_STORAGE | \n",
+ " AMOUNT_OF_CATCH_PLATED | \n",
+ " MORPHOSPECIES_DESCRIPTION | \n",
+ " DESCRIPTION_OF_COLLECTION_METHOD | \n",
+ " HABITAT | \n",
+ " PRESERVATION_APPROACH | \n",
+ " COLLECTOR_SAMPLE_ID | \n",
+ " VOUCHER_ID | \n",
+ " ELEVATION | \n",
+ " OTHER_INFORMATION | \n",
+ " IDENTIFIED_BY | \n",
+ " SPECIMEN_ID | \n",
+ " SCIENTIFIC_NAME | \n",
+ " TAXON_ID | \n",
+ " GAL | \n",
+ " SYMBIONT | \n",
+ " REGULATORY_COMPLIANCE | \n",
+ " HAZARD_GROUP | \n",
+ " CONTRIBUTORS | \n",
+ " FILENAME | \n",
+ " WEATHER | \n",
+ " IDENTIFIER_AFFILIATION | \n",
+ " MISC_METADATA | \n",
+ " SPECIMEN_ID.1 | \n",
+ " PREDICTED_TAXON_ID | \n",
+ " IDENTIFICATION_RISK | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " C001F | \n",
+ " MOZZ00000609A | \n",
+ " A1 | \n",
+ " LEG | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " N | \n",
+ " 2021-05-27 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:04:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2021-06-09 00:00:00 | \n",
+ " Hymenoptera | \n",
+ " Apidae | \n",
+ " Bombus | \n",
+ " Bombus pascuorum | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " FEMALE | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " Horticultural | \n",
+ " NaN | \n",
+ " C001-DWG-N|NBGW-001-A | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_A1 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_A1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " C001F | \n",
+ " MOZZ00000609A | \n",
+ " B1 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " N | \n",
+ " 2021-05-27 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:04:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2021-06-09 00:00:00 | \n",
+ " Diptera | \n",
+ " Syrphidae | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " Horticultural | \n",
+ " NaN | \n",
+ " C001-DWG-N|NBGW-001-A | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_B1 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_B1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " C001F | \n",
+ " MOZZ00000609A | \n",
+ " C1 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " N | \n",
+ " 2021-05-27 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:04:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2021-06-09 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " Horticultural | \n",
+ " NaN | \n",
+ " C001-DWG-N|NBGW-001-A | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_C1 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_C1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " C001F | \n",
+ " MOZZ00000609A | \n",
+ " D1 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " N | \n",
+ " 2021-05-27 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:04:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2021-06-09 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " Horticultural | \n",
+ " NaN | \n",
+ " C001-DWG-N|NBGW-001-A | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_D1 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_D1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " C001F | \n",
+ " MOZZ00000609A | \n",
+ " E1 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " N | \n",
+ " 2021-05-27 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:04:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2021-06-09 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " Horticultural | \n",
+ " NaN | \n",
+ " C001-DWG-N|NBGW-001-A | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_E1 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " MOZZ00000609A_E1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2971 | \n",
+ " 2972 | \n",
+ " C052F | \n",
+ " NBGW_011 | \n",
+ " D12 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " S | \n",
+ " 2022-04-29 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:05:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2022-05-06 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " HORTICULTURAL | \n",
+ " NaN | \n",
+ " C012-DWG-S | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_D12 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_D12 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2972 | \n",
+ " 2973 | \n",
+ " C052F | \n",
+ " NBGW_011 | \n",
+ " E12 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " S | \n",
+ " 2022-04-29 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:05:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2022-05-06 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " HORTICULTURAL | \n",
+ " NaN | \n",
+ " C012-DWG-S | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_E12 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_E12 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2973 | \n",
+ " 2974 | \n",
+ " C052F | \n",
+ " NBGW_011 | \n",
+ " F12 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " S | \n",
+ " 2022-04-29 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:05:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2022-05-06 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " HORTICULTURAL | \n",
+ " NaN | \n",
+ " C012-DWG-S | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_F12 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_F12 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2974 | \n",
+ " 2975 | \n",
+ " C052F | \n",
+ " NBGW_011 | \n",
+ " G12 | \n",
+ " WHOLE_ORGANISM | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " S | \n",
+ " 2022-04-29 00:00:00 | \n",
+ " UNITED KINGDOM | \n",
+ " UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... | \n",
+ " 51.842248 | \n",
+ " -4.150253 | \n",
+ " ///inclined.lists.crossings | \n",
+ " 09:05:00 | \n",
+ " P1DT | \n",
+ " MALAISE_TRAP | \n",
+ " 2022-05-06 00:00:00 | \n",
+ " Diptera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ADULT | \n",
+ " NaN | \n",
+ " N | \n",
+ " ROOM_TEMPERATURE | \n",
+ " FRIDGE | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " directional | \n",
+ " HORTICULTURAL | \n",
+ " NaN | \n",
+ " C012-DWG-S | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_G12 | \n",
+ " unidentified | \n",
+ " 32644 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_G12 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2975 | \n",
+ " 2976 | \n",
+ " NOT_APPLICABLE | \n",
+ " NBGW_011 | \n",
+ " H12 | \n",
+ " NOT_APPLICABLE | \n",
+ " 100%_ETHANOL | \n",
+ " 100%_ETHANOL | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaT | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_H12 | \n",
+ " blank sample | \n",
+ " 2582415 | \n",
+ " National Botanic Garden of Wales | \n",
+ " TARGET | \n",
+ " Y | \n",
+ " HG1 | \n",
+ " NaN | \n",
+ " NBGW-2206-Manifest-V2.0.xlsx | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NBGW_011_H12 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2976 rows × 53 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SERIES CATCH_LOT RACK_OR_PLATE_ID TUBE_OR_WELL_ID ORGANISM_PART \\\n",
+ "0 1 C001F MOZZ00000609A A1 LEG \n",
+ "1 2 C001F MOZZ00000609A B1 WHOLE_ORGANISM \n",
+ "2 3 C001F MOZZ00000609A C1 WHOLE_ORGANISM \n",
+ "3 4 C001F MOZZ00000609A D1 WHOLE_ORGANISM \n",
+ "4 5 C001F MOZZ00000609A E1 WHOLE_ORGANISM \n",
+ "... ... ... ... ... ... \n",
+ "2971 2972 C052F NBGW_011 D12 WHOLE_ORGANISM \n",
+ "2972 2973 C052F NBGW_011 E12 WHOLE_ORGANISM \n",
+ "2973 2974 C052F NBGW_011 F12 WHOLE_ORGANISM \n",
+ "2974 2975 C052F NBGW_011 G12 WHOLE_ORGANISM \n",
+ "2975 2976 NOT_APPLICABLE NBGW_011 H12 NOT_APPLICABLE \n",
+ "\n",
+ " PRESERVATIVE_SOLUTION CATCH_SOLUTION BOTTLE_DIRECTION \\\n",
+ "0 100%_ETHANOL 100%_ETHANOL N \n",
+ "1 100%_ETHANOL 100%_ETHANOL N \n",
+ "2 100%_ETHANOL 100%_ETHANOL N \n",
+ "3 100%_ETHANOL 100%_ETHANOL N \n",
+ "4 100%_ETHANOL 100%_ETHANOL N \n",
+ "... ... ... ... \n",
+ "2971 100%_ETHANOL 100%_ETHANOL S \n",
+ "2972 100%_ETHANOL 100%_ETHANOL S \n",
+ "2973 100%_ETHANOL 100%_ETHANOL S \n",
+ "2974 100%_ETHANOL 100%_ETHANOL S \n",
+ "2975 100%_ETHANOL 100%_ETHANOL NaN \n",
+ "\n",
+ " DATE_OF_COLLECTION COUNTRY_OF_COLLECTION \\\n",
+ "0 2021-05-27 00:00:00 UNITED KINGDOM \n",
+ "1 2021-05-27 00:00:00 UNITED KINGDOM \n",
+ "2 2021-05-27 00:00:00 UNITED KINGDOM \n",
+ "3 2021-05-27 00:00:00 UNITED KINGDOM \n",
+ "4 2021-05-27 00:00:00 UNITED KINGDOM \n",
+ "... ... ... \n",
+ "2971 2022-04-29 00:00:00 UNITED KINGDOM \n",
+ "2972 2022-04-29 00:00:00 UNITED KINGDOM \n",
+ "2973 2022-04-29 00:00:00 UNITED KINGDOM \n",
+ "2974 2022-04-29 00:00:00 UNITED KINGDOM \n",
+ "2975 NaT NaN \n",
+ "\n",
+ " COLLECTION_LOCATION DECIMAL_LATITUDE \\\n",
+ "0 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "1 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "2 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "3 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "4 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "... ... ... \n",
+ "2971 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "2972 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "2973 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "2974 UNITED KINGDOM | WALES | LLANARTHNE | DOUBLE W... 51.842248 \n",
+ "2975 NaN NaN \n",
+ "\n",
+ " DECIMAL_LONGITUDE WHAT_3_WORDS TIME_OF_COLLECTION \\\n",
+ "0 -4.150253 ///inclined.lists.crossings 09:04:00 \n",
+ "1 -4.150253 ///inclined.lists.crossings 09:04:00 \n",
+ "2 -4.150253 ///inclined.lists.crossings 09:04:00 \n",
+ "3 -4.150253 ///inclined.lists.crossings 09:04:00 \n",
+ "4 -4.150253 ///inclined.lists.crossings 09:04:00 \n",
+ "... ... ... ... \n",
+ "2971 -4.150253 ///inclined.lists.crossings 09:05:00 \n",
+ "2972 -4.150253 ///inclined.lists.crossings 09:05:00 \n",
+ "2973 -4.150253 ///inclined.lists.crossings 09:05:00 \n",
+ "2974 -4.150253 ///inclined.lists.crossings 09:05:00 \n",
+ "2975 NaN NaN NaN \n",
+ "\n",
+ " DURATION_OF_COLLECTION COLLECTION_METHOD DATE_OF_PLATING \\\n",
+ "0 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n",
+ "1 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n",
+ "2 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n",
+ "3 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n",
+ "4 P1DT MALAISE_TRAP 2021-06-09 00:00:00 \n",
+ "... ... ... ... \n",
+ "2971 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n",
+ "2972 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n",
+ "2973 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n",
+ "2974 P1DT MALAISE_TRAP 2022-05-06 00:00:00 \n",
+ "2975 NaN NaN NaT \n",
+ "\n",
+ " PREDICTED_ORDER_OR_GROUP PREDICTED_FAMILY PREDICTED_GENUS \\\n",
+ "0 Hymenoptera Apidae Bombus \n",
+ "1 Diptera Syrphidae NaN \n",
+ "2 Diptera NaN NaN \n",
+ "3 Diptera NaN NaN \n",
+ "4 Diptera NaN NaN \n",
+ "... ... ... ... \n",
+ "2971 Diptera NaN NaN \n",
+ "2972 Diptera NaN NaN \n",
+ "2973 Diptera NaN NaN \n",
+ "2974 Diptera NaN NaN \n",
+ "2975 NaN NaN NaN \n",
+ "\n",
+ " PREDICTED_SCIENTIFIC_NAME SPECIMEN_IDENTITY_RISK LIFESTAGE SEX \\\n",
+ "0 Bombus pascuorum NaN ADULT FEMALE \n",
+ "1 NaN NaN ADULT NaN \n",
+ "2 NaN NaN ADULT NaN \n",
+ "3 NaN NaN ADULT NaN \n",
+ "4 NaN NaN ADULT NaN \n",
+ "... ... ... ... ... \n",
+ "2971 NaN NaN ADULT NaN \n",
+ "2972 NaN NaN ADULT NaN \n",
+ "2973 NaN NaN ADULT NaN \n",
+ "2974 NaN NaN ADULT NaN \n",
+ "2975 NaN NaN NaN NaN \n",
+ "\n",
+ " SORTING_SOLUTION_USED CATCH_BOTTLE_TEMPERATURE_STORAGE \\\n",
+ "0 N ROOM_TEMPERATURE \n",
+ "1 N ROOM_TEMPERATURE \n",
+ "2 N ROOM_TEMPERATURE \n",
+ "3 N ROOM_TEMPERATURE \n",
+ "4 N ROOM_TEMPERATURE \n",
+ "... ... ... \n",
+ "2971 N ROOM_TEMPERATURE \n",
+ "2972 N ROOM_TEMPERATURE \n",
+ "2973 N ROOM_TEMPERATURE \n",
+ "2974 N ROOM_TEMPERATURE \n",
+ "2975 NaN NaN \n",
+ "\n",
+ " PLATE_TEMPERATURE_STORAGE AMOUNT_OF_CATCH_PLATED \\\n",
+ "0 FRIDGE NaN \n",
+ "1 FRIDGE NaN \n",
+ "2 FRIDGE NaN \n",
+ "3 FRIDGE NaN \n",
+ "4 FRIDGE NaN \n",
+ "... ... ... \n",
+ "2971 FRIDGE NaN \n",
+ "2972 FRIDGE NaN \n",
+ "2973 FRIDGE NaN \n",
+ "2974 FRIDGE NaN \n",
+ "2975 NaN NaN \n",
+ "\n",
+ " MORPHOSPECIES_DESCRIPTION DESCRIPTION_OF_COLLECTION_METHOD \\\n",
+ "0 NaN directional \n",
+ "1 NaN directional \n",
+ "2 NaN directional \n",
+ "3 NaN directional \n",
+ "4 NaN directional \n",
+ "... ... ... \n",
+ "2971 NaN directional \n",
+ "2972 NaN directional \n",
+ "2973 NaN directional \n",
+ "2974 NaN directional \n",
+ "2975 NaN NaN \n",
+ "\n",
+ " HABITAT PRESERVATION_APPROACH COLLECTOR_SAMPLE_ID VOUCHER_ID \\\n",
+ "0 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n",
+ "1 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n",
+ "2 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n",
+ "3 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n",
+ "4 Horticultural NaN C001-DWG-N|NBGW-001-A NaN \n",
+ "... ... ... ... ... \n",
+ "2971 HORTICULTURAL NaN C012-DWG-S NaN \n",
+ "2972 HORTICULTURAL NaN C012-DWG-S NaN \n",
+ "2973 HORTICULTURAL NaN C012-DWG-S NaN \n",
+ "2974 HORTICULTURAL NaN C012-DWG-S NaN \n",
+ "2975 NaN NaN NaN NaN \n",
+ "\n",
+ " ELEVATION OTHER_INFORMATION IDENTIFIED_BY SPECIMEN_ID \\\n",
+ "0 NaN NaN NaN MOZZ00000609A_A1 \n",
+ "1 NaN NaN NaN MOZZ00000609A_B1 \n",
+ "2 NaN NaN NaN MOZZ00000609A_C1 \n",
+ "3 NaN NaN NaN MOZZ00000609A_D1 \n",
+ "4 NaN NaN NaN MOZZ00000609A_E1 \n",
+ "... ... ... ... ... \n",
+ "2971 NaN NaN NaN NBGW_011_D12 \n",
+ "2972 NaN NaN NaN NBGW_011_E12 \n",
+ "2973 NaN NaN NaN NBGW_011_F12 \n",
+ "2974 NaN NaN NaN NBGW_011_G12 \n",
+ "2975 NaN NaN NaN NBGW_011_H12 \n",
+ "\n",
+ " SCIENTIFIC_NAME TAXON_ID GAL SYMBIONT \\\n",
+ "0 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "1 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "2 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "3 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "4 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "... ... ... ... ... \n",
+ "2971 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "2972 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "2973 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "2974 unidentified 32644 National Botanic Garden of Wales TARGET \n",
+ "2975 blank sample 2582415 National Botanic Garden of Wales TARGET \n",
+ "\n",
+ " REGULATORY_COMPLIANCE HAZARD_GROUP CONTRIBUTORS \\\n",
+ "0 Y HG1 NaN \n",
+ "1 Y HG1 NaN \n",
+ "2 Y HG1 NaN \n",
+ "3 Y HG1 NaN \n",
+ "4 Y HG1 NaN \n",
+ "... ... ... ... \n",
+ "2971 Y HG1 NaN \n",
+ "2972 Y HG1 NaN \n",
+ "2973 Y HG1 NaN \n",
+ "2974 Y HG1 NaN \n",
+ "2975 Y HG1 NaN \n",
+ "\n",
+ " FILENAME WEATHER IDENTIFIER_AFFILIATION \\\n",
+ "0 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "1 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "2 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "3 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "4 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "... ... ... ... \n",
+ "2971 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "2972 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "2973 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "2974 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "2975 NBGW-2206-Manifest-V2.0.xlsx NaN NaN \n",
+ "\n",
+ " MISC_METADATA SPECIMEN_ID.1 PREDICTED_TAXON_ID IDENTIFICATION_RISK \n",
+ "0 NaN MOZZ00000609A_A1 NaN NaN \n",
+ "1 NaN MOZZ00000609A_B1 NaN NaN \n",
+ "2 NaN MOZZ00000609A_C1 NaN NaN \n",
+ "3 NaN MOZZ00000609A_D1 NaN NaN \n",
+ "4 NaN MOZZ00000609A_E1 NaN NaN \n",
+ "... ... ... ... ... \n",
+ "2971 NaN NBGW_011_D12 NaN NaN \n",
+ "2972 NaN NBGW_011_E12 NaN NaN \n",
+ "2973 NaN NBGW_011_F12 NaN NaN \n",
+ "2974 NaN NBGW_011_G12 NaN NaN \n",
+ "2975 NaN NBGW_011_H12 NaN NaN \n",
+ "\n",
+ "[2976 rows x 53 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sts_df[~sts_df[\"SPECIMEN_ID.1\"].isna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "c4b421d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sts_df.to_csv('sts_concat.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6fc50384",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/work/validate_anospp.ipynb b/work/validate_anospp.ipynb
index 6ee15b3..49d6c36 100644
--- a/work/validate_anospp.ipynb
+++ b/work/validate_anospp.ipynb
@@ -25,7 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx', \n",
+ "def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx', \n",
" verbose=False, samples_sheet='TAB 2 Metadata Entry',\n",
" contrib_sheet='TAB 1 Contributors', write_sts=True):\n",
" '''\n",
@@ -46,7 +46,7 @@
" df = validate_series(df)\n",
" # clean up data\n",
" df = remove_nonbreaking_spaces(df)\n",
- " df = remove_trailing_spaces(df)\n",
+ " df = remove_trailing_spaces(df, title='sample')\n",
" \n",
" # read NCBI taxonomy\n",
" ncbi = ete3.NCBITaxa()\n",
@@ -121,10 +121,7 @@
" \n",
" print('\\n'.join(df.RACK_OR_PLATE_ID.unique()))\n",
" \n",
- " return df\n",
- "\n",
- "fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n",
- "template_fn = fn"
+ " return df"
]
},
{
@@ -134,6 +131,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "fn = '../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx'\n",
+ "template_fn = fn\n",
"df = validate_anospp(fn, template_fn, verbose=True, samples_sheet='TAB 3 TEST Metadata Entry')"
]
},
@@ -162,7 +161,7 @@
"metadata": {},
"outputs": [],
"source": [
- "fn = '../results/20231019_a_poal_adad/Anopheles_Metadata_Manifest_V4.0_POAL_101623_am60.xlsx'\n",
+ "fn = '../results/20240819_mg/MANIFEST_MOSQUITOES_MADAGASCAR_am60_2.xlsx'\n",
"df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors')"
]
},
diff --git a/work/validate_bioscan.ipynb b/work/validate_bioscan.ipynb
index 77c35df..a89a133 100644
--- a/work/validate_bioscan.ipynb
+++ b/work/validate_bioscan.ipynb
@@ -44,7 +44,7 @@
" df = validate_series(df)\n",
" # clean up data\n",
" df = remove_nonbreaking_spaces(df)\n",
- " df = remove_trailing_spaces(df)\n",
+ " df = remove_trailing_spaces(df, title='sample')\n",
" \n",
" # read NCBI taxonomy\n",
" ncbi = ete3.NCBITaxa()\n",
@@ -83,10 +83,18 @@
" if v == 'v3':\n",
" validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)\n",
" df = strip_asterisks('CATCH_SOLUTION', df)\n",
- " validate_values('BOTTLE_DIRECTION', \n",
- " df[~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP')], # allow for blank in non-Malaise trap samples\n",
- " valid_dict)\n",
- " validate_regex('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED'])\n",
+ " is_malaise = (~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP'))\n",
+ " is_other = (~is_blank & (df['COLLECTION_METHOD'] != 'MALAISE_TRAP'))\n",
+ " validate_values('BOTTLE_DIRECTION', df[is_malaise], valid_dict, \n",
+ " extra_msg=' for malaise trap samples')\n",
+ " # allow only empty for non-malaise trap - weird capture of bottle direction filled and collection method empty\n",
+ " validate_values('BOTTLE_DIRECTION', df[is_other], {'BOTTLE_DIRECTION':['']}, \n",
+ " extra_msg=' for non-malaise trap samples')\n",
+ " # no missing dates expected for malaise\n",
+ " validate_regex('DATE_OF_COLLECTION', df[is_malaise], \n",
+ " extra_msg=' for malaise trap samples')\n",
+ " validate_regex('DATE_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED'],\n",
+ " extra_msg=' for non-malaise trap samples')\n",
" check_catch_lot_dates(df[~is_blank])\n",
" validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])\n",
" validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])\n",
@@ -96,11 +104,23 @@
" \n",
" # purple cols - valiated for non-blank samples\n",
" validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])\n",
- " validate_regex('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
- " validate_regex('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
+ " # no missing times expected for malaise\n",
+ " validate_regex('TIME_OF_COLLECTION', df[is_malaise],\n",
+ " extra_msg=' for malaise trap samples') \n",
+ " validate_regex('TIME_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],\n",
+ " extra_msg=' for non-malaise trap samples') \n",
+ " # no missing durations for malaise\n",
+ " validate_regex('DURATION_OF_COLLECTION', df[is_malaise],\n",
+ " extra_msg=' for malaise trap samples')\n",
+ " validate_regex('DURATION_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],\n",
+ " extra_msg=' for non-malaise trap samples')\n",
" validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])\n",
" df = strip_asterisks('COLLECTION_METHOD', df)\n",
- " validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''])\n",
+ " # no missing plating dates for malaise\n",
+ " validate_regex('DATE_OF_PLATING', df[is_malaise],\n",
+ " extra_msg=' for malaise trap samples')\n",
+ " validate_regex('DATE_OF_PLATING', df[~is_blank], na_values=['NOT_COLLECTED',''],\n",
+ " extra_msg=' for non-malaise trap samples')\n",
" compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])\n",
" # taxonomy validation adds taxid columns to original dataframe - skipping for now\n",
" df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])\n",
@@ -123,7 +143,7 @@
" validate_freetext('VOUCHER_ID', df)\n",
" validate_regex('ELEVATION', df, na_values=[''])\n",
" validate_freetext('OTHER_INFORMATION', df)\n",
- " validate_freetext('MISC_METADATA', df)\n",
+ " # validate_freetext('MISC_METADATA', df)\n",
" validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])\n",
" \n",
" df = expand_plate_only(df)\n",
@@ -186,7 +206,7 @@
},
"outputs": [],
"source": [
- "fn = '../results/20240304_shap_test/SHAP_2401_BIOSCAN_Manifest_V3.xlsx'\n",
+ "fn = '../results/20241108_dev_033/NEWI_2408_BIOSCAN_Manifest_V3.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', verbose=False)"
]
},
@@ -197,7 +217,8 @@
"metadata": {},
"outputs": [],
"source": [
- "fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Yulia_Guglia_Diptera_12_plates.xlsx'\n",
+ "%run validate_partner_manifest_dev.ipynb\n",
+ "fn = '../results/20241104_bgeg/BGE_2024_edit.xlsx'\n",
"df = validate_bioscan(fn, samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', \n",
" verbose=False, bold_input=True)"
]
@@ -205,7 +226,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "40a9e9aa",
+ "id": "d3e12c2f",
"metadata": {},
"outputs": [],
"source": []
diff --git a/work/validate_partner_manifest_dev.ipynb b/work/validate_partner_manifest_dev.ipynb
index 351f324..a63d7b5 100644
--- a/work/validate_partner_manifest_dev.ipynb
+++ b/work/validate_partner_manifest_dev.ipynb
@@ -35,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
- "VALIDATION_VERSION = '0.3.2'\n",
+ "VALIDATION_VERSION = '0.3.3'\n",
"ANOSPP_VERSION = '4.0'\n",
"# V2.0, but V3 in SOP\n",
"BIOSCAN_VERSION = '3'"
@@ -70,7 +70,7 @@
"outputs": [],
"source": [
"anospp_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'\n",
- "biosc_fn = '../data/BIOSCAN_Manifest_V3_20230818.xlsx'"
+ "biosc_fn = '../data/BIOSCAN_Manifest_V3_20240301.xlsx'"
]
},
{
@@ -113,8 +113,15 @@
" \n",
" logging.info(f'parsing bold manifest from \"{fn}\" sheet \"{sheet}\"')\n",
" \n",
+ " # second BGE manifest header\n",
" df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n",
- " sheet_name=sheet, header=3).iloc[3:]\n",
+ " sheet_name=sheet, header=1).iloc[1:]\n",
+ " if 'Plate ID' not in df.columns:\n",
+ " # first BGE manifest header\n",
+ " df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n",
+ " sheet_name=sheet, header=3).iloc[3:]\n",
+ " if ('Plate ID' not in df.columns) or (df['Well Position'].iloc[0] != 'A01'):\n",
+ " raise ValueError('could not parse bold manifest')\n",
" \n",
" df.index = (i + 1 for i in range(df.shape[0]))\n",
" df.index.name = 'SERIES'\n",
@@ -207,8 +214,6 @@
" \n",
" df.rename(columns=col_mapping, inplace=True)\n",
" \n",
- " \n",
- " \n",
" # reorder columns, add missing, remove \"LEAVE BLANK\" values\n",
" for col in template_df.columns:\n",
" if col not in df.columns:\n",
@@ -219,50 +224,115 @@
" }, inplace=True)\n",
" df = df[template_df.columns]\n",
" \n",
- " # contents adjustments\n",
+ " # strip leading zeroes from well ids\n",
" df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].str.replace(\n",
" '([A-H])0', r'\\1', regex=True\n",
" )\n",
+ " \n",
+ " # blanks handling\n",
+ " for i, r in df[df['ORGANISM_PART'] == ''].iterrows():\n",
+ " df.loc[i, 'ORGANISM_PART'] = 'NOT_APPLICABLE'\n",
+ " if df.loc[i, 'TUBE_OR_WELL_ID'] == '':\n",
+ " # originally we wanted to backfill well IDs, \n",
+ " # but BGEP has varying order of wells in the manifest\n",
+ " # thus not handling for now\n",
+ " raise ValueError(f'Cannot handle empty well ids for blanks, row {i+1} - please fill')\n",
+ " elif r['TUBE_OR_WELL_ID'] == 'A1':\n",
+ " raise ValueError(f'Cannot handle blanks at A1, row {i+1}')\n",
+ " df.loc[i, 'RACK_OR_PLATE_ID'] = df.loc[i-1, 'RACK_OR_PLATE_ID']\n",
+ " df.loc[i, 'PRESERVATIVE_SOLUTION'] = df.loc[i-1, 'PRESERVATIVE_SOLUTION']\n",
+ "\n",
+ " # sts manifest is ordered as A1,B1\n",
+ " sts_well_ids = []\n",
+ " for col in range(1,13):\n",
+ " for row in 'ABCDEFGH':\n",
+ " sts_well_ids.append(f'{row}{col}')\n",
+ " well_order_cat = pd.CategoricalDtype(sts_well_ids, ordered=True)\n",
+ " df['TUBE_OR_WELL_ID'] = df['TUBE_OR_WELL_ID'].astype(well_order_cat)\n",
+ " plate_order_cat = pd.CategoricalDtype(df.RACK_OR_PLATE_ID.drop_duplicates(), ordered=True)\n",
+ " df['RACK_OR_PLATE_ID'] = df['RACK_OR_PLATE_ID'].astype(plate_order_cat)\n",
+ " df = df.sort_values(by=['RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID'])\n",
+ " for col in ['RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID']:\n",
+ " df[col] = df[col].astype(str)\n",
+ " # reset index again\n",
+ " df.index = (i + 1 for i in range(df.shape[0]))\n",
+ " df.index.name = 'SERIES'\n",
+ " \n",
+ " # other contents adjustments\n",
" df['SEX'] = df['SEX'].str.upper()\n",
" df['LIFESTAGE'] = df['LIFESTAGE'].str.upper()\n",
- " df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper().str.replace(', ', ' | ')\n",
+ " df['ORGANISM_PART'] = df['ORGANISM_PART'].str.upper() \\\n",
+ " .str.replace(', ', ' | ').str.strip()\n",
" df['ORGANISM_PART'].replace({\n",
+ " 'LEGS':'LEG',\n",
+ " 'WINGS':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'WING':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'ABDOMEN | LEGS':'ABDOMEN | LEG',\n",
+ " 'HEAD | LEGS':'HEAD | LEG',\n",
+ " 'BODY':'WHOLE_ORGANISM',\n",
+ " 'WHOLE':'WHOLE_ORGANISM',\n",
+ " 'WHOLE BODY':'WHOLE_ORGANISM',\n",
+ " 'WHOLE ORGANISM':'WHOLE_ORGANISM',\n",
" 'ORGANISM':'WHOLE_ORGANISM',\n",
" 'WHOLE SPECIMEN':'WHOLE_ORGANISM',\n",
- " 'WING':'OTHER_SOMATIC_TISSUE'\n",
+ " 'WHOLE_SPECIMEN':'WHOLE_ORGANISM',\n",
+ " 'SOMATIC PART':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'TAIL':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'ANTENA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'ANTENNA':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'HALF_SPECIMEN':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'PIECE OF BODY':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
+ " 'MUSCLE TISSUE':'**OTHER_SOMATIC_ANIMAL_TISSUE**',\n",
" }, inplace=True)\n",
" df['PRESERVATIVE_SOLUTION'].replace({\n",
+ " 'Ethanol':'V%_ETHANOL',\n",
" 'ethanol':'V%_ETHANOL',\n",
- " 'dry':'NONE'\n",
+ " 'dry':'NONE',\n",
+ " '':'NONE' # problem with blanks?\n",
" }, inplace=True)\n",
- " df['DATE_OF_COLLECTION'] = df['DATE_OF_COLLECTION'].str.split('/').str.get(-1)\n",
+ " df['CATCH_SOLUTION'].replace({\n",
+ " 'Ethanol':'V%_ETHANOL',\n",
+ " 'ethanol':'V%_ETHANOL',\n",
+ " 'dry':'NONE',\n",
+ " '':'NONE' # problem with blanks?\n",
+ " }, inplace=True)\n",
+ " df['DECIMAL_LATITUDE'] = df['DECIMAL_LATITUDE'].replace('', np.nan).astype(float).round(6).astype(str)\n",
+ " df['DECIMAL_LONGITUDE'] = df['DECIMAL_LONGITUDE'].replace('', np.nan).astype(float).round(6).astype(str)\n",
+ " # df['DATE_OF_COLLECTION'] = df['DATE_OF_COLLECTION'].str.split('/').str.get(-1)\n",
" df['DATE_OF_COLLECTION'] = pd.to_datetime(df['DATE_OF_COLLECTION'], errors='coerce') \\\n",
" .dt.strftime('%Y-%m-%d') \\\n",
" .fillna('')\n",
" species_identified = (df['PREDICTED_SCIENTIFIC_NAME'].str.contains('[a-z]', regex=True))\n",
" df.loc[species_identified, 'PREDICTED_SCIENTIFIC_NAME'] = df.loc[species_identified, 'PREDICTED_GENUS'].str.title() + \\\n",
" ' ' + df.loc[species_identified, 'PREDICTED_SCIENTIFIC_NAME'].str.lower()\n",
- " \n",
- " # blanks handling\n",
- " for i, r in df[df['ORGANISM_PART'] == ''].iterrows():\n",
- " df.loc[i, 'ORGANISM_PART'] = 'NOT_APPLICABLE'\n",
- " if r['TUBE_OR_WELL_ID'] == 'A1':\n",
- " raise ValueError(f'Cannot handle blanks at A1, row {i+1}')\n",
- " df.loc[i, 'RACK_OR_PLATE_ID'] = df.loc[i-1, 'RACK_OR_PLATE_ID']\n",
- " df.loc[i, 'PRESERVATIVE_SOLUTION'] = df.loc[i-1, 'PRESERVATIVE_SOLUTION']\n",
" \n",
" is_blank = (df['ORGANISM_PART'] == 'NOT_APPLICABLE')\n",
" \n",
" # auto-fill\n",
" df['CATCH_LOT'] = 'NOT_APPLICABLE'\n",
" df.loc[~is_blank, 'CATCH_SOLUTION'] = df['PRESERVATIVE_SOLUTION']\n",
+ " df.loc[is_blank, 'CATCH_SOLUTION'] = ''\n",
" df.loc[~is_blank, 'BOTTLE_DIRECTION'] = 'NOT_APPLICABLE'\n",
- " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.contains('pan trap'), 'COLLECTION_METHOD'] = 'PAN_TRAP'\n",
" df.loc[~is_blank, 'AMOUNT_OF_CATCH_PLATED'] = 'NOT_APPLICABLE'\n",
" \n",
+ " # encoding\n",
+ " \n",
+ " # encoding for collection method\n",
+ " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n",
+ " .str.contains('pan trap'), 'COLLECTION_METHOD'] = 'PAN_TRAP'\n",
+ " logging.info(f'inferred {df[df.COLLECTION_METHOD == \"PAN_TRAP\"].shape[0]} samples collected with PAN_TRAP')\n",
+ " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n",
+ " .str.contains('malaise trap'), 'COLLECTION_METHOD'] = 'MALAISE_TRAP'\n",
+ " logging.info(f'inferred {df[df.COLLECTION_METHOD == \"MALAISE_TRAP\"].shape[0]} samples collected with MALAISE_TRAP')\n",
+ " df.loc[df['DESCRIPTION_OF_COLLECTION_METHOD'].str.lower() \\\n",
+ " .str.contains('sweep net'), 'COLLECTION_METHOD'] = 'AERIAL_NET'\n",
+ " df.loc[~is_blank & (df.COLLECTION_METHOD == ''), 'COLLECTION_METHOD'] = '**OTHER**'\n",
+ " \n",
+ " \n",
" return df\n",
"# template_df = get_data(biosc_fn, sheet='TAB 2 Metadata Entry')\n",
- "# bold_fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Kaliuzhna_Braconidae.xlsx'\n",
+ "# # bold_fn = '../results/20231006_bold_test/BGE_single_specimen_metadata&biobanking_sheet_v5_Kaliuzhna_Braconidae.xlsx'\n",
+ "# bold_fn = '../results/20240730_bold_bge_poland/BGEP_am60.xlsx'\n",
"# df = parse_bold(bold_fn, template_df, 'BGE entry')\n",
"# df"
]
@@ -402,7 +472,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def remove_trailing_spaces(df):\n",
+ "def remove_trailing_spaces(df, title):\n",
" \n",
" any_trailing_spaces = False\n",
" \n",
@@ -415,7 +485,7 @@
" any_trailing_spaces = True\n",
" \n",
" if any_trailing_spaces:\n",
- " logging.info('removed some trailing spaces from the manifest')\n",
+ " logging.info(f'removed some trailing spaces from the {title} manifest')\n",
" \n",
" return df\n",
"\n",
@@ -524,7 +594,7 @@
" df = pd.read_excel(fn, dtype=str, keep_default_na=False,\n",
" sheet_name=contrib_sheet)\n",
" \n",
- " df = remove_trailing_spaces(df)\n",
+ " df = remove_trailing_spaces(df, title='contributors')\n",
" \n",
" if 'EMAIL ADDRESS' in df.columns:\n",
" logging.warning('replacing \"EMAIL ADDRESS\" with \"EMAIL_ADDRESS\" contributors column name')\n",
@@ -575,7 +645,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def validate_regex(col, df, na_values=[]):\n",
+ "def validate_regex(col, df, na_values=[], extra_msg=''):\n",
" \n",
" logging.debug(f'validating data format in {col} column')\n",
" \n",
@@ -596,10 +666,10 @@
" 'CATCH_LOT': (r'^C\\d{3}[A-Z]$|^NOT_APPLICABLE$', \n",
" 'like C123A or NOT_APPLICABLE'),\n",
" 'DATE_OF_COLLECTION': (date_regex, 'in YYYY-MM-DD format'),\n",
- " 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d+)?$', \n",
- " 'between -90 and 90'),\n",
- " 'DECIMAL_LONGITUDE': (r'^[-+]?(1[0-7]\\d|\\d\\d|\\d)(\\.\\d+)?$', \n",
- " 'between -180 and 180'),\n",
+ " 'DECIMAL_LATITUDE': (r'^[-+]?([0-8]\\d|\\d)(\\.\\d{1,6})?$', \n",
+ " 'between -90 and 90 with up to 6 decimals'),\n",
+ " 'DECIMAL_LONGITUDE': (r'^[-+]?(1[0-7]\\d|\\d\\d|\\d)(\\.\\d{1,6})?$', \n",
+ " 'between -180 and 180 with up to 6 decimals'),\n",
" 'WHAT_3_WORDS': (r'^///[a-z]+\\.[a-z]+\\.[a-z]+$', \n",
" 'like ///one.two.three'),\n",
" 'TIME_OF_COLLECTION': (r'^(?:[01]\\d|2[0-3]):[0-5]\\d$|^(?:[01]\\d|2[0-3]):[0-5]\\d:[0-5]\\d$', \n",
@@ -617,7 +687,7 @@
" if not is_valid_regex.all():\n",
" offending_values = list(series[~is_valid_regex].unique())\n",
" s = index_ranges(series[~is_valid_regex])\n",
- " msg = (f'{col} format incorrect for SERIES {s}: found {offending_values} - '\n",
+ " msg = (f'{col} format{extra_msg} incorrect for SERIES {s}: found {offending_values} - '\n",
" f'expected to be {regexs[col][1]}')\n",
" if col == 'CATCH_LOT':\n",
" logging.warning(msg)\n",
@@ -629,7 +699,7 @@
" if not is_recommended_regex.all():\n",
" offending_values = list(series[~is_recommended_regex & is_valid_regex].unique())\n",
" s = index_ranges(series[~is_recommended_regex & is_valid_regex])\n",
- " msg = (f'{col} format suggestion for SERIES {s}: found {offending_values} - '\n",
+ " msg = (f'{col} format suggestion{extra_msg} for SERIES {s}: found {offending_values} - '\n",
" f'we ask to use PT[n]H format (hours only)')\n",
" logging.warning(msg)\n",
" "
@@ -830,6 +900,12 @@
" f'found {set(non_blank_penultimate_well_df.ORGANISM_PART.to_list())}, '\n",
" f'these samples will not be sequenced',\n",
" )\n",
+ " \n",
+ " blank_non_term_df = blank_df[~blank_df.TUBE_OR_WELL_ID.isin(['G12','H12'])]\n",
+ " if blank_non_term_df.shape[0] > 0:\n",
+ " logging.warning(\n",
+ " f'blanks in non-G12/H12 wells at SERIES {index_ranges(blank_non_term_df)}',\n",
+ " )\n",
"\n",
" any_excessive_blank_info = False\n",
" \n",
@@ -865,7 +941,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e'):\n",
+ "def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e', extra_msg=''):\n",
" \n",
" logging.debug(f'validating for allowed values in {col} column')\n",
" \n",
@@ -883,9 +959,10 @@
" col_values = set(series.unique())\n",
" # use separator to split values\n",
" if sep:\n",
+ " series = series.apply(lambda v: [x.strip() for x in v.split(sep)])\n",
" sep_col_values = list()\n",
- " for v in col_values:\n",
- " sep_col_values.extend([x.strip() for x in v.split(sep)])\n",
+ " for v in series:\n",
+ " sep_col_values.extend(v)\n",
" col_values = set(sep_col_values)\n",
" \n",
" valid_values = set(valid_dict[col])\n",
@@ -897,12 +974,11 @@
" if invalid_value == '':\n",
" invalid_value_series = index_ranges(series[series == ''])\n",
" elif sep:\n",
- " # match pipe-separated\n",
- " invalid_value_series = index_ranges(series[series.str.contains(\n",
- " r'(?:^|\\|)' + invalid_value + r'(?:$|\\|)', regex=True)])\n",
+ " invalid_mask = series.apply(lambda v: invalid_value in v)\n",
+ " invalid_value_series = index_ranges(series[invalid_mask])\n",
" else:\n",
" invalid_value_series = index_ranges(series[series == invalid_value])\n",
- " msg = f'invalid value in {col} column, SERIES {invalid_value_series}: \"{invalid_value}\"'\n",
+ " msg = f'invalid value{extra_msg} in {col} column, SERIES {invalid_value_series}: \"{invalid_value}\"'\n",
" if level == 'i':\n",
" logging.info(msg)\n",
" elif level == 'w':\n",
@@ -990,7 +1066,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def validate_country_and_coordinates(df, fn, na_values=[], bioscan=False):\n",
+ "def validate_country_and_coordinates(df, fn, na_values=[''], bioscan=False):\n",
" \n",
" logging.debug('validating COUNTRY_OF_COLLECTION against DECIMAL_LATITUDE and DECIMAL_LONGITUDE')\n",
" \n",
@@ -1013,7 +1089,7 @@
" # get location data for coordinates\n",
" # use local copy of web query results for re-runs\n",
" # this \n",
- " loc_fn = fn+'_loc.pkl'\n",
+ " loc_fn = fn + '_loc.pkl'\n",
" if os.path.isfile(loc_fn):\n",
" locations = pickle.load(open(loc_fn, \"rb\"))\n",
" else:\n",
@@ -1027,8 +1103,10 @@
" # pre-fill with unknown country\n",
" locations[c] = {'address':{'country':'UNKNOWN'}}\n",
" # check coordniate correctness\n",
+ " coord_list = c.split(', ')\n",
+ " lat = coord_list[0]\n",
+ " lon = coord_list[1]\n",
" try:\n",
- " lat, lon = c.split(', ')\n",
" lat, lon = float(lat), float(lon)\n",
" except:\n",
" unparsed_df = df[(df[lat_col] == str(lat)) & df[lon_col] == str(lon)]\n",
@@ -1061,8 +1139,13 @@
" # extract countries from location data\n",
" loc_countries = dict()\n",
" for coord in locations.keys():\n",
- " \n",
- " lat, lon = coord.split(', ')\n",
+ " try:\n",
+ " lat, lon = coord.split(', ')\n",
+ " except:\n",
+ " logging.error(\n",
+ " f'problem parsing coordinates {coord} in locations results'\n",
+ " )\n",
+ " continue\n",
" coord_series = index_ranges(df.query(f'({lat_col} == \"{lat}\") & ({lon_col} == \"{lon}\")'))\n",
" \n",
" coord_country = locations[coord]['address']['country'].upper()\n",
@@ -1441,11 +1524,12 @@
" logging.info('dropping IDENTIFIER_AFFILIATION column for bioscan manifest v2')\n",
" df = df.drop(columns=['IDENTIFIER_AFFILIATION'])\n",
" # add contributors - delimiters checked in validate_contributors\n",
- " contrib_series = contrib_df['FULL_NAME'] + ';' + \\\n",
- " contrib_df['PRIMARY_AFFILIATION'] + ';' + \\\n",
- " contrib_df['EMAIL_ADDRESS'] + ';' + \\\n",
- " contrib_df['CONTRIBUTION']\n",
- " df['CONTRIBUTORS'] = '|'.join(list(contrib_series))\n",
+ " if bioscan:\n",
+ " contrib_series = contrib_df['FULL_NAME'] + ';' + \\\n",
+ " contrib_df['PRIMARY_AFFILIATION'] + ';' + \\\n",
+ " contrib_df['EMAIL_ADDRESS'] + ';' + \\\n",
+ " contrib_df['CONTRIBUTION']\n",
+ " df['CONTRIBUTORS'] = '|'.join(list(contrib_series))\n",
" # add supplier sample name prefixes to control samples\n",
"# df['BIOSCAN_SUPPLIER_SAMPLE_NAME'] = df['SPECIMEN_ID']\n",
"# df.loc[is_blank, 'BIOSCAN_SUPPLIER_SAMPLE_NAME'] = 'CONTROL_NEG_LYSATE_' + df['SPECIMEN_ID']\n",