Skip to content
This repository has been archived by the owner on Aug 1, 2024. It is now read-only.

Commit

Permalink
adding management command to extract metadata for specimen_ids
Browse files Browse the repository at this point in the history
  • Loading branch information
shaw2thefloor committed Oct 26, 2022
1 parent ca2db97 commit 00ba300
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
from bson import json_util
from dal import cursor_to_list
from dal.copo_da import Sample
from datetime import datetime, timezone
from django.conf import settings as settings
from django.core.management import BaseCommand
from xlrd import open_workbook, XLRDError
import ast
import dal.copo_da as da
import importlib
import json
import pandas as pd
import re

# Build the dotted path of the DTOL lookup module for the schema version named in
# Django settings (settings.CURRENT_SCHEMA_VERSION) and import it at module load time.
schema_version_path_dtol_lookups = f'web.apps.web_copo.schema_versions.{settings.CURRENT_SCHEMA_VERSION}.lookup.dtol_lookups'
dtol_lookups_data = importlib.import_module(schema_version_path_dtol_lookups)
# Lookup table keyed by COPO field name; the command in this file reads the 'ena'
# item of each entry to rename record fields.
DTOL_ENA_MAPPINGS = dtol_lookups_data.DTOL_ENA_MAPPINGS
# Re-exported from the lookup module; not used by the command visible in this file.
TOL_PROFILE_TYPES = dtol_lookups_data.TOL_PROFILE_TYPES


class Command(BaseCommand):
    """Export COPO metadata for the "SPECIMEN_ID" values found in a spreadsheet.

    The command reads an Excel file, extracts every ``EMu/<9 digits>`` or
    ``EMu/NHMUK<9 digits>`` token from its rows, looks each resulting specimen
    ID up among COPO samples and sources, and writes the matching records (and
    the IDs that matched nothing) to JSON files in the working directory.
    """

    # The following information is shown when a user types "help"
    help = "Extract/parse \"SPECIMEN_ID\" from an .xlsx file then, retrieve metadata from COPO database based on the " \
           "data and output the result in a .json format"

    # Record fields that are serialised as {"$date": <epoch milliseconds>}
    DATETIME_FIELDS = ("date_modified", "time_created")
    DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'

    # Spreadsheet tokens look like "EMu/123456789" or "EMu/NHMUK123456789";
    # the "SPECIMEN_ID" is the part after "EMu/", always carrying "NHMUK".
    EMU_MARKER = "EMu/"
    SPECIMEN_ID_PREFIX = "NHMUK"
    BARE_ID_PATTERN = re.compile(r"EMu/\d{9}")
    PREFIXED_ID_PATTERN = re.compile(r"EMu/NHMUK\d{9}")

    def add_arguments(self, parser):
        """Register the single positional argument: the spreadsheet path."""
        parser.add_argument("xlsx", type=str, help="Path to the Excel file containing the specimen IDs")

    @classmethod
    def _extract_specimen_ids(cls, rows):
        """Return the distinct specimen IDs found in *rows*, in first-seen order.

        Each row is searched as text; at most one bare and one prefixed token
        is taken per row (the bare one first).
        """
        marker_length = len(cls.EMU_MARKER)
        found = []
        for row in rows:
            row_text = str(row)

            bare = cls.BARE_ID_PATTERN.search(row_text)
            if bare:
                # Drop "EMu/" and add the missing "NHMUK" prefix
                found.append(cls.SPECIMEN_ID_PREFIX + bare.group(0)[marker_length:])

            prefixed = cls.PREFIXED_ID_PATTERN.search(row_text)
            if prefixed:
                # Drop "EMu/" only; the prefix is already there
                found.append(prefixed.group(0)[marker_length:])

        # The same specimen can appear in several rows; query it only once
        return list(dict.fromkeys(found))

    @classmethod
    def _normalise_record(cls, record):
        """Return a flat copy of *record* keyed by ENA field names.

        ``_id`` is reduced to its ``$oid`` string, the datetime fields to a
        formatted UTC timestamp, and every key present in
        ``DTOL_ENA_MAPPINGS`` is replaced by that entry's ``'ena'`` name.
        """
        normalised = {}
        for key, value in record.items():
            if isinstance(value, dict):
                if key == "_id":
                    value = value.get('$oid')
                elif key in cls.DATETIME_FIELDS:
                    millis = value.get('$date')
                    if isinstance(millis, (int, float)) and not isinstance(millis, bool):
                        # Convert datetime from milliseconds to timestamp
                        value = datetime.fromtimestamp(millis / 1000.0, tz=timezone.utc).strftime(
                            cls.DATETIME_FORMAT)
                    else:
                        # Not epoch milliseconds: keep whatever was stored
                        value = millis

            # Map the COPO database field name to the ENA field name, if any
            ena_key = DTOL_ENA_MAPPINGS[key]['ena'] if key in DTOL_ENA_MAPPINGS else key
            normalised[ena_key] = value
        return normalised

    def parse_db_data_to_json(self, data_in_db, json_filename):
        """Write the records in *data_in_db* to *json_filename* as JSON.

        :param data_in_db: list of database lookup results; each result is a
            list of record dictionaries (a single dictionary is also accepted)
        :param json_filename: path of the JSON file to create
        """
        # NOTE(review): the conversion in _normalise_record expects "$date" as
        # epoch milliseconds, which is the legacy extended-JSON form. Newer
        # PyMongo releases appear to default to a relaxed form with ISO
        # strings, so the mode is pinned here - confirm against the installed
        # version.
        plain_data = json.loads(json_util.dumps(data_in_db, json_options=json_util.LEGACY_JSON_OPTIONS))

        records = []
        # Every lookup result is exported, not just the first one
        for lookup_result in plain_data:
            found = lookup_result if isinstance(lookup_result, list) else [lookup_result]
            records.extend(self._normalise_record(record) for record in found)

        if not records:
            return

        # Records with different fields are aligned on the union of the columns
        pd.DataFrame(records).to_json(json_filename)

    # A command must define handle()
    def handle(self, *args, **options):
        """Read the spreadsheet, query COPO for each specimen ID and write the JSON files."""
        excel_file_path = options["xlsx"]

        try:
            # NOTE(review): kept from the original as an up-front validity
            # check. Recent xlrd releases may reject .xlsx files outright -
            # confirm against the pinned xlrd version.
            open_workbook(excel_file_path)
            df = pd.read_excel(excel_file_path)  # Convert excel file to a Python Pandas dataframe
        except XLRDError as error:
            self.stderr.write(f"Error: {error}")
            return

        # Get all rows from the excel spreadsheet and retrieve the "SPECIMEN_ID" values
        specimen_id_list = self._extract_specimen_ids(df.to_dict('records'))
        self.stdout.write(f"Found {len(specimen_id_list)} distinct SPECIMEN_ID value(s) in {excel_file_path}")

        specimen_ids_not_in_db = []
        samples_only_in_db = []
        sources_only_in_db = []
        samples_and_sources_in_db = []

        for specimen in specimen_id_list:
            sample_in_db = cursor_to_list(Sample().get_sample_by_specimen_id(specimen))
            source_in_db = da.Source().get_by_specimen(specimen)

            # The "both" case must be tested first, otherwise it can never be reached
            if sample_in_db and source_in_db:
                samples_and_sources_in_db.append(sample_in_db)
                samples_and_sources_in_db.append(source_in_db)
            elif sample_in_db:
                samples_only_in_db.append(sample_in_db)
            elif source_in_db:
                sources_only_in_db.append(source_in_db)
            else:
                specimen_ids_not_in_db.append(specimen)

        # Convert list of "SPECIMEN_ID" not present in COPO to json format (one row per ID)
        if specimen_ids_not_in_db:
            specimen_ids_df = pd.DataFrame({"SPECIMEN_ID": specimen_ids_not_in_db})
            specimen_ids_df.to_json('specimen_ids_not_present_in_copo_from_nhmdump_excel.json')

        # Parse db list of dictionary data to json
        if samples_only_in_db:
            # Samples only
            self.parse_db_data_to_json(samples_only_in_db, 'copo_biosamples_from_spreadsheet_excel.json')

        if sources_only_in_db:
            # Sources only
            self.parse_db_data_to_json(sources_only_in_db, 'copo_biospecimens_from_spreadsheet_excel.json')

        if samples_and_sources_in_db:
            # Specimens that have both samples and sources
            self.parse_db_data_to_json(samples_and_sources_in_db,
                                       'copo_biosamples_and_biosources_from_nhmdump_spreadsheet.json')

        if not (samples_only_in_db or sources_only_in_db or samples_and_sources_in_db):
            self.stderr.write("*********************************")
            self.stderr.write("Error: Data in the .xlsx file do not correspond to any data in the database!")
            self.stderr.write("*********************************")
Binary file not shown.

0 comments on commit 00ba300

Please sign in to comment.