Skip to content

Commit

Permalink
Save WIP before working on FileReferences->ExperimentalImagingDataset
Browse files Browse the repository at this point in the history
  • Loading branch information
kbab committed Jul 4, 2024
1 parent b619daa commit 34d24b5
Show file tree
Hide file tree
Showing 4 changed files with 453 additions and 104 deletions.
221 changes: 204 additions & 17 deletions bia-ingest-shared-models/bia_ingest_sm/conversion.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,89 @@
import re
import hashlib
import uuid
from typing import List, Any, Dict, Optional, Tuple
from typing import List, Any, Dict, Optional, Tuple, Type
from pydantic import BaseModel
from .biostudies import Submission, attributes_to_dict, Section, Attribute
from src.bia_models import bia_data_model, semantic_models


def get_experimental_imaging_dataset(submission: Submission) -> List[bia_data_model.ExperimentalImagingDataset]:
"""Map biostudies.Submission study components to bia_data_model.ExperimentalImagingDataset
"""
study_components = find_sections_recursive(submission.section, ["Study Component",], [])
analysis_method_dict = get_image_analysis_method(submission)

# TODO: Need to persist this (API finally, but initially to disk)
biosamples_in_submission = get_biosample(submission)

# Index biosamples by title_id. Makes linking with associations more
# straight forward.
# Use for loop instead of dict comprehension to allow biosamples with
# same title to form list
biosamples_in_submission_uuid = {}
for biosample in get_biosample(submission):
if biosample.title_id in biosamples_in_submission_uuid:
biosamples_in_submission_uuid[biosample.title_id].append(biosample.uuid)
else:
biosamples_in_submission_uuid[biosample.title_id] = [biosample.uuid,]

experimental_imaging_dataset = []
for section in study_components:
attr_dict = attributes_to_dict(section.attributes)
key_mapping = [
("biosample", "Biosample", None,),
("specimen", "Specimen", None,),
("image_acquisition", "Image acquisition", None,),
("image_analysis", "Image analysis", None,),
("image_correlation", "Image correlation", None,),
]
associations = get_generic_section_as_list(section, ["Associations",], key_mapping)

analysis_method_list = []
biosample_list = []
image_acquisition_method_list = []
correlation_method_list = []
specimen_preparation_method_list = []

if len(associations) > 0:
# Image Analysis Method
analysis_methods_from_associations = [a.get("image_analysis") for a in associations]
for analysis_method in analysis_method_dict.values():
if analysis_method.method_description in analysis_methods_from_associations:
analysis_method_list.append(analysis_method)

# Biosample
biosamples_from_associations = [a.get("biosample") for a in associations]
for biosample in biosamples_from_associations:
if biosample in biosamples_in_submission_uuid:
biosample_list.extend(biosamples_in_submission_uuid[biosample])



model_dict = {
"title_id": attr_dict["Name"],
#"description": attr_dict["Description"],
"submitted_in_study": get_study_uuid(submission),
"file": [],
"image": [],
"specimen_preparation_method": specimen_preparation_method_list,
"acquisition_method": image_acquisition_method_list,
"biological_entity": biosample_list,
"analysis_method": analysis_method_list,
"correlation_method": correlation_method_list,
"file_reference_count": 0,
"image_count": 0,
"example_image_uri": [],
}
# TODO: Add 'description' to computation of uuid (Maybe accno?)
model_dict["uuid"] = dict_to_uuid(model_dict, ["title_id", "submitted_in_study", ])
experimental_imaging_dataset.append(
bia_data_model.ExperimentalImagingDataset.model_validate(model_dict)
)

return experimental_imaging_dataset

def get_study(submission: Submission) -> bia_data_model.Study:
"""Return an API study model populated from the submission
Expand All @@ -26,6 +104,7 @@ def get_study(submission: Submission) -> bia_data_model.Study:
study_attributes.pop("License")

study_dict = {
"uuid": get_study_uuid(submission),
"accession_id": submission.accno,
# TODO: Do more robust search for title - sometimes it is in
# actual submission - see old ingest code
Expand All @@ -42,12 +121,25 @@ def get_study(submission: Submission) -> bia_data_model.Study:
"experimental_imaging_component": [],
"annotation_component": [],
}
study_uuid = dict_to_uuid(study_dict, ["accession_id",])
study_dict["uuid"] = study_uuid
#study_uuid = dict_to_uuid(study_dict, ["accession_id",])
#study_dict["uuid"] = study_uuid
study = bia_data_model.Study.model_validate(study_dict)

return study

def get_image_analysis_method(submission: Submission) -> Dict[str, semantic_models.ImageAnalysisMethod]:

key_mapping = [
("method_description", "Title", None,),
("features_analysed", "Image analysis overview", None,),
]

return get_generic_section_as_dict(
submission, ["Image analysis",], key_mapping, semantic_models.ImageAnalysisMethod
)

def get_study_uuid(submission: Submission) -> str:
return dict_to_uuid({"accession_id": submission.accno}, ["accession_id",])

def study_title_from_submission(submission: Submission) -> str:

Expand Down Expand Up @@ -75,7 +167,7 @@ def get_external_reference(
"""Map biostudies.Submission.Link to semantic_models.ExternalReference
"""
sections = find_sections_recursive(submission.section, ["links",])
sections = find_sections_recursive(submission.section, ["links",], [])

key_mapping = [
("link", "url", None),
Expand All @@ -100,8 +192,7 @@ def get_grant(submission: Submission) -> List[semantic_models.Grant]:
("id", "grant_id", None),
]
grant_dict = get_generic_section_as_dict(
submission, ["Funding",], semantic_models.Grant, key_mapping
)
submission, ["Funding",], key_mapping, semantic_models.Grant )

grant_list = []
for k, v in grant_dict.items():
Expand All @@ -118,23 +209,26 @@ def get_funding_body(submission: Submission) -> semantic_models.FundingBody:
("display_name", "Agency", None,),
]
funding_body = get_generic_section_as_dict(
submission, ["Funding",], semantic_models.FundingBody, key_mapping
submission, ["Funding",], key_mapping, semantic_models.FundingBody
)
return funding_body


# TODO: Put comments and docstring
def get_generic_section_as_list(
submission: Submission,
root: [Submission|Section],
section_name: List[str],
mapped_object: [Any],
key_mapping: List[Tuple[str, str, [str | None | List]]],
mapped_object: Optional[Any] = None,
mapped_attrs_dict: Optional[Dict[str, Any]] = None,
) -> List[Any]:
) -> List[Any|Dict[str,str|List[str]]]:
"""Map biostudies.Submission objects to either semantic_models or bia_data_model equivalent
"""
sections = find_sections_recursive(submission.section, section_name)
if type(root) is Submission:
sections = find_sections_recursive(root.section, section_name, [])
else:
sections = find_sections_recursive(root, section_name, [])

return_list = []
for section in sections:
Expand All @@ -143,27 +237,36 @@ def get_generic_section_as_list(
else:
attr_dict = mattributes_to_dict(section.attributes, mapped_attrs_dict)
model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}
return_list.append(mapped_object.model_validate(model_dict))
if mapped_object is None:
return_list.append(model_dict)
else:
return_list.append(mapped_object.model_validate(model_dict))
return return_list


# TODO: Put comments and docstring
def get_generic_section_as_dict(
submission: Submission,
root: [Submission|Section],
section_name: List[str],
mapped_object: [Any],
key_mapping: List[Tuple[str, str, [str | None | List]]],
) -> Dict[str, Any]:
mapped_object: Optional[Any]=None,
) -> Dict[str, Any|Dict[str, Dict[str,str|List[str]]]]:
"""Map biostudies.Submission objects to dict containing either semantic_models or bia_data_model equivalent
"""
sections = find_sections_recursive(submission.section, section_name)
if type(root) is Submission:
sections = find_sections_recursive(root.section, section_name, [])
else:
sections = find_sections_recursive(root, section_name, [])

return_dict = {}
for section in sections:
attr_dict = attributes_to_dict(section.attributes)
model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}
return_dict[section.accno] = mapped_object.model_validate(model_dict)
if mapped_object is None:
return_dict[section.accno] = model_dict
else:
return_dict[section.accno] = mapped_object.model_validate(model_dict)
return return_dict


Expand Down Expand Up @@ -247,6 +350,90 @@ def get_contributor(submission: Submission) -> List[semantic_models.Contributor]

return contributors

# This function instantiates any API model given a dict of its attributes
# Hence the use of the pydantic BaseModel which all API models
# are derived from in the type hinting
def dicts_to_api_models(
dicts: List[Dict[str, Any]], api_model_class: Type[BaseModel]
) -> BaseModel:

api_models = []
for model_dict in dicts:
api_models.append(api_model_class.model_validate(model_dict))

return api_models

def get_biosample(submission: Submission) -> List[bia_data_model.BioSample]:

biosample_model_dicts = extract_biosample_dicts(submission)
biosamples = dicts_to_api_models(biosample_model_dicts, bia_data_model.BioSample)
return biosamples

def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:
biosample_sections = find_sections_recursive(submission.section, ["Biosample"], [])

key_mapping = [
("title_id", "Title", ""),
("description", "Description", ""),
# TODO: discuss adding this to semantic model with FS
#("biological_entity", "Biological entity", ""),
("organism", "Organism", ""),
]

model_dicts = []
for section in biosample_sections:
attr_dict = attributes_to_dict(section.attributes)

model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}

model_dict["accno"] = section.__dict__.get("accno", "")

# Obtain scientic and common names from organism
organism = model_dict.pop("organism", "")
try:
organism_scientific_name, organism_common_name = organism.split("(")
organism_common_name = organism_common_name.rstrip(")")
except ValueError:
organism_scientific_name = organism
organism_common_name = ""
taxon = semantic_models.Taxon.model_validate({
"common_name": organism_common_name.strip(),
"scientific_name": organism_scientific_name.strip(),
"ncbi_id": None,
})
model_dict["organism_classification"] = [ taxon.model_dump() ]

# Populate intrinsic and extrinsic variables
for api_key, biostudies_key in (
("intrinsic_variable_description", "Intrinsic variable"),
("extrinsic_variable_description", "Extrinsic variable",),
("experimental_variable_description", "Experimental variable",),
):
model_dict[api_key] = []
if biostudies_key in attr_dict:
model_dict[api_key].append(attr_dict[biostudies_key])

model_dict["accession_id"] = submission.accno
model_dict["uuid"] = generate_biosample_uuid(model_dict)
model_dicts.append(model_dict)

return model_dicts


def generate_biosample_uuid(biosample_dict: Dict[str, Any]) -> str:
attributes_to_consider = [
"accession_id",
"accno",
"title_id",
"organism_classification",
"description",
# TODO: Discuss including below in semantic_models.BioSample
#"biological_entity",
"intrinsic_variable_description",
"extrinsic_variable_description",
"experimental_variable_description",
]
return dict_to_uuid(biosample_dict, attributes_to_consider)

def find_sections_recursive(
section: Section, search_types: List[str], results: Optional[List[Section]] = []
Expand Down
Loading

0 comments on commit 34d24b5

Please sign in to comment.